8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,144 @@
"""Rudimentary Apache Arrow-backed ExtensionArray.
At the moment, just a boolean array / type is implemented.
Eventually, we'll want to parametrize the type and support
multiple dtypes. Not all methods are implemented yet, and the
current implementation is not efficient.
"""
import copy
import itertools
import numpy as np
import pyarrow as pa
import pandas as pd
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
register_extension_dtype,
take,
)
@register_extension_dtype
class ArrowBoolDtype(ExtensionDtype):
    """Extension dtype for boolean data held in an Arrow-backed array.

    The class is passed through ``register_extension_dtype`` and uses the
    string name ``"arrow_bool"``.
    """

    type = np.bool_
    kind = "b"
    name = "arrow_bool"
    na_value = pa.NULL

    @classmethod
    def construct_from_string(cls, string):
        """Return a new instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything other than ``cls.name``.
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string))

    @classmethod
    def construct_array_type(cls):
        """Return the array class paired with this dtype."""
        return ArrowBoolArray

    @property
    def _is_boolean(self):
        """Whether this dtype should be treated as boolean (always ``True``).

        Defined as a property so that attribute access (``dtype._is_boolean``)
        yields a real ``bool`` instead of a bound method, which would be
        truthy regardless of the value it returns.
        """
        return True
class ArrowBoolArray(ExtensionArray):
    """Boolean ExtensionArray whose values live in a ``pyarrow.ChunkedArray``.

    Most operations go through ``ChunkedArray.to_pandas()`` and rebuild a new
    array from the result, so the implementation is simple rather than fast.
    """

    def __init__(self, values):
        """Wrap an existing ``pyarrow.ChunkedArray``.

        Parameters
        ----------
        values : pyarrow.ChunkedArray
            Data to wrap. Its Arrow type is asserted to equal ``pa.bool_()``.

        Raises
        ------
        ValueError
            If ``values`` is not a ``pyarrow.ChunkedArray``.
        """
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError(
                "values must be a pyarrow.ChunkedArray, got {}".format(
                    type(values).__name__
                )
            )
        assert values.type == pa.bool_(), "expected Arrow type bool, got {}".format(
            values.type
        )
        self._data = values
        self._dtype = ArrowBoolDtype()

    def __repr__(self):
        return "ArrowBoolArray({})".format(repr(self._data))

    @classmethod
    def from_scalars(cls, values):
        """Build an array from scalars by way of ``np.asarray`` and ``pa.array``."""
        arr = pa.chunked_array([pa.array(np.asarray(values))])
        return cls(arr)

    @classmethod
    def from_array(cls, arr):
        """Build an array from a single ``pyarrow.Array`` (one chunk)."""
        assert isinstance(arr, pa.Array), "arr must be a pyarrow.Array, got {}".format(
            type(arr).__name__
        )
        return cls(pa.chunked_array([arr]))

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from a sequence of scalars.

        ``dtype`` and ``copy`` are accepted but not used; the work is
        delegated to ``from_scalars``.
        """
        return cls.from_scalars(scalars)

    def __getitem__(self, item):
        """Return a scalar for scalar ``item``; otherwise a new array."""
        converted = self._data.to_pandas()
        if pd.api.types.is_scalar(item):
            return converted[item]
        return type(self).from_scalars(converted[item])

    def __len__(self):
        return len(self._data)

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``, short-circuiting when it is this array's own dtype."""
        # needed to fix this astype for the Series constructor.
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return super().astype(dtype, copy)

    @property
    def dtype(self):
        """The ``ArrowBoolDtype`` instance created in ``__init__``."""
        return self._dtype

    @property
    def nbytes(self):
        """Sum of ``size`` over every non-``None`` buffer of every chunk."""
        return sum(
            x.size
            for chunk in self._data.chunks
            for x in chunk.buffers()
            if x is not None
        )

    def isna(self):
        """Return an ``ArrowBoolArray`` built from ``pd.isna`` of the data."""
        nas = pd.isna(self._data.to_pandas())
        return type(self).from_scalars(nas)

    def take(self, indices, allow_fill=False, fill_value=None):
        """Take elements by position using ``pandas.api.extensions.take``.

        When ``allow_fill`` is true and no ``fill_value`` is given, the
        dtype's ``na_value`` is used as the fill value.
        """
        data = self._data.to_pandas()
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value
        result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
        return self._from_sequence(result, dtype=self.dtype)

    def copy(self):
        """Return a new array wrapping ``copy.copy`` of the underlying data."""
        # ``copy`` here is the module-level ``copy`` module: method names are
        # class attributes and are not in scope inside the method body.
        return type(self)(copy.copy(self._data))

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate arrays by chaining their chunks into one ChunkedArray."""
        chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
        arr = pa.chunked_array(chunks)
        return cls(arr)

    def __invert__(self):
        return type(self).from_scalars(~self._data.to_pandas())

    def _reduce(self, method, skipna=True, **kwargs):
        """Dispatch a reduction to the method of the same name on this array.

        Parameters
        ----------
        method : str
            Name of the reduction; looked up with ``getattr``.
        skipna : bool, default True
            If true, missing values are filtered out before reducing.
        **kwargs
            Forwarded to the reduction method.

        Raises
        ------
        TypeError
            If the array has no attribute named ``method``.
        """
        arr = self[~self.isna()] if skipna else self

        try:
            op = getattr(arr, method)
        except AttributeError as err:
            raise TypeError(
                "cannot perform '{}' with type '{}'".format(method, self.dtype.name)
            ) from err
        return op(**kwargs)

    def any(self, axis=0, out=None):
        """Return ``any()`` of the data; ``axis`` and ``out`` are not used."""
        return self._data.to_pandas().any()

    def all(self, axis=0, out=None):
        """Return ``all()`` of the data; ``axis`` and ``out`` are not used."""
        return self._data.to_pandas().all()

View File

@@ -0,0 +1,70 @@
import numpy as np
import pytest
import pandas as pd
from pandas.tests.extension import base
import pandas.util.testing as tm
pytest.importorskip("pyarrow", minversion="0.10.0")
from .bool import ArrowBoolArray, ArrowBoolDtype # isort:skip
@pytest.fixture
def dtype():
    """Fixture returning a new ``ArrowBoolDtype`` instance."""
    instance = ArrowBoolDtype()
    return instance
@pytest.fixture
def data():
    """Fixture returning 100 random booleans as an ``ArrowBoolArray``."""
    bools = np.random.randint(0, 2, size=100, dtype=bool)
    # Make the second element the negation of the first so the two differ.
    bools[1] = ~bools[0]
    return ArrowBoolArray.from_scalars(bools)
@pytest.fixture
def data_missing():
    """Fixture returning a two-element array built from ``[None, True]``."""
    scalars = [None, True]
    return ArrowBoolArray.from_scalars(scalars)
class BaseArrowTests:
    """Empty mixin shared by the Arrow test classes in this module."""
class TestDtype(BaseArrowTests, base.BaseDtypeTests):
    """Dtype tests inherited from ``base.BaseDtypeTests``."""

    def test_array_type_with_arg(self, data, dtype):
        """Always skipped; the skip reason references GH-22666."""
        pytest.skip("GH-22666")
class TestInterface(BaseArrowTests, base.BaseInterfaceTests):
    """Interface tests inherited from ``base.BaseInterfaceTests``."""

    def test_copy(self, data):
        """Smoke-test ``copy``: only check that the call does not raise."""
        # __setitem__ is not usable here, so the result is not inspected.
        data.copy()
class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
    """Constructor tests inherited from ``base.BaseConstructorsTests``."""

    def test_from_dtype(self, data):
        """Always skipped; the skip reason references GH-22666."""
        pytest.skip("GH-22666")

    # Marked xfail: isna on an empty BoolArray appears to return floats.
    @pytest.mark.xfail(reason="bad is-na for empty data")
    def test_from_sequence_from_cls(self, data):
        """Run the inherited test, which is expected to fail."""
        super().test_from_sequence_from_cls(data)
class TestReduce(base.BaseNoReduceTests):
    """Reduction tests inherited from ``base.BaseNoReduceTests``."""

    def test_reduce_series_boolean(self):
        """Do nothing.

        NOTE(review): presumably replaces an inherited test of the same name
        so that it does not run here -- confirm against the base class.
        """
class TestReduceBoolean(base.BaseBooleanReduceTests):
    """Boolean reduction tests inherited unchanged from the base suite."""
def test_is_bool_dtype(data):
    """The array is recognised as boolean and can be used as a mask.

    Indexing a Series with the array must match indexing with its NumPy
    conversion.
    """
    assert pd.api.types.is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)

    ser = pd.Series(range(len(data)))
    via_extension = ser[data]
    via_numpy = ser[np.asarray(data)]
    tm.assert_series_equal(via_extension, via_numpy)

View File

@@ -0,0 +1,60 @@
"""Base test suite for extension arrays.
These tests are intended for third-party libraries to subclass to validate
that their extension arrays and dtypes satisfy the interface. Moving or
renaming the tests should not be done lightly.
Libraries are expected to implement a few pytest fixtures to provide data
for the tests. The fixtures may be located in either
* The same module as your test class.
* A ``conftest.py`` in the same directory as your test class.
The full list of fixtures may be found in the ``conftest.py`` next to this
file.
.. code-block:: python

   import pytest
   from pandas.tests.extension.base import BaseDtypeTests


   @pytest.fixture
   def dtype():
       return MyDtype()


   class TestMyDtype(BaseDtypeTests):
       pass

Your class ``TestMyDtype`` will inherit all the tests defined on
``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
wherever the test requires it. You're free to implement additional tests.
All the tests in these modules use ``self.assert_frame_equal`` or
``self.assert_series_equal`` for dataframe or series comparisons. By default,
they use the usual ``pandas.testing.assert_frame_equal`` and
``pandas.testing.assert_series_equal``. You can override the checks used
by defining the staticmethods ``assert_frame_equal`` and
``assert_series_equal`` on your base test class.
"""
from .casting import BaseCastingTests # noqa
from .constructors import BaseConstructorsTests # noqa
from .dtype import BaseDtypeTests # noqa
from .getitem import BaseGetitemTests # noqa
from .groupby import BaseGroupbyTests # noqa
from .interface import BaseInterfaceTests # noqa
from .io import BaseParsingTests # noqa
from .methods import BaseMethodsTests # noqa
from .missing import BaseMissingTests # noqa
from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa
from .printing import BasePrintingTests # noqa
from .reduce import ( # noqa
BaseBooleanReduceTests,
BaseNoReduceTests,
BaseNumericReduceTests,
)
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa

View File

@@ -0,0 +1,9 @@
import pandas.util.testing as tm
class BaseExtensionTests:
    """Common base class carrying the comparison helpers used by the tests.

    Each helper is wrapped in ``staticmethod`` so it can be called through
    ``self`` without the instance being passed as the first argument.
    """

    # Generic comparison helper.
    assert_equal = staticmethod(tm.assert_equal)

    # Container-specific comparison helpers.
    assert_series_equal = staticmethod(tm.assert_series_equal)
    assert_frame_equal = staticmethod(tm.assert_frame_equal)
    assert_extension_array_equal = staticmethod(tm.assert_extension_array_equal)

View File

@@ -0,0 +1,23 @@
import pandas as pd
from pandas.core.internals import ObjectBlock
from .base import BaseExtensionTests
class BaseCastingTests(BaseExtensionTests):
    """Tests for casting to and from ExtensionDtypes."""

    def test_astype_object_series(self, all_data):
        """Casting a Series to ``object`` stores it in an ``ObjectBlock``."""
        as_object = pd.Series({"A": all_data}).astype(object)
        first_block = as_object._data.blocks[0]
        assert isinstance(first_block, ObjectBlock)

    def test_tolist(self, data):
        """``Series.tolist`` matches ``list`` applied to the raw array."""
        from_series = pd.Series(data).tolist()
        from_array = list(data)
        assert from_series == from_array

    def test_astype_str(self, data):
        """Casting to ``str`` agrees between the Series and the array."""
        via_series = pd.Series(data[:5]).astype(str)
        via_array = pd.Series(data[:5].astype(str))
        self.assert_series_equal(via_series, via_array)

View File

@@ -0,0 +1,76 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.internals import ExtensionBlock
from .base import BaseExtensionTests
class BaseConstructorsTests(BaseExtensionTests):
    """Tests for building arrays, Series and DataFrames from extension data."""

    def test_from_sequence_from_cls(self, data):
        """``_from_sequence`` round-trips both the full and an empty array."""
        rebuilt = type(data)._from_sequence(data, dtype=data.dtype)
        self.assert_extension_array_equal(rebuilt, data)

        empty = data[:0]
        rebuilt = type(empty)._from_sequence(empty, dtype=empty.dtype)
        self.assert_extension_array_equal(rebuilt, empty)

    def test_array_from_scalars(self, data):
        """``_from_sequence`` on three scalars returns the same array type."""
        first_three = [data[0], data[1], data[2]]
        built = data._from_sequence(first_three)
        assert isinstance(built, type(data))

    def test_series_constructor(self, data):
        """A Series wraps the array itself inside an ``ExtensionBlock``."""
        ser = pd.Series(data)
        assert ser.dtype == data.dtype
        assert len(ser) == len(data)
        assert isinstance(ser._data.blocks[0], ExtensionBlock)
        assert ser._data.blocks[0].values is data

        # Wrapping an existing Series keeps the dtype and block type.
        rewrapped = pd.Series(ser)
        assert rewrapped.dtype == data.dtype
        assert isinstance(rewrapped._data.blocks[0], ExtensionBlock)

    @pytest.mark.parametrize("from_series", [True, False])
    def test_dataframe_constructor_from_dict(self, data, from_series):
        """A dict-built DataFrame keeps the dtype, for arrays and Series."""
        if from_series:
            data = pd.Series(data)
        frame = pd.DataFrame({"A": data})
        assert frame.dtypes["A"] == data.dtype
        assert frame.shape == (len(data), 1)
        assert isinstance(frame._data.blocks[0], ExtensionBlock)

    def test_dataframe_from_series(self, data):
        """A DataFrame built from a Series keeps the extension dtype."""
        frame = pd.DataFrame(pd.Series(data))
        assert frame.dtypes[0] == data.dtype
        assert frame.shape == (len(data), 1)
        assert isinstance(frame._data.blocks[0], ExtensionBlock)

    def test_series_given_mismatched_index_raises(self, data):
        """Three values with a five-label index raise ``ValueError``."""
        msg = "Length of passed values is 3, index implies 5"
        with pytest.raises(ValueError, match=msg):
            pd.Series(data[:3], index=[0, 1, 2, 3, 4])

    def test_from_dtype(self, data):
        """Series construction accepts the dtype object and its string form."""
        ea_dtype = data.dtype
        expected = pd.Series(data)

        from_object = pd.Series(list(data), dtype=ea_dtype)
        self.assert_series_equal(from_object, expected)

        from_string = pd.Series(list(data), dtype=str(ea_dtype))
        self.assert_series_equal(from_string, expected)

    def test_pandas_array(self, data):
        """``pd.array`` applied to an extension array is idempotent..."""
        converted = pd.array(data)
        self.assert_extension_array_equal(converted, data)

    def test_pandas_array_dtype(self, data):
        """... unless a dtype is given, which overrides the idempotency."""
        converted = pd.array(data, dtype=np.dtype(object))
        expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
        self.assert_equal(converted, expected)

View File

@@ -0,0 +1,102 @@
import warnings
import numpy as np
import pytest
import pandas as pd
from .base import BaseExtensionTests
class BaseDtypeTests(BaseExtensionTests):
    """Base class for ExtensionDtype classes"""

    def test_name(self, dtype):
        """``dtype.name`` must be a ``str``."""
        assert isinstance(dtype.name, str)

    def test_kind(self, dtype):
        """``dtype.kind``, when not ``None``, must be a character of ``biufcmMOSUV``."""
        valid = set("biufcmMOSUV")
        if dtype.kind is not None:
            assert dtype.kind in valid

    def test_construct_from_string_own_name(self, dtype):
        """``construct_from_string`` accepts the dtype's own name."""
        result = dtype.construct_from_string(dtype.name)
        assert type(result) is type(dtype)

        # check OK as classmethod
        result = type(dtype).construct_from_string(dtype.name)
        assert type(result) is type(dtype)

    def test_is_dtype_from_name(self, dtype):
        result = type(dtype).is_dtype(dtype.name)
        assert result is True

    def test_is_dtype_unboxes_dtype(self, data, dtype):
        assert dtype.is_dtype(data) is True

    def test_is_dtype_from_self(self, dtype):
        result = type(dtype).is_dtype(dtype)
        assert result is True

    def test_is_not_string_type(self, dtype):
        """The dtype must not be reported as a string dtype."""
        # Asserted rather than returned: pytest ignores a test's return
        # value, so returning the condition could never fail the test.
        assert not pd.api.types.is_string_dtype(dtype)

    def test_is_not_object_type(self, dtype):
        """The dtype must not be reported as an object dtype."""
        # Asserted rather than returned, for the same reason as above.
        assert not pd.api.types.is_object_dtype(dtype)

    def test_eq_with_str(self, dtype):
        """The dtype equals its own name and differs from a suffixed name."""
        assert dtype == dtype.name
        assert dtype != dtype.name + "-suffix"

    def test_eq_with_numpy_object(self, dtype):
        assert dtype != np.dtype("object")

    def test_eq_with_self(self, dtype):
        assert dtype == dtype
        assert dtype != object()

    def test_array_type(self, data, dtype):
        """``construct_array_type`` returns the class of the ``data`` fixture."""
        assert dtype.construct_array_type() is type(data)

    def test_check_dtype(self, data):
        """Comparing ``DataFrame.dtypes`` with the dtype string flags the EA columns."""
        dtype = data.dtype

        # check equivalency for using .dtypes
        df = pd.DataFrame(
            {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1}
        )

        # np.dtype('int64') == 'Int64' == 'int64'
        # so can't distinguish
        if dtype.name == "Int64":
            expected = pd.Series([True, True, False, True], index=list("ABCD"))
        else:
            expected = pd.Series([True, True, False, False], index=list("ABCD"))

        # XXX: This should probably be *fixed* not ignored.
        # See libops.scalar_compare
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            result = df.dtypes == str(dtype)

        self.assert_series_equal(result, expected)

        expected = pd.Series([True, True, False, False], index=list("ABCD"))
        result = df.dtypes.apply(str) == str(dtype)
        self.assert_series_equal(result, expected)

    def test_hashable(self, dtype):
        hash(dtype)  # no error

    def test_str(self, dtype):
        assert str(dtype) == dtype.name

    def test_eq(self, dtype):
        assert dtype == dtype.name
        assert dtype != "anonther_type"

    def test_construct_from_string(self, dtype):
        """The own name constructs an instance; an unknown name raises ``TypeError``."""
        dtype_instance = dtype.__class__.construct_from_string(dtype.name)
        assert isinstance(dtype_instance, dtype.__class__)
        with pytest.raises(TypeError):
            dtype.__class__.construct_from_string("another_type")

View File

@@ -0,0 +1,262 @@
import numpy as np
import pytest
import pandas as pd
from .base import BaseExtensionTests
class BaseGetitemTests(BaseExtensionTests):
    """Tests for ExtensionArray.__getitem__."""

    def test_iloc_series(self, data):
        """``Series.iloc`` with a slice and with a list give the first four rows."""
        ser = pd.Series(data)
        result = ser.iloc[:4]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.iloc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_iloc_frame(self, data):
        """``DataFrame.iloc`` with slice and list row indexers, frame and series results."""
        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.iloc[:4, [0]]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.iloc[[0, 1, 2, 3], [0]]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.iloc[:4, 0]
        self.assert_series_equal(result, expected)

        # sequence -> series
        # Uses a list indexer; a slice here would only repeat the case above.
        result = df.iloc[[0, 1, 2, 3], 0]
        self.assert_series_equal(result, expected)

    def test_loc_series(self, data):
        """``Series.loc`` with a label slice and with a list give the first four rows."""
        ser = pd.Series(data)
        result = ser.loc[:3]
        expected = pd.Series(data[:4])
        self.assert_series_equal(result, expected)

        result = ser.loc[[0, 1, 2, 3]]
        self.assert_series_equal(result, expected)

    def test_loc_frame(self, data):
        """``DataFrame.loc`` with slice and list row indexers, frame and series results."""
        df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
        expected = pd.DataFrame({"A": data[:4]})

        # slice -> frame
        result = df.loc[:3, ["A"]]
        self.assert_frame_equal(result, expected)

        # sequence -> frame
        result = df.loc[[0, 1, 2, 3], ["A"]]
        self.assert_frame_equal(result, expected)

        expected = pd.Series(data[:4], name="A")

        # slice -> series
        result = df.loc[:3, "A"]
        self.assert_series_equal(result, expected)

        # sequence -> series
        # Uses a list indexer; a slice here would only repeat the case above.
        result = df.loc[[0, 1, 2, 3], "A"]
        self.assert_series_equal(result, expected)

    def test_loc_iloc_frame_single_dtype(self, data):
        # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
        #  return a scalar
        df = pd.DataFrame({"A": data})
        expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)

        result = df.loc[2]
        self.assert_series_equal(result, expected)

        expected = pd.Series(
            [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
        )
        result = df.iloc[-1]
        self.assert_series_equal(result, expected)

    def test_getitem_scalar(self, data):
        """Scalar access returns an instance of ``data.dtype.type``."""
        result = data[0]
        assert isinstance(result, data.dtype.type)

        result = pd.Series(data)[0]
        assert isinstance(result, data.dtype.type)

    def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
        result = data_missing[0]
        assert na_cmp(result, na_value)

    def test_getitem_mask(self, data):
        """Boolean-mask indexing works on the raw array and inside a Series."""
        # Empty mask, raw array
        mask = np.zeros(len(data), dtype=bool)
        result = data[mask]
        assert len(result) == 0
        assert isinstance(result, type(data))

        # Empty mask, in series
        mask = np.zeros(len(data), dtype=bool)
        result = pd.Series(data)[mask]
        assert len(result) == 0
        assert result.dtype == data.dtype

        # non-empty mask, raw array
        mask[0] = True
        result = data[mask]
        assert len(result) == 1
        assert isinstance(result, type(data))

        # non-empty mask, in series
        result = pd.Series(data)[mask]
        assert len(result) == 1
        assert result.dtype == data.dtype

    def test_getitem_slice(self, data):
        # getitem[slice] should return an array
        result = data[slice(0)]  # empty
        assert isinstance(result, type(data))

        result = data[slice(1)]  # scalar
        assert isinstance(result, type(data))

    def test_get(self, data):
        """``Series.get`` with integer, list, slice and string keys."""
        # GH 20882
        s = pd.Series(data, index=[2 * i for i in range(len(data))])
        assert s.get(4) == s.iloc[2]

        result = s.get([4, 6])
        expected = s.iloc[[2, 3]]
        self.assert_series_equal(result, expected)

        result = s.get(slice(2))
        expected = s.iloc[[0, 1]]
        self.assert_series_equal(result, expected)

        assert s.get(-1) is None
        assert s.get(s.index.max() + 1) is None

        s = pd.Series(data[:6], index=list("abcdef"))
        assert s.get("c") == s.iloc[2]

        result = s.get(slice("b", "d"))
        expected = s.iloc[[1, 2, 3]]
        self.assert_series_equal(result, expected)

        result = s.get("Z")
        assert result is None

        assert s.get(4) == s.iloc[4]
        assert s.get(-1) == s.iloc[-1]
        assert s.get(len(s)) is None

        # GH 21257
        s = pd.Series(data)
        s2 = s[::2]
        assert s2.get(1) is None

    def test_take_sequence(self, data):
        result = pd.Series(data)[[0, 1, 3]]
        assert result.iloc[0] == data[0]
        assert result.iloc[1] == data[1]
        assert result.iloc[2] == data[3]

    def test_take(self, data, na_value, na_cmp):
        """``take`` with and without ``allow_fill``, plus an out-of-bounds index."""
        result = data.take([0, -1])
        assert result.dtype == data.dtype
        assert result[0] == data[0]
        assert result[1] == data[-1]

        result = data.take([0, -1], allow_fill=True, fill_value=na_value)
        assert result[0] == data[0]
        assert na_cmp(result[1], na_value)

        with pytest.raises(IndexError, match="out of bounds"):
            data.take([len(data) + 1])

    def test_take_empty(self, data, na_value, na_cmp):
        """``take`` on an empty array: fill works, real positions raise."""
        empty = data[:0]

        result = empty.take([-1], allow_fill=True)
        assert na_cmp(result[0], na_value)

        with pytest.raises(IndexError):
            empty.take([-1])

        with pytest.raises(IndexError, match="cannot do a non-empty take"):
            empty.take([0, 1])

    def test_take_negative(self, data):
        # https://github.com/pandas-dev/pandas/issues/20640
        n = len(data)
        result = data.take([0, -n, n - 1, -1])
        expected = data.take([0, 0, n - 1, n - 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_non_na_fill_value(self, data_missing):
        fill_value = data_missing[1]  # valid
        na = data_missing[0]

        array = data_missing._from_sequence([na, fill_value, na])
        result = array.take([-1, 1], fill_value=fill_value, allow_fill=True)
        expected = array.take([1, 1])
        self.assert_extension_array_equal(result, expected)

    def test_take_pandas_style_negative_raises(self, data, na_value):
        with pytest.raises(ValueError):
            data.take([0, -2], fill_value=na_value, allow_fill=True)

    @pytest.mark.parametrize("allow_fill", [True, False])
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        arr = data[:3]
        with pytest.raises(IndexError):
            arr.take(np.asarray([0, 3]), allow_fill=allow_fill)

    def test_take_series(self, data):
        s = pd.Series(data)
        result = s.take([0, -1])
        expected = pd.Series(
            data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
            index=[0, len(data) - 1],
        )
        self.assert_series_equal(result, expected)

    def test_reindex(self, data, na_value):
        """``Series.reindex`` with present labels, mixed labels and only-missing labels."""
        s = pd.Series(data)
        result = s.reindex([0, 1, 3])
        expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
        self.assert_series_equal(result, expected)

        n = len(data)
        result = s.reindex([-1, 0, n])
        expected = pd.Series(
            data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
            index=[-1, 0, n],
        )
        self.assert_series_equal(result, expected)

        result = s.reindex([n, n + 1])
        expected = pd.Series(
            data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
        )
        self.assert_series_equal(result, expected)

    def test_reindex_non_na_fill_value(self, data_missing):
        valid = data_missing[1]
        na = data_missing[0]

        array = data_missing._from_sequence([na, valid])
        ser = pd.Series(array)
        result = ser.reindex([0, 1, 2], fill_value=valid)
        expected = pd.Series(data_missing._from_sequence([na, valid, valid]))

        self.assert_series_equal(result, expected)

View File

@@ -0,0 +1,91 @@
import pytest
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseGroupbyTests(BaseExtensionTests):
    """Tests covering groupby on extension-array columns."""

    def test_grouping_grouper(self, data_for_grouping):
        """The grouping objects expose the original column values."""
        frame = pd.DataFrame(
            {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping}
        )
        grouping_a = frame.groupby("A").grouper.groupings[0]
        grouping_b = frame.groupby("B").grouper.groupings[0]

        tm.assert_numpy_array_equal(grouping_a.grouper, frame.A.values)
        tm.assert_extension_array_equal(grouping_b.grouper, data_for_grouping)

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Mean of ``A`` grouped by the extension column, with sorted groups."""
        frame = pd.DataFrame(
            {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}
        )
        result = frame.groupby("B", as_index=as_index).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=True)

        group_index = pd.Index(uniques, name="B")
        expected = pd.Series([3, 1, 4], index=group_index, name="A")
        if not as_index:
            # Without as_index the result is a frame with "B" as a column.
            expected = expected.reset_index()
            self.assert_frame_equal(result, expected)
            return
        self.assert_series_equal(result, expected)

    def test_groupby_extension_no_sort(self, data_for_grouping):
        """Mean of ``A`` grouped by the extension column with ``sort=False``."""
        frame = pd.DataFrame(
            {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}
        )
        result = frame.groupby("B", sort=False).A.mean()
        _, uniques = pd.factorize(data_for_grouping, sort=False)

        group_index = pd.Index(uniques, name="B")
        expected = pd.Series([1, 3, 4], index=group_index, name="A")
        self.assert_series_equal(result, expected)

    def test_groupby_extension_transform(self, data_for_grouping):
        """``transform(len)`` over the non-missing grouping values."""
        not_missing = data_for_grouping[~data_for_grouping.isna()]
        frame = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": not_missing})

        group_sizes = frame.groupby("B").A.transform(len)
        expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")

        self.assert_series_equal(group_sizes, expected)

    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        """Smoke-test ``apply`` on frame and column groupbys; results are discarded."""
        frame = pd.DataFrame(
            {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}
        )
        frame.groupby("B").apply(groupby_apply_op)
        frame.groupby("B").A.apply(groupby_apply_op)
        frame.groupby("A").apply(groupby_apply_op)
        frame.groupby("A").B.apply(groupby_apply_op)

    def test_groupby_apply_identity(self, data_for_grouping):
        """Applying ``lambda x: x.array`` yields one array per group."""
        frame = pd.DataFrame(
            {"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}
        )
        result = frame.groupby("A").B.apply(lambda x: x.array)
        per_group = [
            frame.B.iloc[[0, 1, 6]].array,
            frame.B.iloc[[2, 3]].array,
            frame.B.iloc[[4, 5]].array,
            frame.B.iloc[[7]].array,
        ]
        expected = pd.Series(
            per_group, index=pd.Index([1, 2, 3, 4], name="A"), name="B"
        )
        self.assert_series_equal(result, expected)

    def test_in_numeric_groupby(self, data_for_grouping):
        """Column ``B`` survives ``groupby.sum`` only when the dtype is numeric."""
        frame = pd.DataFrame(
            {
                "A": [1, 1, 2, 2, 3, 3, 1, 4],
                "B": data_for_grouping,
                "C": [1, 1, 1, 1, 1, 1, 1, 1],
            }
        )
        summed_columns = frame.groupby("A").sum().columns

        kept = ["B", "C"] if data_for_grouping.dtype._is_numeric else ["C"]
        expected = pd.Index(kept)

        tm.assert_index_equal(summed_columns, expected)

View File

@@ -0,0 +1,77 @@
import numpy as np
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseInterfaceTests(BaseExtensionTests):
    """Checks that an array satisfies the basic ExtensionArray interface."""

    # ------------------------------------------------------------------------
    # Interface
    # ------------------------------------------------------------------------

    def test_len(self, data):
        """The ``data`` fixture must contain exactly 100 elements."""
        assert len(data) == 100

    def test_ndim(self, data):
        """Arrays are one-dimensional."""
        assert data.ndim == 1

    def test_can_hold_na_valid(self, data):
        """``_can_hold_na`` must be exactly ``True`` (GH-20761)."""
        assert data._can_hold_na is True

    def test_memory_usage(self, data):
        """``Series.memory_usage`` without the index equals ``Series.nbytes``."""
        ser = pd.Series(data)
        usage = ser.memory_usage(index=False)
        assert usage == ser.nbytes

    def test_array_interface(self, data):
        """``np.array`` conversion, with and without an object dtype."""
        as_numpy = np.array(data)
        assert as_numpy[0] == data[0]

        as_object = np.array(data, dtype=object)
        from_list = np.array(list(data), dtype=object)
        tm.assert_numpy_array_equal(as_object, from_list)

    def test_is_extension_array_dtype(self, data):
        """The array, its dtype and a wrapping Series are all recognised."""
        assert is_extension_array_dtype(data)
        assert is_extension_array_dtype(data.dtype)
        assert is_extension_array_dtype(pd.Series(data))
        assert isinstance(data.dtype, ExtensionDtype)

    def test_no_values_attribute(self, data):
        """Arrays must expose neither ``values`` nor ``_values`` (GH-20735)."""
        # Arrays with a .values attribute cause problems for internal code,
        # so this is disallowed until that is solved.
        for forbidden in ("values", "_values"):
            assert not hasattr(data, forbidden)

    def test_is_numeric_honored(self, data):
        """The block's ``is_numeric`` flag mirrors ``dtype._is_numeric``."""
        ser = pd.Series(data)
        assert ser._data.blocks[0].is_numeric is data.dtype._is_numeric

    def test_isna_extension_array(self, data_missing):
        """An ``isna`` result that is an ExtensionArray must support reductions."""
        # If your `isna` returns an ExtensionArray, you must also implement
        # _reduce. At the *very* least, you must implement any and all
        mask = data_missing.isna()
        if not is_extension_array_dtype(mask):
            return

        assert mask._reduce("any")
        assert mask.any()

        assert not mask._reduce("all")
        assert not mask.all()

        assert mask.dtype._is_boolean

    def test_copy(self, data):
        """Mutating the original after ``copy`` leaves the copy unchanged."""
        # GH#27083 removing deep keyword from EA.copy
        assert data[0] != data[1]
        duplicate = data.copy()

        data[1] = data[0]
        assert duplicate[1] != duplicate[0]

View File

@@ -0,0 +1,20 @@
from io import StringIO
import numpy as np
import pytest
import pandas as pd
from .base import BaseExtensionTests
class BaseParsingTests(BaseExtensionTests):
    """CSV round-trip tests for extension dtypes."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Writing to CSV and reading back with the dtype string is lossless."""
        original = pd.DataFrame(
            {"with_dtype": pd.Series(data, dtype=str(data.dtype))}
        )
        csv_text = original.to_csv(index=False, na_rep=np.nan)

        roundtripped = pd.read_csv(
            StringIO(csv_text), dtype={"with_dtype": str(data.dtype)}, engine=engine
        )
        self.assert_frame_equal(roundtripped, original)

View File

@@ -0,0 +1,360 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.sorting import nargsort
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseMethodsTests(BaseExtensionTests):
"""Various Series and DataFrame methods."""
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
    """``value_counts`` on the first ten values matches a reference Series."""
    all_data = all_data[:10]
    # With dropna the reference is built from the non-missing values only.
    other = np.array(all_data[~all_data.isna()]) if dropna else all_data

    counted = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
    reference = pd.Series(other).value_counts(dropna=dropna).sort_index()

    self.assert_series_equal(counted, reference)
def test_count(self, data_missing):
    """Row-wise ``DataFrame.count`` gives 0 then 1 for ``data_missing``."""
    frame = pd.DataFrame({"A": data_missing})
    per_row = frame.count(axis="columns")
    self.assert_series_equal(per_row, pd.Series([0, 1]))
def test_series_count(self, data_missing):
    """``Series.count`` reports one non-missing value (GH#26835)."""
    non_missing = pd.Series(data_missing).count()
    assert non_missing == 1
def test_apply_simple_series(self, data):
    """Applying the builtin ``id`` to a Series returns a Series."""
    applied = pd.Series(data).apply(id)
    assert isinstance(applied, pd.Series)
def test_argsort(self, data_for_sorting):
    """``Series.argsort`` on ``data_for_sorting`` gives positions 2, 0, 1."""
    order = pd.Series(data_for_sorting).argsort()
    positions = np.array([2, 0, 1], dtype=np.int64)
    self.assert_series_equal(order, pd.Series(positions))
def test_argsort_missing_array(self, data_missing_for_sorting):
    """``argsort`` called directly on the array gives positions 2, 0, 1."""
    order = data_missing_for_sorting.argsort()
    wanted = np.array([2, 0, 1], dtype=np.dtype("int"))
    # Normalise both sides to int64: int32 versus int64 is irrelevant here.
    order = order.astype("int64", casting="safe")
    wanted = wanted.astype("int64", casting="safe")
    tm.assert_numpy_array_equal(order, wanted)
def test_argsort_missing(self, data_missing_for_sorting):
    """``Series.argsort`` with a missing value yields 1, -1, 0."""
    order = pd.Series(data_missing_for_sorting).argsort()
    positions = np.array([1, -1, 0], dtype=np.int64)
    self.assert_series_equal(order, pd.Series(positions))
@pytest.mark.parametrize(
    "na_position, expected",
    [
        ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
        ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
    ],
)
def test_nargsort(self, data_missing_for_sorting, na_position, expected):
    """``nargsort`` honours ``na_position`` (GH 25439)."""
    order = nargsort(data_missing_for_sorting, na_position=na_position)
    tm.assert_numpy_array_equal(order, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending):
    """``Series.sort_values`` in both directions."""
    ser = pd.Series(data_for_sorting)
    result = ser.sort_values(ascending=ascending)

    increasing = ser.iloc[[2, 0, 1]]
    # The descending expectation is the ascending one reversed.
    expected = increasing if ascending else increasing[::-1]

    self.assert_series_equal(result, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_missing(self, data_missing_for_sorting, ascending):
    """``Series.sort_values`` with a missing value, in both directions."""
    ser = pd.Series(data_missing_for_sorting)
    result = ser.sort_values(ascending=ascending)

    # Position 1 comes last in both expected orders.
    positions = [2, 0, 1] if ascending else [0, 2, 1]
    expected = ser.iloc[positions]

    self.assert_series_equal(result, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
    """``DataFrame.sort_values`` on two columns, in both directions.

    The ``ascending`` parameter is passed to ``sort_values`` so the two
    parametrizations test different orderings.
    """
    df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
    result = df.sort_values(["A", "B"], ascending=ascending)
    expected = pd.DataFrame(
        {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
    )
    if not ascending:
        # The (A, B) pairs of the three rows are all distinct, so the
        # descending order is the ascending order reversed.
        expected = expected.iloc[::-1]
    self.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
    """Uniquifying two copies of one value leaves a single element."""
    twice = data._from_sequence([data[0], data[0]])
    duplicated = box(twice)

    uniques = method(duplicated)

    assert len(uniques) == 1
    assert isinstance(uniques, type(data))
    assert uniques[0] == duplicated[0]
@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize(self, data_for_grouping, na_sentinel):
    """``pd.factorize`` returns the expected labels and unique values."""
    labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)

    # The two missing entries are labelled with the sentinel.
    wanted_labels = np.array(
        [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
    )
    wanted_uniques = data_for_grouping.take([0, 4, 7])

    tm.assert_numpy_array_equal(labels, wanted_labels)
    self.assert_extension_array_equal(uniques, wanted_uniques)
@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
    """``pd.factorize`` and the array's own ``factorize`` agree."""
    top_labels, top_uniques = pd.factorize(
        data_for_grouping, na_sentinel=na_sentinel
    )
    own_labels, own_uniques = data_for_grouping.factorize(na_sentinel=na_sentinel)

    tm.assert_numpy_array_equal(top_labels, own_labels)
    self.assert_extension_array_equal(top_uniques, own_uniques)
def test_factorize_empty(self, data):
    """Factorizing an empty array yields empty labels and empty uniques."""
    labels, uniques = pd.factorize(data[:0])

    no_labels = np.array([], dtype=np.intp)
    no_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

    tm.assert_numpy_array_equal(labels, no_labels)
    self.assert_extension_array_equal(uniques, no_uniques)
def test_fillna_copy_frame(self, data_missing):
    """``DataFrame.fillna`` returns new values rather than the same object."""
    valid_twice = data_missing.take([1, 1])
    frame = pd.DataFrame({"A": valid_twice})

    fill = frame.iloc[0, 0]
    filled = frame.fillna(fill)

    assert frame.A.values is not filled.A.values
def test_fillna_copy_series(self, data_missing):
    """``Series.fillna`` returns new values; the source keeps its array."""
    valid_twice = data_missing.take([1, 1])
    ser = pd.Series(valid_twice)

    fill = ser[0]
    filled = ser.fillna(fill)

    assert ser._values is not filled._values
    assert ser._values is valid_twice
def test_fillna_length_mismatch(self, data_missing):
    """Filling with an array of a different length raises ``ValueError``."""
    too_short = data_missing.take([1])
    msg = "Length of 'value' does not match."
    with pytest.raises(ValueError, match=msg):
        data_missing.fillna(too_short)
def test_combine_le(self, data_repeated):
    """``Series.combine`` with a ``<=`` comparison (GH 20825).

    Checked against another Series and against a scalar.
    """
    first, second = data_repeated(2)
    left = pd.Series(first)
    right = pd.Series(second)

    result = left.combine(right, lambda x1, x2: x1 <= x2)
    pairwise = [a <= b for (a, b) in zip(list(first), list(second))]
    self.assert_series_equal(result, pd.Series(pairwise))

    scalar = left.iloc[0]
    result = left.combine(scalar, lambda x1, x2: x1 <= x2)
    against_scalar = [a <= scalar for a in list(first)]
    self.assert_series_equal(result, pd.Series(against_scalar))
def test_combine_add(self, data_repeated):
    """``Series.combine`` with addition (GH 20825).

    Checked against another Series and against a scalar.
    """
    first, second = data_repeated(2)
    left = pd.Series(first)
    right = pd.Series(second)

    result = left.combine(right, lambda x1, x2: x1 + x2)
    # Overflow warnings are silenced while the expected sums are built.
    with np.errstate(over="ignore"):
        sums = [a + b for (a, b) in zip(list(first), list(second))]
        expected = pd.Series(first._from_sequence(sums))
    self.assert_series_equal(result, expected)

    scalar = left.iloc[0]
    result = left.combine(scalar, lambda x1, x2: x1 + x2)
    shifted = [a + scalar for a in list(first)]
    expected = pd.Series(first._from_sequence(shifted))
    self.assert_series_equal(result, expected)
def test_combine_first(self, data):
    """``combine_first`` of two overlapping Series yields the first five values.

    See https://github.com/pandas-dev/pandas/issues/24147
    """
    leading = pd.Series(data[:3])
    trailing = pd.Series(data[2:5], index=[2, 3, 4])

    merged = leading.combine_first(trailing)

    self.assert_series_equal(merged, pd.Series(data[:5]))
@pytest.mark.parametrize("frame", [True, False])
@pytest.mark.parametrize(
    "periods, indices",
    [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
)
def test_container_shift(self, data, frame, periods, indices):
    """Shifting a Series/DataFrame equals ``take`` with fill on the array.

    ``indices`` lists, for every output position, the source position (``-1``
    meaning "fill").  See https://github.com/pandas-dev/pandas/issues/22386
    """
    head = data[:5]
    series = pd.Series(head, name="A")
    expected = pd.Series(head.take(indices, allow_fill=True), name="A")

    if not frame:
        result = series.shift(periods)
        self.assert_series_equal(result, expected)
        return

    # Frame case: add a plain integer column and shift both columns together.
    result = series.to_frame(name="A").assign(B=1).shift(periods)
    shifted_ones = pd.Series([1] * 5, name="B").shift(periods)
    expected = pd.concat([expected, shifted_ones], axis=1)
    self.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "periods, indices",
    [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
)
def test_shift_non_empty_array(self, data, periods, indices):
    """``shift`` on a two-element array matches ``take`` with fill.

    See https://github.com/pandas-dev/pandas/issues/23911
    """
    pair = data[:2]
    shifted = pair.shift(periods)
    reference = pair.take(indices, allow_fill=True)
    self.assert_extension_array_equal(shifted, reference)
@pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
def test_shift_empty_array(self, data, periods):
    """Shifting an empty array by any amount leaves it equal to itself.

    See https://github.com/pandas-dev/pandas/issues/23911
    """
    nothing = data[:0]
    shifted = nothing.shift(periods)
    self.assert_extension_array_equal(shifted, nothing)
def test_shift_fill_value(self, data):
    """``shift`` fills the vacated positions with the supplied ``fill_value``."""
    head = data[:4]
    filler = data[0]

    # (periods, positions in ``data`` that make up the expected result);
    # position 0 stands in for every filled slot because filler is data[0].
    cases = (
        (1, [0, 0, 1, 2]),
        (-2, [2, 3, 0, 0]),
    )
    for periods, positions in cases:
        shifted = head.shift(periods, fill_value=filler)
        reference = data.take(positions)
        self.assert_extension_array_equal(shifted, reference)
def test_hash_pandas_object_works(self, data, as_frame):
    """Hashing the same Series/DataFrame twice gives equal results.

    See https://github.com/pandas-dev/pandas/issues/23066
    """
    series = pd.Series(data)
    target = series.to_frame() if as_frame else series

    first_hash = pd.util.hash_pandas_object(target)
    second_hash = pd.util.hash_pandas_object(target)

    self.assert_equal(first_hash, second_hash)
def test_searchsorted(self, data_for_sorting, as_series):
b, c, a = data_for_sorting
arr = type(data_for_sorting)._from_sequence([a, b, c])
if as_series:
arr = pd.Series(arr)
assert arr.searchsorted(a) == 0
assert arr.searchsorted(a, side="right") == 1
assert arr.searchsorted(b) == 1
assert arr.searchsorted(b, side="right") == 2
assert arr.searchsorted(c) == 2
assert arr.searchsorted(c, side="right") == 3
result = arr.searchsorted(arr.take([0, 2]))
expected = np.array([0, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
# sorter
sorter = np.array([1, 2, 0])
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
def test_where_series(self, data, na_value, as_frame):
    """``where`` on a Series (or one-column frame) with and without ``other``."""
    assert data[0] != data[1]
    cls = type(data)
    a, b = data[:2]

    def build(scalars):
        # Every array in this test shares the fixture's dtype.
        return cls._from_sequence(scalars, dtype=data.dtype)

    subject = pd.Series(build([a, a, b, b]))
    mask = np.array([True, True, False, False])
    if as_frame:
        subject = subject.to_frame(name="a")
        mask = mask.reshape(-1, 1)

    # No ``other``: masked-out positions become the NA value.
    result = subject.where(mask)
    expected = pd.Series(build([a, a, na_value, na_value]))
    if as_frame:
        expected = expected.to_frame(name="a")
    self.assert_equal(result, expected)

    # array other
    mask = np.array([True, False, True, True])
    replacement = build([a, b, a, b])
    if as_frame:
        replacement = pd.DataFrame({"a": replacement})
        mask = pd.DataFrame({"a": mask})
    result = subject.where(mask, replacement)
    expected = pd.Series(build([a, b, b, b]))
    if as_frame:
        expected = expected.to_frame(name="a")
    self.assert_equal(result, expected)
@pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
def test_repeat(self, data, repeats, as_series, use_numpy):
    """``repeat`` (method or ``np.repeat``) matches an element-wise expansion."""
    first_three = type(data)._from_sequence(data[:3], dtype=data.dtype)
    subject = pd.Series(first_three) if as_series else first_three

    if use_numpy:
        result = np.repeat(subject, repeats)
    else:
        result = subject.repeat(repeats)

    # A scalar count applies to each of the three elements.
    counts = [repeats] * 3 if isinstance(repeats, int) else repeats
    scalars = []
    for element, count in zip(subject, counts):
        scalars.extend(element for _ in range(count))

    expected = type(data)._from_sequence(scalars, dtype=data.dtype)
    if as_series:
        expected = pd.Series(expected, index=subject.index.repeat(counts))
    self.assert_equal(result, expected)
@pytest.mark.parametrize(
    "repeats, kwargs, error, msg",
    [
        (2, {"axis": 1}, ValueError, "'axis"),
        (-1, {}, ValueError, "negative"),
        ([1, 2], {}, ValueError, "shape"),
        (2, {"foo": "bar"}, TypeError, "'foo'"),
    ],
)
def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
    """Invalid ``repeat`` arguments raise the expected error and message."""
    with pytest.raises(error, match=msg):
        if not use_numpy:
            data.repeat(repeats, **kwargs)
        else:
            np.repeat(data, repeats, **kwargs)

View File

@@ -0,0 +1,129 @@
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseMissingTests(BaseExtensionTests):
    """Missing-data tests (``isna`` / ``dropna`` / ``fillna``).

    The tests index the ``data_missing`` fixture assuming a length-2 array
    laid out as ``[NA, valid]``.
    """

    def test_isna(self, data_missing):
        """``isna`` flags exactly the first element, for arrays and Series."""
        expected = np.array([True, False])

        result = pd.isna(data_missing)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189: isna on a Series with every row dropped.
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=bool)
        self.assert_series_equal(result, expected)

    def test_dropna_array(self, data_missing):
        """``dropna`` on the array keeps only the element at position 1."""
        result = data_missing.dropna()
        expected = data_missing[[1]]
        self.assert_extension_array_equal(result, expected)

    def test_dropna_series(self, data_missing):
        """``Series.dropna`` keeps only the row at position 1."""
        ser = pd.Series(data_missing)
        result = ser.dropna()
        expected = ser.iloc[[1]]
        self.assert_series_equal(result, expected)

    def test_dropna_frame(self, data_missing):
        """``DataFrame.dropna`` by rows, by columns and with a second column."""
        df = pd.DataFrame({"A": data_missing})

        # defaults
        result = df.dropna()
        expected = df.iloc[[1]]
        self.assert_frame_equal(result, expected)

        # axis = 1
        result = df.dropna(axis="columns")
        expected = pd.DataFrame(index=[0, 1])
        self.assert_frame_equal(result, expected)

        # multiple: each row has a missing value in one of the two columns
        df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
        result = df.dropna()
        expected = df.iloc[:0]
        self.assert_frame_equal(result, expected)

    def test_fillna_scalar(self, data_missing):
        """Filling with a valid scalar replaces every missing value."""
        valid = data_missing[1]
        result = data_missing.fillna(valid)

        # Independent checks.  Comparing only against a second, identical
        # ``fillna`` call (as this test used to) cannot detect wrong values.
        # They deliberately avoid comparing dtypes.
        assert len(result) == len(data_missing)
        assert not np.asarray(result.isna()).any()
        assert result[0] == valid
        assert result[1] == valid

        # The same call must also give the same answer twice.
        expected = data_missing.fillna(valid)
        self.assert_extension_array_equal(result, expected)

    def test_fillna_limit_pad(self, data_missing):
        """Forward fill with ``limit=2`` leaves the third gap unfilled."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).fillna(method="ffill", limit=2)
        expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
        self.assert_series_equal(result, expected)

    def test_fillna_limit_backfill(self, data_missing):
        """Backward fill with ``limit=2`` leaves the first gap unfilled."""
        arr = data_missing.take([1, 0, 0, 0, 1])
        result = pd.Series(arr).fillna(method="backfill", limit=2)
        expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
        self.assert_series_equal(result, expected)

    def test_fillna_series(self, data_missing):
        """``Series.fillna`` with a scalar, a filled Series and itself."""
        fill_value = data_missing[1]
        ser = pd.Series(data_missing)

        result = ser.fillna(fill_value)
        expected = pd.Series(
            data_missing._from_sequence(
                [fill_value, fill_value], dtype=data_missing.dtype
            )
        )
        self.assert_series_equal(result, expected)

        # Fill with a series
        result = ser.fillna(expected)
        self.assert_series_equal(result, expected)

        # Fill with a series not affecting the missing values
        result = ser.fillna(ser)
        self.assert_series_equal(result, ser)

    def test_fillna_series_method(self, data_missing, fillna_method):
        """``Series.fillna(method=...)`` fills the single gap in either direction."""
        fill_value = data_missing[1]

        if fillna_method == "ffill":
            # Reverse so the valid value comes first and can be carried forward.
            data_missing = data_missing[::-1]

        result = pd.Series(data_missing).fillna(method=fillna_method)
        expected = pd.Series(
            data_missing._from_sequence(
                [fill_value, fill_value], dtype=data_missing.dtype
            )
        )
        self.assert_series_equal(result, expected)

    def test_fillna_frame(self, data_missing):
        """``DataFrame.fillna`` fills the extension column and keeps the other."""
        fill_value = data_missing[1]

        result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)

        expected = pd.DataFrame(
            {
                "A": data_missing._from_sequence(
                    [fill_value, fill_value], dtype=data_missing.dtype
                ),
                "B": [1, 2],
            }
        )
        self.assert_frame_equal(result, expected)

    def test_fillna_fill_other(self, data):
        """Filling only column ``B`` via a dict leaves extension column ``A`` alone."""
        result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})
        expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})
        self.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,173 @@
import operator
import pytest
import pandas as pd
from pandas.core import ops
from .base import BaseExtensionTests
class BaseOpsUtil(BaseExtensionTests):
    """Helpers shared by the arithmetic and comparison operator test suites."""

    def get_op_from_name(self, op_name):
        """Translate a dunder name such as ``"__add__"`` into a callable.

        The underscores are stripped and the name is looked up in the
        ``operator`` module.  If that fails the name is treated as a reflected
        operator: its first character is dropped and the operands are swapped.
        """
        bare_name = op_name.strip("_")
        forward = getattr(operator, bare_name, None)
        if forward is not None:
            return forward

        # Assume it is the reverse operator
        reflected = getattr(operator, bare_name[1:])

        def swapped(x, y):
            return reflected(y, x)

        return swapped

    def check_opname(self, s, op_name, other, exc=Exception):
        """Resolve ``op_name`` to a callable and delegate to ``_check_op``."""
        resolved = self.get_op_from_name(op_name)
        self._check_op(s, resolved, other, op_name, exc)

    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        """Apply ``op`` and either compare with ``combine`` or expect ``exc``.

        With ``exc=None`` the operation must succeed and equal
        ``s.combine(other, op)``; otherwise it must raise ``exc``.
        """
        if exc is not None:
            with pytest.raises(exc):
                op(s, other)
            return

        observed = op(s, other)
        reference = s.combine(other, op)
        self.assert_series_equal(observed, reference)

    def _check_divmod_op(self, s, op, other, exc=Exception):
        """Check a divmod-style ``op`` against ``//`` and ``%``.

        divmod has multiple return values, so both are checked separately.
        """
        if exc is not None:
            # NOTE(review): this branch calls the builtin ``divmod`` directly
            # and does not use ``op``.
            with pytest.raises(exc):
                divmod(s, other)
            return

        quotient, remainder = op(s, other)
        if op is divmod:
            wanted_quotient, wanted_remainder = s // other, s % other
        else:
            # Reflected form: the roles of the operands are swapped.
            wanted_quotient, wanted_remainder = other // s, other % s
        self.assert_series_equal(quotient, wanted_quotient)
        self.assert_series_equal(remainder, wanted_remainder)
class BaseArithmeticOpsTests(BaseOpsUtil):
    """Arithmetic operator tests for Series and DataFrame.

    Each class attribute below names the exception an operation of that kind
    is expected to raise.  Subclasses that support a kind of operation should
    override the matching attribute (the helpers treat ``None`` as "must
    succeed"):

    * series_scalar_exc = TypeError
    * frame_scalar_exc = TypeError
    * series_array_exc = TypeError
    * divmod_exc = TypeError
    """

    series_scalar_exc = TypeError
    frame_scalar_exc = TypeError
    series_array_exc = TypeError
    divmod_exc = TypeError

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Series combined with one of its own scalars."""
        series = pd.Series(data)
        self.check_opname(
            series,
            all_arithmetic_operators,
            series.iloc[0],
            exc=self.series_scalar_exc,
        )

    @pytest.mark.xfail(run=False, reason="_reduce needs implementation")
    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
        """DataFrame combined with a scalar taken from the data."""
        frame = pd.DataFrame({"A": data})
        self.check_opname(
            frame, all_arithmetic_operators, data[0], exc=self.frame_scalar_exc
        )

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Series combined with a same-length Series of its first scalar."""
        series = pd.Series(data)
        constant = pd.Series([series.iloc[0]] * len(series))
        self.check_opname(
            series, all_arithmetic_operators, constant, exc=self.series_array_exc
        )

    def test_divmod(self, data):
        """``divmod`` and its reflected form against the scalar ``1``."""
        series = pd.Series(data)
        self._check_divmod_op(series, divmod, 1, exc=self.divmod_exc)
        self._check_divmod_op(1, ops.rdivmod, series, exc=self.divmod_exc)

    def test_divmod_series_array(self, data, data_for_twos):
        """``divmod`` between a Series and arrays/Series.

        Uses the helper's default ``exc``.
        """
        series = pd.Series(data)
        self._check_divmod_op(series, divmod, data)

        twos = data_for_twos
        self._check_divmod_op(twos, ops.rdivmod, series)

        twos = pd.Series(twos)
        self._check_divmod_op(twos, ops.rdivmod, series)

    def test_add_series_with_extension_array(self, data):
        """``Series + array`` equals a Series built from ``array + array``."""
        series = pd.Series(data)
        total = series + data
        reference = pd.Series(data + data)
        self.assert_series_equal(total, reference)

    def test_error(self, data, all_arithmetic_operators):
        """Looking up an arithmetic dunder on the array raises ``AttributeError``."""
        # invalid ops
        with pytest.raises(AttributeError):
            getattr(data, all_arithmetic_operators)

    def test_direct_arith_with_series_returns_not_implemented(self, data):
        """``array.__add__(Series)`` must return ``NotImplemented``.

        EAs should return NotImplemented for ops with Series; pandas takes
        care of unboxing the series and calling the EA's op.
        """
        other = pd.Series(data)
        if not hasattr(data, "__add__"):
            raise pytest.skip(
                "{} does not implement add".format(data.__class__.__name__)
            )
        outcome = data.__add__(other)
        assert outcome is NotImplemented
class BaseComparisonOpsTests(BaseOpsUtil):
    """Comparison operator tests for Series and DataFrame."""

    def _compare_other(self, s, data, op_name, other):
        """Check one comparison operator against ``other``.

        The array's own dunder must return ``NotImplemented``.  At Series
        level ``__eq__`` must be nowhere true, ``__ne__`` everywhere true, and
        every other operator must raise ``TypeError``.
        """
        op = self.get_op_from_name(op_name)

        # array: identical expectation for every operator
        assert getattr(data, op_name)(other) is NotImplemented

        if op_name == "__eq__":
            assert not op(s, other).all()
            return
        if op_name == "__ne__":
            assert op(s, other).all()
            return

        # series: ordering comparisons are expected to fail
        s = pd.Series(data)
        with pytest.raises(TypeError):
            op(s, other)

    def test_compare_scalar(self, data, all_compare_operators):
        """Compare against the scalar ``0``."""
        series = pd.Series(data)
        self._compare_other(series, data, all_compare_operators, 0)

    def test_compare_array(self, data, all_compare_operators):
        """Compare against a same-length Series of the first scalar."""
        series = pd.Series(data)
        constant = pd.Series([data[0]] * len(data))
        self._compare_other(series, data, all_compare_operators, constant)

    def test_direct_arith_with_series_returns_not_implemented(self, data):
        """``array.__eq__(Series)`` must return ``NotImplemented``.

        EAs should return NotImplemented for ops with Series; pandas takes
        care of unboxing the series and calling the EA's op.
        """
        other = pd.Series(data)
        if not hasattr(data, "__eq__"):
            raise pytest.skip(
                "{} does not implement __eq__".format(data.__class__.__name__)
            )
        outcome = data.__eq__(other)
        assert outcome is NotImplemented

View File

@@ -0,0 +1,43 @@
import io
import pytest
import pandas as pd
from .base import BaseExtensionTests
class BasePrintingTests(BaseExtensionTests):
    """Checks on how an extension array is formatted when printed."""

    @pytest.mark.parametrize("size", ["big", "small"])
    def test_array_repr(self, data, size):
        """The array repr names the class, the length and the dtype.

        The "big" variant (five concatenated copies) must also be truncated
        with an ellipsis.
        """
        if size == "small":
            data = data[:5]
        else:
            data = type(data)._concat_same_type([data] * 5)

        text = repr(data)

        required_fragments = (
            data.__class__.__name__,
            "Length: {}".format(len(data)),
            str(data.dtype),
        )
        for fragment in required_fragments:
            assert fragment in text
        if size == "big":
            assert "..." in text

    def test_array_repr_unicode(self, data):
        """``str(array)`` returns a ``str``."""
        text = str(data)
        assert isinstance(text, str)

    def test_series_repr(self, data):
        """The Series repr mentions the dtype name."""
        series = pd.Series(data)
        assert data.dtype.name in repr(series)

    def test_dataframe_repr(self, data):
        """Building a DataFrame repr does not raise."""
        frame = pd.DataFrame({"A": data})
        repr(frame)

    def test_dtype_name_in_info(self, data):
        """``DataFrame.info`` output mentions the dtype name."""
        sink = io.StringIO()
        pd.DataFrame({"A": data}).info(buf=sink)
        report = sink.getvalue()
        assert data.dtype.name in report

View File

@@ -0,0 +1,60 @@
import warnings
import pytest
import pandas as pd
import pandas.util.testing as tm
from .base import BaseExtensionTests
class BaseReduceTests(BaseExtensionTests):
    """
    Reduction specific tests. Generally these only
    make sense for numeric/boolean operations.
    """

    def check_reduce(self, s, op_name, skipna):
        """Compare reduction ``op_name`` on ``s`` with the same on float64 data."""
        observed = getattr(s, op_name)(skipna=skipna)
        as_float = s.astype("float64")
        reference = getattr(as_float, op_name)(skipna=skipna)
        tm.assert_almost_equal(observed, reference)
class BaseNoReduceTests(BaseReduceTests):
    """Tests for arrays that define no reductions: every one must raise."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        """Numeric reductions on the Series raise ``TypeError``."""
        series = pd.Series(data)
        reduction = all_numeric_reductions

        with pytest.raises(TypeError):
            getattr(series, reduction)(skipna=skipna)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        """Boolean reductions on the Series raise ``TypeError``."""
        series = pd.Series(data)
        reduction = all_boolean_reductions

        with pytest.raises(TypeError):
            getattr(series, reduction)(skipna=skipna)
class BaseNumericReduceTests(BaseReduceTests):
    """Numeric reductions, checked against the float64 equivalent."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna):
        """Run ``check_reduce`` for one numeric reduction."""
        series = pd.Series(data)

        # min/max with empty produce numpy warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            self.check_reduce(series, all_numeric_reductions, skipna)
class BaseBooleanReduceTests(BaseReduceTests):
    """Boolean reductions, checked against the float64 equivalent."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        """Run ``check_reduce`` for one boolean reduction."""
        series = pd.Series(data)
        self.check_reduce(series, all_boolean_reductions, skipna)

View File

@@ -0,0 +1,297 @@
import itertools
import numpy as np
import pytest
import pandas as pd
from pandas.core.internals import ExtensionBlock
from .base import BaseExtensionTests
class BaseReshapingTests(BaseExtensionTests):
    """Reshaping and concatenation tests for extension arrays."""

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat(self, data, in_frame):
        """Concatenating a container with itself keeps the extension dtype."""
        series = pd.Series(data)
        container = pd.DataFrame(series) if in_frame else series

        doubled = pd.concat([container, container], ignore_index=True)
        assert len(doubled) == len(data) * 2

        observed_dtype = doubled.dtypes[0] if in_frame else doubled.dtype
        assert observed_dtype == data.dtype
        assert isinstance(doubled._data.blocks[0], ExtensionBlock)

    @pytest.mark.parametrize("in_frame", [True, False])
    def test_concat_all_na_block(self, data_missing, in_frame):
        """Concatenating a block built from element 1 with one built from element 0."""
        valid_part = pd.Series(data_missing.take([1, 1]), index=[0, 1])
        missing_part = pd.Series(data_missing.take([0, 0]), index=[2, 3])
        if in_frame:
            valid_part = pd.DataFrame({"a": valid_part})
            missing_part = pd.DataFrame({"a": missing_part})

        joined = pd.concat([valid_part, missing_part])
        all_values = data_missing.take([1, 1, 0, 0])

        if not in_frame:
            self.assert_series_equal(joined, pd.Series(all_values))
            return
        self.assert_frame_equal(joined, pd.DataFrame({"a": all_values}))

    def test_concat_mixed_dtypes(self, data):
        """Mixing the extension dtype with other dtypes falls back to object.

        See https://github.com/pandas-dev/pandas/issues/20762
        """
        ext_frame = pd.DataFrame({"A": data[:3]})
        int_frame = pd.DataFrame({"A": [1, 2, 3]})
        cat_frame = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
        frames = [ext_frame, int_frame, cat_frame]

        # dataframes
        result = pd.concat(frames)
        expected = pd.concat([frame.astype(object) for frame in frames])
        self.assert_frame_equal(result, expected)

        # series
        result = pd.concat([frame["A"] for frame in frames])
        expected = pd.concat([frame["A"].astype(object) for frame in frames])
        self.assert_series_equal(result, expected)

        # simple test for just EA and one other
        pair = [ext_frame, int_frame]
        result = pd.concat(pair)
        expected = pd.concat([frame.astype("object") for frame in pair])
        self.assert_frame_equal(result, expected)

        result = pd.concat([frame["A"] for frame in pair])
        expected = pd.concat([frame["A"].astype("object") for frame in pair])
        self.assert_series_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Column-wise concat, with aligned and non-aligned indexes."""
        ext_frame = pd.DataFrame({"A": data[:3]})
        int_frame = pd.DataFrame({"B": [1, 2, 3]})

        expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
        result = pd.concat([ext_frame, int_frame], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([ext_frame["A"], int_frame["B"]], axis=1)
        self.assert_frame_equal(result, expected)

        # non-aligned: the integer frame is offset by one label, so each
        # column gains one missing entry.
        int_frame = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
        padded = data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype)
        expected = pd.DataFrame({"A": padded, "B": [np.nan, 1, 2, 3]})

        result = pd.concat([ext_frame, int_frame], axis=1)
        self.assert_frame_equal(result, expected)
        result = pd.concat([ext_frame["A"], int_frame["B"]], axis=1)
        self.assert_frame_equal(result, expected)

    def test_align(self, data, na_value):
        """``Series.align`` pads both sides with the NA value."""
        left_values = data[:3]
        right_values = data[2:5]
        aligned_left, aligned_right = pd.Series(left_values).align(
            pd.Series(right_values, index=[1, 2, 3])
        )

        # Assumes that the ctor can take a list of scalars of the type
        wanted_left = pd.Series(
            data._from_sequence(list(left_values) + [na_value], dtype=data.dtype)
        )
        wanted_right = pd.Series(
            data._from_sequence([na_value] + list(right_values), dtype=data.dtype)
        )
        self.assert_series_equal(aligned_left, wanted_left)
        self.assert_series_equal(aligned_right, wanted_right)

    def test_align_frame(self, data, na_value):
        """``DataFrame.align`` pads both sides with the NA value."""
        left_values = data[:3]
        right_values = data[2:5]
        aligned_left, aligned_right = pd.DataFrame({"A": left_values}).align(
            pd.DataFrame({"A": right_values}, index=[1, 2, 3])
        )

        # Assumes that the ctor can take a list of scalars of the type
        wanted_left = pd.DataFrame(
            {
                "A": data._from_sequence(
                    list(left_values) + [na_value], dtype=data.dtype
                )
            }
        )
        wanted_right = pd.DataFrame(
            {
                "A": data._from_sequence(
                    [na_value] + list(right_values), dtype=data.dtype
                )
            }
        )
        self.assert_frame_equal(aligned_left, wanted_left)
        self.assert_frame_equal(aligned_right, wanted_right)

    def test_align_series_frame(self, data, na_value):
        """Aligning a Series with a frame that is one row longer.

        See https://github.com/pandas-dev/pandas/issues/20576
        """
        series = pd.Series(data, name="a")
        frame = pd.DataFrame({"col": np.arange(len(series) + 1)})

        aligned_series, aligned_frame = series.align(frame)

        wanted_series = pd.Series(
            data._from_sequence(list(data) + [na_value], dtype=data.dtype),
            name=series.name,
        )
        self.assert_series_equal(aligned_series, wanted_series)
        self.assert_frame_equal(aligned_frame, frame)

    def test_set_frame_expand_regular_with_extension(self, data):
        """Adding an extension column to a frame of regular data."""
        ones = [1] * len(data)
        frame = pd.DataFrame({"A": ones})
        frame["B"] = data
        wanted = pd.DataFrame({"A": [1] * len(data), "B": data})
        self.assert_frame_equal(frame, wanted)

    def test_set_frame_expand_extension_with_regular(self, data):
        """Adding a regular column to a frame of extension data."""
        frame = pd.DataFrame({"A": data})
        frame["B"] = [1] * len(data)
        wanted = pd.DataFrame({"A": data, "B": [1] * len(data)})
        self.assert_frame_equal(frame, wanted)

    def test_set_frame_overwrite_object(self, data):
        """Overwriting an object column takes on the extension dtype.

        See https://github.com/pandas-dev/pandas/issues/20555
        """
        frame = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
        frame["A"] = data
        assert frame.dtypes["A"] == data.dtype

    def test_merge(self, data, na_value):
        """Inner and outer merge keep the extension column (GH-20743)."""
        column_order = ["ext", "int1", "key", "int2"]
        left = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
        right = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})

        # Default (inner) merge.
        merged = pd.merge(left, right)
        wanted = pd.DataFrame(
            {
                "int1": [1, 1, 2],
                "int2": [1, 2, 3],
                "key": [0, 0, 1],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1]], dtype=data.dtype
                ),
            }
        )
        self.assert_frame_equal(merged, wanted[column_order])

        # Outer merge: unmatched rows get missing values.
        merged = pd.merge(left, right, how="outer")
        wanted = pd.DataFrame(
            {
                "int1": [1, 1, 2, 3, np.nan],
                "int2": [1, 2, 3, np.nan, 4],
                "key": [0, 0, 1, 2, 3],
                "ext": data._from_sequence(
                    [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
                ),
            }
        )
        self.assert_frame_equal(merged, wanted[column_order])

    def test_merge_on_extension_array(self, data):
        """Merging on an extension-array key column (GH 23020)."""
        a, b = data[:2]
        key = type(data)._from_sequence([a, b], dtype=data.dtype)
        frame = pd.DataFrame({"key": key, "val": [1, 2]})

        merged = pd.merge(frame, frame, on="key")
        wanted = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
        self.assert_frame_equal(merged, wanted)

        # order
        merged = pd.merge(frame.iloc[[1, 0]], frame, on="key")
        wanted = wanted.iloc[[1, 0]].reset_index(drop=True)
        self.assert_frame_equal(merged, wanted)

    def test_merge_on_extension_array_duplicates(self, data):
        """Merging on an extension-array key with a repeated value (GH 23020)."""
        a, b = data[:2]
        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
        left = pd.DataFrame({"key": key, "val": [1, 2, 3]})
        right = pd.DataFrame({"key": key, "val": [1, 2, 3]})

        merged = pd.merge(left, right, on="key")

        wanted = pd.DataFrame(
            {
                "key": key.take([0, 0, 0, 0, 1]),
                "val_x": [1, 1, 3, 3, 2],
                "val_y": [1, 3, 1, 3, 2],
            }
        )
        self.assert_frame_equal(merged, wanted)

    @pytest.mark.parametrize(
        "columns",
        [
            ["A", "B"],
            pd.MultiIndex.from_tuples(
                [("A", "a"), ("A", "b")], names=["outer", "inner"]
            ),
        ],
    )
    def test_stack(self, data, columns):
        """``stack`` keeps the extension dtype and matches the object result."""
        frame = pd.DataFrame({"A": data[:5], "B": data[:5]})
        frame.columns = columns

        stacked = frame.stack()
        reference = frame.astype(object).stack()
        # we need a second astype(object), in case the constructor inferred
        # object -> specialized, as is done for period.
        reference = reference.astype(object)

        first_column_dtype = frame.iloc[:, 0].dtype
        if isinstance(reference, pd.Series):
            assert stacked.dtype == first_column_dtype
        else:
            assert all(stacked.dtypes == first_column_dtype)

        stacked = stacked.astype(object)
        self.assert_equal(stacked, reference)

    @pytest.mark.parametrize(
        "index",
        [
            # Two levels, uniform.
            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
            # non-uniform
            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
            # three levels, non-uniform
            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
            pd.MultiIndex.from_tuples(
                [
                    ("A", "a", 1),
                    ("A", "b", 0),
                    ("A", "a", 0),
                    ("B", "a", 0),
                    ("B", "c", 1),
                ]
            ),
        ],
    )
    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, data, index, obj):
        """``unstack`` over every ordered level selection keeps the array type."""
        data = data[: len(index)]
        if obj == "series":
            container = pd.Series(data, index=index)
        else:
            container = pd.DataFrame({"A": data, "B": data}, index=index)

        level_count = index.nlevels
        level_numbers = list(range(level_count))
        # For three levels, [0, 1, 2], this yields
        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        selections = itertools.chain.from_iterable(
            itertools.permutations(level_numbers, size)
            for size in range(1, level_count)
        )

        for level in selections:
            unstacked = container.unstack(level=level)
            assert all(
                isinstance(unstacked[col].array, type(data))
                for col in unstacked.columns
            )
            reference = container.astype(object).unstack(level=level)
            unstacked = unstacked.astype(object)

            self.assert_frame_equal(unstacked, reference)

    def test_ravel(self, data):
        """``ravel`` returns the same type and a view of the data."""
        # as long as EA is 1D-only, ravel is a no-op
        flattened = data.ravel()
        assert type(flattened) == type(data)

        # Check that we have a view, not a copy
        flattened[0] = flattened[1]
        assert data[0] == data[1]
View File

@@ -0,0 +1,188 @@
import operator
import numpy as np
import pytest
import pandas as pd
from .base import BaseExtensionTests
class BaseSetitemTests(BaseExtensionTests):
    """``__setitem__`` tests for extension arrays, Series and DataFrames."""

    def test_setitem_scalar_series(self, data, box_in_series):
        """Assigning one scalar by position, on the array or a Series."""
        if box_in_series:
            data = pd.Series(data)
        data[0] = data[1]
        assert data[0] == data[1]

    def test_setitem_sequence(self, data, box_in_series):
        """Assigning a list to a list indexer swaps the first two elements."""
        if box_in_series:
            data = pd.Series(data)
        original = data.copy()

        data[[0, 1]] = [data[1], data[0]]
        assert data[0] == original[1]
        assert data[1] == original[0]

    def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
        """A value shorter than the indexer raises and changes nothing."""
        ser = pd.Series(data)
        original = ser.copy()
        value = [data[0]]
        if as_array:
            value = data._from_sequence(value)

        xpr = "cannot set using a {} indexer with a different length"
        with pytest.raises(ValueError, match=xpr.format("list-like")):
            ser[[0, 1]] = value
        # Ensure no modifications made before the exception
        self.assert_series_equal(ser, original)

        with pytest.raises(ValueError, match=xpr.format("slice")):
            ser[slice(3)] = value
        self.assert_series_equal(ser, original)

    def test_setitem_empty_indxer(self, data, box_in_series):
        """Assigning an empty list to an empty indexer is a no-op."""
        if box_in_series:
            data = pd.Series(data)
        original = data.copy()
        data[np.array([], dtype=int)] = []
        self.assert_equal(data, original)

    def test_setitem_sequence_broadcasts(self, data, box_in_series):
        """A scalar assigned to a list indexer is broadcast to each position."""
        if box_in_series:
            data = pd.Series(data)
        data[[0, 1]] = data[2]
        assert data[0] == data[2]
        assert data[1] == data[2]

    @pytest.mark.parametrize("setter", ["loc", "iloc"])
    def test_setitem_scalar(self, data, setter):
        """Scalar assignment through ``Series.loc`` / ``Series.iloc``."""
        arr = pd.Series(data)
        setter = getattr(arr, setter)
        operator.setitem(setter, 0, data[1])
        assert arr[0] == data[1]

    def test_setitem_loc_scalar_mixed(self, data):
        """``loc`` scalar assignment in a frame with an integer column."""
        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
        df.loc[0, "B"] = data[1]
        assert df.loc[0, "B"] == data[1]

    def test_setitem_loc_scalar_single(self, data):
        """``loc`` scalar assignment in a single-column frame."""
        df = pd.DataFrame({"B": data})
        df.loc[10, "B"] = data[1]
        assert df.loc[10, "B"] == data[1]

    def test_setitem_loc_scalar_multiple_homogoneous(self, data):
        """``loc`` scalar assignment in a frame with two extension columns."""
        df = pd.DataFrame({"A": data, "B": data})
        df.loc[10, "B"] = data[1]
        assert df.loc[10, "B"] == data[1]

    def test_setitem_iloc_scalar_mixed(self, data):
        """``iloc`` scalar assignment in a frame with an integer column."""
        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
        df.iloc[0, 1] = data[1]
        assert df.loc[0, "B"] == data[1]

    def test_setitem_iloc_scalar_single(self, data):
        """``iloc`` scalar assignment in a single-column frame."""
        df = pd.DataFrame({"B": data})
        df.iloc[10, 0] = data[1]
        assert df.loc[10, "B"] == data[1]

    def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
        """``iloc`` scalar assignment in a frame with two extension columns."""
        df = pd.DataFrame({"A": data, "B": data})
        df.iloc[10, 1] = data[1]
        assert df.loc[10, "B"] == data[1]

    @pytest.mark.parametrize("as_callable", [True, False])
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_aligned(self, data, as_callable, setter):
        """Assign a two-element array through a boolean mask (or callable)."""
        ser = pd.Series(data)
        mask = np.zeros(len(data), dtype=bool)
        mask[:2] = True

        if as_callable:
            mask2 = lambda x: mask
        else:
            mask2 = mask

        if setter:
            # loc
            target = getattr(ser, setter)
        else:
            # Series.__setitem__
            target = ser

        # Assign ONLY through the parametrized target.  A second plain
        # ``ser[mask2] = ...`` used to follow here, which made the test pass
        # even if the ``loc`` path had done nothing.
        operator.setitem(target, mask2, data[5:7])

        assert ser[0] == data[5]
        assert ser[1] == data[6]

    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        """Assign one scalar through a boolean mask; it is broadcast."""
        ser = pd.Series(data)
        mask = np.zeros(len(data), dtype=bool)
        mask[:2] = True

        if setter:  # loc
            target = getattr(ser, setter)
        else:  # __setitem__
            target = ser

        operator.setitem(target, mask, data[10])
        assert ser[0] == data[10]
        assert ser[1] == data[10]

    def test_setitem_expand_columns(self, data):
        """Adding and then overwriting a column next to an extension column."""
        df = pd.DataFrame({"A": data})

        result = df.copy()
        result["B"] = 1
        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
        self.assert_frame_equal(result, expected)

        result = df.copy()
        result.loc[:, "B"] = 1
        self.assert_frame_equal(result, expected)

        # overwrite with new type
        result["B"] = data
        expected = pd.DataFrame({"A": data, "B": data})
        self.assert_frame_equal(result, expected)

    def test_setitem_expand_with_extension(self, data):
        """Adding an extension column via ``[]`` and via ``loc``."""
        df = pd.DataFrame({"A": [1] * len(data)})

        result = df.copy()
        result["B"] = data
        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
        self.assert_frame_equal(result, expected)

        result = df.copy()
        result.loc[:, "B"] = data
        self.assert_frame_equal(result, expected)

    def test_setitem_frame_invalid_length(self, data):
        """A column shorter than the frame's index raises ``ValueError``."""
        df = pd.DataFrame({"A": [1] * len(data)})
        xpr = "Length of values does not match length of index"
        with pytest.raises(ValueError, match=xpr):
            df["B"] = data[:5]

    @pytest.mark.xfail(reason="GH#20441: setitem on extension types.")
    def test_setitem_tuple_index(self, data):
        """Scalar assignment by a tuple label (currently expected to fail)."""
        s = pd.Series(data[:2], index=[(0, 0), (0, 1)])
        expected = pd.Series(data.take([1, 1]), index=s.index)
        s[(0, 1)] = data[1]
        self.assert_series_equal(s, expected)

    def test_setitem_slice_mismatch_length_raises(self, data):
        """Assigning two values to a one-element slice raises ``ValueError``."""
        arr = data[:5]
        with pytest.raises(ValueError):
            arr[:1] = arr[:2]

    def test_setitem_slice_array(self, data):
        """Assigning an equal-length array to a full slice replaces all values."""
        arr = data[:5].copy()
        arr[:5] = data[-5:]
        self.assert_extension_array_equal(arr, data[-5:])

    def test_setitem_scalar_key_sequence_raise(self, data):
        """Assigning a two-element array to a scalar key raises ``ValueError``."""
        arr = data[:5].copy()
        with pytest.raises(ValueError):
            arr[0] = arr[[0, 1]]

View File

@@ -0,0 +1,178 @@
import operator
import pytest
from pandas import Series
@pytest.fixture
def dtype():
    """Supply the ExtensionDtype under test; this default just raises."""
    raise NotImplementedError
@pytest.fixture
def data():
    """Supply a length-100 array of the type under test.

    Requirements:

    * ``data[0]`` and ``data[1]`` are both non-missing
    * ``data[0]`` and ``data[1]`` are not equal

    This default just raises.
    """
    raise NotImplementedError
@pytest.fixture
def data_for_twos():
    """Supply a length-100 array whose every element is two; default raises."""
    raise NotImplementedError
@pytest.fixture
def data_missing():
    """Supply a length-2 array laid out as ``[NA, Valid]``; default raises."""
    raise NotImplementedError
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture yielding ``data`` and then ``data_missing``."""
    by_name = {"data": data, "data_missing": data_missing}
    # Any other parameter value falls through to ``None``.
    return by_name.get(request.param)
@pytest.fixture
def data_repeated(data):
    """
    Build a factory that hands out the ``data`` fixture several times.

    Parameters
    ----------
    data : fixture implementing `data`

    Returns
    -------
    Callable[[int], Generator]:
        A callable taking a `count` argument; the generator it returns
        yields the same `data` object `count` times.
    """

    def repeat_data(count):
        yield from (data for _ in range(count))

    return repeat_data
@pytest.fixture
def data_for_sorting():
    """Supply a length-3 array with a known sort order.

    The three items must be given as ``[B, C, A]`` where ``A < B < C``.
    This default just raises.
    """
    raise NotImplementedError
@pytest.fixture
def data_missing_for_sorting():
    """Supply a length-3 array with a known sort order and one missing value.

    The three items must be given as ``[B, NA, A]`` where ``A < B`` and ``NA``
    is missing.  This default just raises.
    """
    raise NotImplementedError
@pytest.fixture
def na_cmp():
    """Supply the binary predicate used to compare NA values.

    The returned function takes two arguments and should return True when
    both are (scalar) NA for the type under test.  The default is
    ``operator.is_``.
    """
    return operator.is_
@pytest.fixture
def na_value():
    """Supply the scalar missing value for the type; defaults to ``None``."""
    return None
@pytest.fixture
def data_for_grouping():
    """Supply data for factorization, grouping and unique tests.

    Expected layout: ``[B, B, NA, NA, A, A, B, C]`` where ``A < B < C`` and
    ``NA`` is missing.  This default just raises.
    """
    raise NotImplementedError
@pytest.fixture(params=[True, False])
def box_in_series(request):
    """Boolean fixture: should the data be wrapped in a Series?"""
    return request.param
@pytest.fixture(
    params=[
        lambda x: 1,
        lambda x: [1] * len(x),
        lambda x: Series([1] * len(x)),
        lambda x: x,
    ],
    ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
    """
    Functions to pass to groupby.apply().

    In order they return a scalar, a list, a Series and the input itself.
    """
    return request.param
@pytest.fixture(params=[True, False])
def as_frame(request):
    """
    Boolean fixture for comparing a Series with its ``Series.to_frame()`` form.
    """
    return request.param
@pytest.fixture(params=[True, False])
def as_series(request):
    """
    Boolean fixture for comparing an array with ``Series(arr)``.
    """
    return request.param
@pytest.fixture(params=[True, False])
def use_numpy(request):
    """
    Boolean fixture to support comparison testing of ExtensionDtype array
    and numpy array.

    Parametrized over True and False.
    """
    return request.param
@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
    """
    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
    Series.fillna(method=<method>) testing.

    Returns the current parameter string unchanged.
    """
    return request.param
@pytest.fixture(params=[True, False])
def as_array(request):
    """
    Boolean fixture to support ExtensionDtype _from_sequence method testing.

    Parametrized over True and False.
    """
    return request.param

View File

@@ -0,0 +1,3 @@
from .array import DecimalArray, DecimalDtype, make_data, to_decimal
# Names re-exported from ``.array`` as this package's public API.
__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"]

View File

@@ -0,0 +1,189 @@
import decimal
import numbers
import random
import sys
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
import pandas as pd
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin
@register_extension_dtype
class DecimalDtype(ExtensionDtype):
    """Extension dtype whose scalar type is ``decimal.Decimal``.

    Each instance carries a ``decimal`` context, which is listed in
    ``_metadata``.
    """

    type = decimal.Decimal
    name = "decimal"
    na_value = decimal.Decimal("NaN")
    _metadata = ("context",)

    def __init__(self, context=None):
        # A falsy ``context`` is replaced by the current decimal context.
        if not context:
            context = decimal.getcontext()
        self.context = context

    def __repr__(self):
        template = "DecimalDtype(context={})"
        return template.format(self.context)

    @classmethod
    def construct_array_type(cls):
        """Return the array type associated with this dtype

        Returns
        -------
        type
        """
        return DecimalArray

    @classmethod
    def construct_from_string(cls, string):
        """Return a default instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything else.
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string))

    @property
    def _is_numeric(self):
        return True
class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin):
    """Extension array storing ``decimal.Decimal`` values in an object ndarray."""

    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False, context=None):
        """Validate ``values`` and store them as an object ndarray.

        Every element must be a ``decimal.Decimal``, otherwise ``TypeError``
        is raised.  ``dtype`` and ``copy`` are accepted but not used here;
        ``context`` is forwarded to ``DecimalDtype``.
        """
        for val in values:
            if not isinstance(val, decimal.Decimal):
                raise TypeError("All values must be of type " + str(decimal.Decimal))
        values = np.asarray(values, dtype=object)

        self._data = values
        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self.data = self._data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data
        self._dtype = DecimalDtype(context)

    @property
    def dtype(self):
        """The ``DecimalDtype`` instance created in ``__init__``."""
        return self._dtype

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from ``scalars``; ``dtype`` and ``copy`` are ignored."""
        return cls(scalars)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Convert each string with ``decimal.Decimal`` and defer to ``_from_sequence``."""
        return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy)

    @classmethod
    def _from_factorized(cls, values, original):
        """Construct from ``values``; ``original`` is not used."""
        return cls(values)

    # Input types (besides DecimalArray itself) that __array_ufunc__ accepts.
    _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        """Apply ``ufunc`` to the underlying object arrays and re-wrap the result."""
        # Decline (so other operands may handle the call) unless every input
        # is a handled type or a DecimalArray.
        if not all(
            isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs
        ):
            return NotImplemented

        # Unwrap DecimalArray inputs to their object ndarrays.
        inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs)
        result = getattr(ufunc, method)(*inputs, **kwargs)

        def reconstruct(x):
            # Scalars pass through; anything else is wrapped as a DecimalArray.
            if isinstance(x, (decimal.Decimal, numbers.Number)):
                return x
            else:
                return DecimalArray._from_sequence(x)

        if isinstance(result, tuple):
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def __getitem__(self, item):
        """Return a scalar for an integer ``item``, else a new array of the same type."""
        if isinstance(item, numbers.Integral):
            return self._data[item]
        else:
            return type(self)(self._data[item])

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements via ``pandas.api.extensions.take``.

        When ``allow_fill`` is true and no ``fill_value`` is given, the
        dtype's ``na_value`` is used as the fill value.
        """
        from pandas.api.extensions import take

        data = self._data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return self._from_sequence(result)

    def copy(self):
        """Return a new array built from a copy of the underlying ndarray."""
        return type(self)(self._data.copy())

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``.

        A ``dtype`` of this array's own dtype class yields a new array that
        uses ``dtype.context``; anything else goes through ``np.asarray``.
        ``copy`` is not consulted.
        """
        if isinstance(dtype, type(self.dtype)):
            return type(self)(self._data, context=dtype.context)
        return np.asarray(self, dtype=dtype)

    def __setitem__(self, key, value):
        """Convert ``value`` to Decimal(s) and assign into the underlying ndarray."""
        if pd.api.types.is_list_like(value):
            # A list-like value cannot be stored in a single scalar position.
            if pd.api.types.is_scalar(key):
                raise ValueError("setting an array element with a sequence.")
            value = [decimal.Decimal(v) for v in value]
        else:
            value = decimal.Decimal(value)
        self._data[key] = value

    def __len__(self):
        return len(self._data)

    @property
    def nbytes(self):
        """Estimated size: ``len(self)`` times ``sys.getsizeof`` of the first element."""
        n = len(self)
        if n:
            return n * sys.getsizeof(self[0])
        return 0

    def isna(self):
        """Return a boolean ndarray that is True where the element is a Decimal NaN."""
        return np.array([x.is_nan() for x in self._data], dtype=bool)

    @property
    def _na_value(self):
        return decimal.Decimal("NaN")

    def _formatter(self, boxed=False):
        """Return the callable used to format one element.

        ``boxed=True`` gives the ``"Decimal: {0}"`` formatter, otherwise ``repr``.
        """
        if boxed:
            return "Decimal: {0}".format
        return repr

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the underlying ndarrays of ``to_concat`` into one array."""
        return cls(np.concatenate([x._data for x in to_concat]))

    def _reduce(self, name, skipna=True, **kwargs):
        """Run the ndarray method called ``name`` along axis 0.

        Raises
        ------
        NotImplementedError
            If ``skipna`` is true, or the underlying ndarray has no
            attribute called ``name``.
        """

        if skipna:
            raise NotImplementedError("decimal does not support skipna=True")

        try:
            op = getattr(self.data, name)
        except AttributeError:
            raise NotImplementedError(
                "decimal does not support the {} operation".format(name)
            )
        return op(axis=0)
def to_decimal(values, context=None):
    """Convert each item of ``values`` to ``decimal.Decimal`` and wrap them in a DecimalArray.

    ``context`` is forwarded to the ``DecimalArray`` constructor.
    """
    converted = list(map(decimal.Decimal, values))
    return DecimalArray(converted, context=context)
def make_data():
    """Return a list of 100 Decimals built from ``random.random()`` floats."""
    samples = (random.random() for _ in range(100))
    return list(map(decimal.Decimal, samples))
# Install the arithmetic and comparison operator methods on DecimalArray.
# Both helpers are not defined in this file; presumably they are inherited
# from ExtensionScalarOpsMixin -- confirm against pandas.
DecimalArray._add_arithmetic_ops()
DecimalArray._add_comparison_ops()

View File

@@ -0,0 +1,439 @@
import decimal
import math
import operator
import numpy as np
import pytest
import pandas as pd
from pandas.tests.extension import base
import pandas.util.testing as tm
from .array import DecimalArray, DecimalDtype, make_data, to_decimal
@pytest.fixture
def dtype():
    """Return a ``DecimalDtype`` built with default arguments."""
    return DecimalDtype()
@pytest.fixture
def data():
    """Return a ``DecimalArray`` wrapping the values produced by ``make_data()``."""
    return DecimalArray(make_data())
@pytest.fixture
def data_for_twos():
    """Return a length-100 ``DecimalArray`` whose elements are all ``Decimal(2)``."""
    return DecimalArray([decimal.Decimal(2) for _ in range(100)])
@pytest.fixture
def data_missing():
    """Return a length-2 ``DecimalArray``: ``[Decimal("NaN"), Decimal(1)]``."""
    return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)])
@pytest.fixture
def data_for_sorting():
    """Return a ``DecimalArray`` holding the decimals ``1``, ``2``, ``0`` in that order."""
    return DecimalArray([decimal.Decimal(text) for text in ("1", "2", "0")])
@pytest.fixture
def data_missing_for_sorting():
    """Return a ``DecimalArray`` holding the decimals ``1``, ``NaN``, ``0`` in that order."""
    return DecimalArray([decimal.Decimal(text) for text in ("1", "NaN", "0")])
@pytest.fixture
def na_cmp():
    """Return a two-argument predicate that is truthy when both arguments are NaN."""

    def both_nan(x, y):
        return x.is_nan() and y.is_nan()

    return both_nan
@pytest.fixture
def na_value():
    """Return the missing-value scalar used by these tests: ``Decimal("NaN")``."""
    return decimal.Decimal("NaN")
@pytest.fixture
def data_for_grouping():
    """Return a ``DecimalArray`` laid out as ``[B, B, NA, NA, A, A, B, C]``.

    Here ``A = 0.0``, ``B = 1.0``, ``C = 2.0`` and ``NA = Decimal("NaN")``.
    """
    low, mid, high = (decimal.Decimal(text) for text in ("0.0", "1.0", "2.0"))
    missing = decimal.Decimal("NaN")
    return DecimalArray([mid, mid, missing, missing, low, low, mid, high])
class BaseDecimal:
    """Assertion helpers that treat ``Decimal("NaN")`` as a missing value."""

    def assert_series_equal(self, left, right, *args, **kwargs):
        """Assert equal NA positions, then equal non-NA values."""

        def is_nan_scalar(value):
            # need to convert array([Decimal(NaN)], dtype='object') to np.NaN
            # because Series[object].isnan doesn't recognize decimal(NaN) as
            # NA.
            try:
                return math.isnan(value)
            except TypeError:
                return False

        def na_mask(series):
            # Object-dtype data is checked element by element; every other
            # dtype relies on its own ``isna``.
            if series.dtype == "object":
                return series.apply(is_nan_scalar)
            return series.isna()

        left_na = na_mask(left)
        right_na = na_mask(right)

        tm.assert_series_equal(left_na, right_na)
        return tm.assert_series_equal(left[~left_na], right[~right_na], *args, **kwargs)

    def assert_frame_equal(self, left, right, *args, **kwargs):
        """Compare the column labels, then the columns one by one."""
        # TODO(EA): select_dtypes
        column_label = "{obj}.columns".format(obj=kwargs.get("obj", "DataFrame"))
        tm.assert_index_equal(
            left.columns,
            right.columns,
            exact=kwargs.get("check_column_type", "equiv"),
            check_names=kwargs.get("check_names", True),
            check_exact=kwargs.get("check_exact", False),
            check_categorical=kwargs.get("check_categorical", True),
            obj=column_label,
        )

        # NOTE(review): ``.index`` of the boolean result lists every column,
        # not only those whose dtype is "decimal" -- confirm this is intended.
        decimals = (left.dtypes == "decimal").index

        for column in decimals:
            self.assert_series_equal(left[column], right[column], *args, **kwargs)

        remaining_left = left.drop(columns=decimals)
        remaining_right = right.drop(columns=decimals)
        tm.assert_frame_equal(remaining_left, remaining_right, *args, **kwargs)
class TestDtype(BaseDecimal, base.BaseDtypeTests):
    """Dtype tests for the decimal extension, with ``test_hashable`` made a no-op."""

    def test_hashable(self, dtype):
        # Deliberately empty; presumably replaces a base-class test of the
        # same name -- confirm against base.BaseDtypeTests.
        pass
class TestInterface(BaseDecimal, base.BaseInterfaceTests):
    """Combines ``BaseDecimal`` with ``base.BaseInterfaceTests``; nothing is overridden."""

    pass
class TestConstructors(BaseDecimal, base.BaseConstructorsTests):
    """Constructor tests for the decimal extension; ``test_from_dtype`` is skipped."""

    @pytest.mark.skip(reason="not implemented constructor from dtype")
    def test_from_dtype(self, data):
        # construct from our dtype & string dtype
        pass
class TestReshaping(BaseDecimal, base.BaseReshapingTests):
    """Combines ``BaseDecimal`` with ``base.BaseReshapingTests``; nothing is overridden."""

    pass
class TestGetitem(BaseDecimal, base.BaseGetitemTests):
    """Getitem tests for the decimal extension plus one decimal-specific take test."""

    def test_take_na_value_other_decimal(self):
        """``take`` with ``allow_fill`` uses the supplied Decimal for the -1 position."""
        source = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
        filler = decimal.Decimal("-1.0")
        result = source.take([0, -1], allow_fill=True, fill_value=filler)
        expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")])
        self.assert_extension_array_equal(result, expected)
class TestMissing(BaseDecimal, base.BaseMissingTests):
    """Combines ``BaseDecimal`` with ``base.BaseMissingTests``; nothing is overridden."""

    pass
class Reduce:
    """Mixin supplying ``check_reduce`` for the decimal reduction tests."""

    def check_reduce(self, s, op_name, skipna):
        """Check one reduction.

        ``skipna=True`` and the median/skew/kurt operations must raise
        ``NotImplementedError``; every other case is compared with the
        same-named method of ``np.asarray(s)``.
        """
        unsupported = skipna or op_name in ["median", "skew", "kurt"]
        if unsupported:
            with pytest.raises(NotImplementedError):
                getattr(s, op_name)(skipna=skipna)
            return

        result = getattr(s, op_name)(skipna=skipna)
        expected = getattr(np.asarray(s), op_name)()
        tm.assert_almost_equal(result, expected)
class TestNumericReduce(Reduce, base.BaseNumericReduceTests):
    """Combines the ``Reduce`` mixin with ``base.BaseNumericReduceTests``."""

    pass
class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests):
    """Combines the ``Reduce`` mixin with ``base.BaseBooleanReduceTests``."""

    pass
class TestMethods(BaseDecimal, base.BaseMethodsTests):
    """Method tests for the decimal extension; ``test_value_counts`` is marked xfail."""

    @pytest.mark.parametrize("dropna", [True, False])
    @pytest.mark.xfail(reason="value_counts not implemented yet.")
    def test_value_counts(self, all_data, dropna):
        subset = all_data[:10]
        # With ``dropna`` the reference is built from the non-missing values only.
        reference = np.array(subset[~subset.isna()]) if dropna else subset

        result = pd.Series(subset).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(reference).value_counts(dropna=dropna).sort_index()

        tm.assert_series_equal(result, expected)
class TestCasting(BaseDecimal, base.BaseCastingTests):
    """Combines ``BaseDecimal`` with ``base.BaseCastingTests``; nothing is overridden."""

    pass
class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
    """Groupby tests for the decimal extension; the identity-apply test is xfail."""

    @pytest.mark.xfail(
        reason="needs to correctly define __eq__ to handle nans, xref #27081."
    )
    def test_groupby_apply_identity(self, data_for_grouping):
        # Runs the inherited test unchanged; only the xfail marker is added.
        super().test_groupby_apply_identity(data_for_grouping)
class TestSetitem(BaseDecimal, base.BaseSetitemTests):
    """Combines ``BaseDecimal`` with ``base.BaseSetitemTests``; nothing is overridden."""

    pass
class TestPrinting(BaseDecimal, base.BasePrintingTests):
    """Printing tests for the decimal extension."""

    def test_series_repr(self, data):
        # Overriding this base test to explicitly test that
        # the custom _formatter is used
        series = pd.Series(data)
        assert data.dtype.name in repr(series)
        # "Decimal: " is the prefix produced by DecimalArray._formatter(boxed=True).
        assert "Decimal: " in repr(series)
# TODO(extension)
@pytest.mark.xfail(
    reason=(
        "raising AssertionError as this is not implemented, though easy enough to do"
    )
)
def test_series_constructor_coerce_data_to_extension_dtype_raises():
    """Expect ``ValueError`` when plain ints are given with ``dtype=DecimalDtype()``.

    Marked xfail: per the marker's reason, this is not implemented yet.
    """
    xpr = (
        "Cannot cast data to extension dtype 'decimal'. Pass the "
        "extension array directly."
    )
    with pytest.raises(ValueError, match=xpr):
        pd.Series([0, 1, 2], dtype=DecimalDtype())
def test_series_constructor_with_dtype():
    """``Series(arr, dtype=...)`` keeps the extension dtype or casts to int64."""
    arr = DecimalArray([decimal.Decimal("10.0")])

    # The matching extension dtype gives the same Series as passing none.
    same_dtype = pd.Series(arr, dtype=DecimalDtype())
    tm.assert_series_equal(same_dtype, pd.Series(arr))

    # A NumPy dtype converts the stored decimals.
    as_int = pd.Series(arr, dtype="int64")
    tm.assert_series_equal(as_int, pd.Series([10]))
def test_dataframe_constructor_with_dtype():
    """``DataFrame({...}, dtype=...)`` keeps the extension dtype or casts to int64."""
    first = DecimalArray([decimal.Decimal("10.0")])

    same_dtype = pd.DataFrame({"A": first}, dtype=DecimalDtype())
    tm.assert_frame_equal(same_dtype, pd.DataFrame({"A": first}))

    # A fresh array is built for the integer cast, as in the original test.
    second = DecimalArray([decimal.Decimal("10.0")])
    as_int = pd.DataFrame({"A": second}, dtype="int64")
    tm.assert_frame_equal(as_int, pd.DataFrame({"A": [10]}))
@pytest.mark.parametrize("frame", [True, False])
def test_astype_dispatches(frame):
    # This is a dtype-specific test that ensures Series[decimal].astype
    # gets all the way through to ExtensionArray.astype
    # Designing a reliable smoke test that works for arbitrary data types
    # is difficult.
    obj = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a")
    ctx = decimal.Context()
    ctx.prec = 5

    if frame:
        obj = obj.to_frame()

    converted = obj.astype(DecimalDtype(ctx))
    result = converted["a"] if frame else converted

    # The context given to astype must end up on the resulting dtype.
    assert result.dtype.context.prec == ctx.prec
class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests for the decimal extension array."""

    def check_opname(self, s, op_name, other, exc=None):
        # The incoming ``exc`` is ignored on purpose: ``exc=None`` is always
        # forwarded to the base implementation.
        super().check_opname(s, op_name, other, exc=None)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Check each operator against a Series, a scaled Series and two scalars.

        The DivisionByZero and InvalidOperation traps of the current decimal
        context are switched off for the duration of the checks and restored
        afterwards, including when a check fails.
        """
        op_name = all_arithmetic_operators
        s = pd.Series(data)

        context = decimal.getcontext()
        divbyzerotrap = context.traps[decimal.DivisionByZero]
        invalidoptrap = context.traps[decimal.InvalidOperation]
        context.traps[decimal.DivisionByZero] = 0
        context.traps[decimal.InvalidOperation] = 0

        try:
            # Decimal supports ops with int, but not float
            other = pd.Series([int(d * 100) for d in data])
            self.check_opname(s, op_name, other)

            if "mod" not in op_name:
                self.check_opname(s, op_name, s * 2)

            self.check_opname(s, op_name, 0)
            self.check_opname(s, op_name, 5)
        finally:
            # The context is shared state: always put the traps back so a
            # failing check cannot change the behaviour of later tests.
            context.traps[decimal.DivisionByZero] = divbyzerotrap
            context.traps[decimal.InvalidOperation] = invalidoptrap

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        # We implement divmod
        super()._check_divmod_op(s, op, other, exc=None)

    def test_error(self):
        # Deliberately empty.
        pass
class TestComparisonOps(BaseDecimal, base.BaseComparisonOpsTests):
    """Comparison-operator tests for the decimal extension array."""

    def check_opname(self, s, op_name, other, exc=None):
        # ``exc=None`` is always forwarded, whatever the caller passed.
        super().check_opname(s, op_name, other, exc=None)

    def _compare_other(self, s, data, op_name, other):
        # ``data`` is accepted but not used.
        self.check_opname(s, op_name, other)

    def test_compare_scalar(self, data, all_compare_operators):
        series = pd.Series(data)
        self._compare_other(series, data, all_compare_operators, 0.5)

    def test_compare_array(self, data, all_compare_operators):
        series = pd.Series(data)

        exponents = np.random.choice([-1, 0, 1], len(data))
        # Randomly double, halve or keep same value
        factors = [decimal.Decimal(pow(2.0, i)) for i in exponents]
        other = pd.Series(data) * factors
        self._compare_other(series, data, all_compare_operators, other)
class DecimalArrayWithoutFromSequence(DecimalArray):
    """Helper class for testing error handling in _from_sequence."""

    # Declared as a classmethod so it matches the signature of
    # DecimalArray._from_sequence, which it overrides; calls made through an
    # instance still reach this method and raise as before.
    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        raise KeyError("For the test")
class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence):
    """Variant whose arithmetic methods are created with ``coerce_to_dtype=False``."""

    @classmethod
    def _create_arithmetic_method(cls, op):
        return cls._create_method(op, coerce_to_dtype=False)


# Install the arithmetic methods on the subclass; presumably this goes through
# the ``_create_arithmetic_method`` override above -- confirm against pandas.
DecimalArrayWithoutCoercion._add_arithmetic_ops()
def test_combine_from_sequence_raises():
    """``Series.combine`` yields object dtype when ``_from_sequence`` raises."""
    # https://github.com/pandas-dev/pandas/issues/22850
    values = [decimal.Decimal("1.0"), decimal.Decimal("2.0")]
    ser = pd.Series(DecimalArrayWithoutFromSequence(values))

    result = ser.combine(ser, operator.add)

    # note: object dtype
    doubled = [decimal.Decimal("2.0"), decimal.Decimal("4.0")]
    expected = pd.Series(doubled, dtype="object")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion]
)
def test_scalar_ops_from_sequence_raises(class_):
    # op(EA, EA) should return an EA, or an ndarray if it's not possible
    # to return an EA with the return values.
    operand = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
    total = operand + operand
    doubled = [decimal.Decimal("2.0"), decimal.Decimal("4.0")]
    expected = np.array(doubled, dtype="object")
    tm.assert_numpy_array_equal(total, expected)
@pytest.mark.parametrize(
    "reverse, expected_div, expected_mod",
    [(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])],
)
def test_divmod_array(reverse, expected_div, expected_mod):
    """``divmod`` with a DecimalArray on either side returns two DecimalArrays."""
    # https://github.com/pandas-dev/pandas/issues/22930
    arr = to_decimal([1, 2, 3, 4])
    quotient, remainder = divmod(2, arr) if reverse else divmod(arr, 2)

    wanted_quotient = to_decimal(expected_div)
    wanted_remainder = to_decimal(expected_mod)

    tm.assert_extension_array_equal(quotient, wanted_quotient)
    tm.assert_extension_array_equal(remainder, wanted_remainder)
def test_ufunc_fallback(data):
    """``np.abs`` on a Series matches a Series built from ``np.abs`` of the array."""
    head = data[:5]
    series = pd.Series(head, index=range(3, 8))
    result = np.abs(series)
    expected = pd.Series(np.abs(head), index=range(3, 8))
    tm.assert_series_equal(result, expected)
def test_formatting_values_deprecated():
    """``repr`` of a Series whose array defines ``_formatting_values`` must warn."""

    class DecimalArray2(DecimalArray):
        def _formatting_values(self):
            return np.array(self)

    series = pd.Series(DecimalArray2([decimal.Decimal("1.0")]))

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        repr(series)
def test_array_ufunc():
    """``np.exp`` on a DecimalArray equals ``np.exp`` on its underlying data, re-wrapped."""
    arr = to_decimal([1, 2, 3])
    actual = np.exp(arr)
    wanted = to_decimal(np.exp(arr._data))
    tm.assert_extension_array_equal(actual, wanted)
def test_array_ufunc_series():
    """``np.exp`` on a Series of decimals returns a Series of the transformed array."""
    arr = to_decimal([1, 2, 3])
    series = pd.Series(arr)
    actual = np.exp(series)
    wanted = pd.Series(to_decimal(np.exp(arr._data)))
    tm.assert_series_equal(actual, wanted)
def test_array_ufunc_series_scalar_other():
    """``np.add`` with a Decimal scalar gives the same result via Series or array."""
    # check _HANDLED_TYPES
    arr = to_decimal([1, 2, 3])
    series = pd.Series(arr)
    step = decimal.Decimal(1)
    actual = np.add(series, step)
    wanted = pd.Series(np.add(arr, decimal.Decimal(1)))
    tm.assert_series_equal(actual, wanted)
def test_array_ufunc_series_defer():
    """``np.add`` of a Series and its array returns a Series in either operand order."""
    arr = to_decimal([1, 2, 3])
    series = pd.Series(arr)

    wanted = pd.Series(to_decimal([2, 4, 6]))
    series_first = np.add(series, arr)
    array_first = np.add(arr, series)

    tm.assert_series_equal(series_first, wanted)
    tm.assert_series_equal(array_first, wanted)

View File

@@ -0,0 +1,3 @@
from .array import JSONArray, JSONDtype, make_data
# Names re-exported from ``.array`` as this package's public API.
__all__ = ["JSONArray", "JSONDtype", "make_data"]

View File

@@ -0,0 +1,198 @@
"""Test extension array for storing nested data in a pandas container.
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.
Note:
We currently store lists of UserDicts. Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
from collections import UserDict, abc
import itertools
import numbers
import random
import string
import sys
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.arrays import ExtensionArray
class JSONDtype(ExtensionDtype):
    """Extension dtype whose scalars are mappings; an empty ``UserDict`` is the NA value."""

    type = abc.Mapping
    name = "json"
    na_value = UserDict()

    @classmethod
    def construct_array_type(cls):
        """Return the array type associated with this dtype

        Returns
        -------
        type
        """
        return JSONArray

    @classmethod
    def construct_from_string(cls, string):
        """Return a default instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything else.
        """
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string))
class JSONArray(ExtensionArray):
    """Extension array that keeps mapping objects in a plain Python sequence."""

    dtype = JSONDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        """Validate that every element is an instance of ``self.dtype.type``.

        ``values`` is stored as given (it is not copied); ``dtype`` and
        ``copy`` are accepted but not used here.  Raises ``TypeError`` on
        the first element of the wrong type.
        """
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct from ``scalars``; ``dtype`` and ``copy`` are ignored."""
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild from factorized values, dropping entries equal to ``()``."""
        return cls([UserDict(x) for x in values if x != ()])

    def __getitem__(self, item):
        """Index by integer, boolean ndarray mask, iterable of positions, or slice."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, np.ndarray) and item.dtype == "bool":
            return self._from_sequence([x for x, m in zip(self, item) if m])
        elif isinstance(item, abc.Iterable):
            # fancy indexing
            return type(self)([self.data[i] for i in item])
        else:
            # slice
            return type(self)(self.data[item])

    def __setitem__(self, key, value):
        """Assign ``value`` at an integer, a boolean ndarray mask, or an iterable of positions."""
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self), abc.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == "bool":
                # masking
                # ``zip`` advances ``value`` for every mask position, not only
                # the selected ones.
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v

    def __len__(self):
        return len(self.data)

    @property
    def nbytes(self):
        """``sys.getsizeof`` of the underlying container."""
        return sys.getsizeof(self.data)

    def isna(self):
        """Return a boolean ndarray that is True where an element equals the dtype's NA value."""
        return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """Take elements by position.

        With ``allow_fill``, the index ``-1`` selects ``fill_value`` (the
        dtype's NA value by default) and any index below ``-1`` raises
        ``ValueError``.  Out-of-bounds positions raise ``IndexError``.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError
            try:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError:
                raise IndexError(msg)
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError:
                raise IndexError(msg)

        return self._from_sequence(output)

    def copy(self):
        """Return a new array over a shallow copy (``[:]``) of the data."""
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``; an equal JSON dtype returns ``self`` or a copy of it."""
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.

        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

    def unique(self):
        """Return the distinct elements, de-duplicated via tuples of their items."""
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)(
            [dict(x) for x in list({tuple(d.items()) for d in self.data})]
        )

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Chain the ``data`` of every array in ``to_concat`` into one new array."""
        data = list(itertools.chain.from_iterable([x.data for x in to_concat]))
        return cls(data)

    def _values_for_factorize(self):
        """Return the argsort values together with ``()`` as the NA sentinel."""
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # _factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()

    def _values_for_argsort(self):
        """Return an object ndarray holding ``tuple(x.items())`` for each element."""
        # Disable NumPy's shape inference by including an empty tuple...
        # If all the elements of self are the same size P, NumPy will
        # cast them to an (N, P) array, instead of an (N,) array of tuples.
        frozen = [()] + [tuple(x.items()) for x in self]
        return np.array(frozen, dtype=object)[1:]
def make_data():
    """Return 100 ``UserDict`` objects with 0-10 random letter -> int (0-100) pairs each."""
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer

    def random_record():
        # Draw the size first, then one (key, value) pair per entry.
        size = random.randint(0, 10)
        pairs = []
        for _ in range(size):
            key = random.choice(string.ascii_letters)
            value = random.randint(0, 100)
            pairs.append((key, value))
        return UserDict(pairs)

    return [random_record() for _ in range(100)]

View File

@@ -0,0 +1,312 @@
import collections
import operator
import pytest
from pandas.compat import PY36
import pandas as pd
from pandas.tests.extension import base
import pandas.util.testing as tm
from .array import JSONArray, JSONDtype, make_data
@pytest.fixture
def dtype():
    """Return a ``JSONDtype`` instance."""
    return JSONDtype()
@pytest.fixture
def data():
    """JSONArray built from ``make_data()`` for semantics test.

    The data is regenerated until its first two elements differ in length.
    """
    data = make_data()

    # Why the while loop? NumPy is unable to construct an ndarray from
    # equal-length ndarrays. Many of our operations involve coercing the
    # EA to an ndarray of objects. To avoid random test failures, we ensure
    # that our data is coercible to an ndarray. Several tests deal with only
    # the first two elements, so that's what we'll check.

    while len(data[0]) == len(data[1]):
        data = make_data()

    return JSONArray(data)
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]; the empty dict is the NA entry."""
    return JSONArray([{}, {"a": 10}])
@pytest.fixture
def data_for_sorting():
    """Return a length-3 ``JSONArray`` of non-empty dicts used by the sorting tests."""
    return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}])
@pytest.fixture
def data_missing_for_sorting():
    """Return a length-3 ``JSONArray`` whose middle element is the empty dict."""
    return JSONArray([{"b": 1}, {}, {"a": 4}])
@pytest.fixture
def na_value(dtype):
    """Return the ``na_value`` attribute of the ``dtype`` fixture."""
    return dtype.na_value
@pytest.fixture
def na_cmp():
    """Return ``operator.eq``: NA values are compared by equality."""
    return operator.eq
@pytest.fixture
def data_for_grouping():
    """Return a length-8 ``JSONArray`` with repeated dicts and two empty dicts."""
    return JSONArray(
        [
            {"b": 1},
            {"b": 1},
            {},
            {},
            {"a": 0, "c": 2},
            {"a": 0, "c": 2},
            {"b": 1},
            {"c": 2},
        ]
    )
class BaseJSON:
    """Assertion helpers used by the JSON extension tests."""

    # NumPy doesn't handle an array of equal-length UserDicts.
    # The default assert_series_equal eventually does a
    # Series.values, which raises. We work around it by
    # converting the UserDicts to dicts.
    def assert_series_equal(self, left, right, *args, **kwargs):
        """Compare two Series, rebuilding JSON-dtype ones from object-cast values.

        ``*args`` is accepted and forwarded because ``assert_frame_equal``
        passes its own positional arguments on to this method.
        """
        if left.dtype.name == "json":
            assert left.dtype == right.dtype
            left = pd.Series(
                JSONArray(left.values.astype(object)), index=left.index, name=left.name
            )
            right = pd.Series(
                JSONArray(right.values.astype(object)),
                index=right.index,
                name=right.name,
            )
        tm.assert_series_equal(left, right, *args, **kwargs)

    def assert_frame_equal(self, left, right, *args, **kwargs):
        """Compare the column labels, then the columns one by one."""
        tm.assert_index_equal(
            left.columns,
            right.columns,
            exact=kwargs.get("check_column_type", "equiv"),
            check_names=kwargs.get("check_names", True),
            check_exact=kwargs.get("check_exact", False),
            check_categorical=kwargs.get("check_categorical", True),
            obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")),
        )

        # NOTE(review): ``.index`` of the boolean result lists every column,
        # not only those whose dtype is "json" -- confirm this is intended.
        jsons = (left.dtypes == "json").index

        for col in jsons:
            self.assert_series_equal(left[col], right[col], *args, **kwargs)

        left = left.drop(columns=jsons)
        right = right.drop(columns=jsons)
        tm.assert_frame_equal(left, right, *args, **kwargs)
class TestDtype(BaseJSON, base.BaseDtypeTests):
    """Combines ``BaseJSON`` with ``base.BaseDtypeTests``; nothing is overridden."""

    pass
class TestInterface(BaseJSON, base.BaseInterfaceTests):
    """Interface tests for the JSON extension plus a check of the custom asserts."""

    def test_custom_asserts(self):
        # This would always trigger the KeyError from trying to put
        # an array of equal-length UserDicts inside an ndarray.
        records = [
            collections.UserDict({key: number})
            for key, number in (("a", 1), ("b", 2), ("c", 3))
        ]
        data = JSONArray(records)
        same = pd.Series(data)
        self.assert_series_equal(same, same)
        self.assert_frame_equal(same.to_frame(), same.to_frame())

        # A Series with different contents must fail both helpers.
        different = pd.Series(data.take([0, 0, 1]))
        with pytest.raises(AssertionError):
            self.assert_series_equal(same, different)

        with pytest.raises(AssertionError):
            self.assert_frame_equal(same.to_frame(), different.to_frame())
class TestConstructors(BaseJSON, base.BaseConstructorsTests):
    """Constructor tests for the JSON extension; ``test_from_dtype`` is skipped."""

    @pytest.mark.skip(reason="not implemented constructor from dtype")
    def test_from_dtype(self, data):
        # construct from our dtype & string dtype
        pass
class TestReshaping(BaseJSON, base.BaseReshapingTests):
    """Reshaping tests for the JSON extension; stack is skipped, unstack is xfail."""

    @pytest.mark.skip(reason="Different definitions of NA")
    def test_stack(self):
        """
        The test does .astype(object).stack(). If we happen to have
        any missing values in `data`, then we'll end up with different
        rows since we consider `{}` NA, but `.astype(object)` doesn't.
        """

    @pytest.mark.xfail(reason="dict for NA")
    def test_unstack(self, data, index):
        # The base test has NaN for the expected NA value.
        # this matches otherwise
        return super().test_unstack(data, index)
class TestGetitem(BaseJSON, base.BaseGetitemTests):
    """Combines ``BaseJSON`` with ``base.BaseGetitemTests``; nothing is overridden."""

    pass
class TestMissing(BaseJSON, base.BaseMissingTests):
    """Missing-data tests for the JSON extension; both fillna tests are skipped."""

    @pytest.mark.skip(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""

    @pytest.mark.skip(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
# Reusable markers applied as decorators to the test overrides below.
# ``unhashable`` skips unconditionally.
unhashable = pytest.mark.skip(reason="Unhashable")
# ``unstable`` skips only when PY36 is false.
unstable = pytest.mark.skipif(
    not PY36, reason="Dictionary order unstable"  # 3.6 or higher
)
class TestReduce(base.BaseNoReduceTests):
    """Subclass of ``base.BaseNoReduceTests`` with nothing overridden."""

    pass
class TestMethods(BaseJSON, base.BaseMethodsTests):
    """Method tests for the JSON extension.

    Most overrides only attach a skip/skipif marker and then either do
    nothing or delegate to the inherited test.
    """

    @unhashable
    def test_value_counts(self, all_data, dropna):
        pass

    @unhashable
    def test_sort_values_frame(self):
        # TODO (EA.factorize): see if _values_for_factorize allows this.
        pass

    @unstable
    def test_argsort(self, data_for_sorting):
        super().test_argsort(data_for_sorting)

    @unstable
    def test_argsort_missing(self, data_missing_for_sorting):
        super().test_argsort_missing(data_missing_for_sorting)

    @unstable
    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending):
        super().test_sort_values(data_for_sorting, ascending)

    @unstable
    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(self, data_missing_for_sorting, ascending):
        super().test_sort_values_missing(data_missing_for_sorting, ascending)

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_le(self, data_repeated):
        pass

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_add(self, data_repeated):
        pass

    @pytest.mark.skip(reason="combine for JSONArray not supported")
    def test_combine_first(self, data):
        pass

    @unhashable
    def test_hash_pandas_object_works(self, data, kind):
        super().test_hash_pandas_object_works(data, kind)

    @pytest.mark.skip(reason="broadcasting error")
    def test_where_series(self, data, na_value):
        # Fails with
        # *** ValueError: operands could not be broadcast together
        # with shapes (4,) (4,) (0,)
        super().test_where_series(data, na_value)

    @pytest.mark.skip(reason="Can't compare dicts.")
    def test_searchsorted(self, data_for_sorting):
        super().test_searchsorted(data_for_sorting)
class TestCasting(BaseJSON, base.BaseCastingTests):
    """Casting tests for the JSON extension; ``test_astype_str`` is skipped."""

    @pytest.mark.skip(reason="failing on np.array(self, dtype=str)")
    def test_astype_str(self):
        """This currently fails in NumPy on np.array(self, dtype=str) with

        *** ValueError: setting an array element with a sequence
        """
# We intentionally don't run base.BaseSetitemTests because pandas'
# internals has trouble setting sequences of values into scalar positions.
class TestGroupby(BaseJSON, base.BaseGroupbyTests):
    """Groupby tests for the JSON extension; transform and apply are skipped."""

    @unhashable
    def test_groupby_extension_transform(self):
        """
        This currently fails in Series.name.setter, since the
        name must be hashable, but the value is a dictionary.
        I think this is what we want, i.e. `.name` should be the original
        values, and not the values for factorization.
        """

    @unhashable
    def test_groupby_extension_apply(self):
        """
        This fails in Index._do_unique_check with

        >   hash(val)
        E   TypeError: unhashable type: 'UserDict' with

        I suspect that once we support Index[ExtensionArray],
        we'll be able to dispatch unique.
        """

    @unstable
    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        super().test_groupby_extension_agg(as_index, data_for_grouping)
class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests):
    """Arithmetic tests for the JSON extension."""

    def test_error(self, data, all_arithmetic_operators):
        # Deliberately empty.
        pass

    def test_add_series_with_extension_array(self, data):
        """Adding a JSON Series and its array must raise ``TypeError`` ("unsupported")."""
        ser = pd.Series(data)
        with pytest.raises(TypeError, match="unsupported"):
            ser + data

    def test_divmod_series_array(self):
        # GH 23287
        # skipping because it is not implemented
        pass

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        # The incoming ``exc`` is ignored; ``TypeError`` is always forwarded.
        return super()._check_divmod_op(s, op, other, exc=TypeError)
class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests):
    """Combines ``BaseJSON`` with ``base.BaseComparisonOpsTests``; nothing is overridden."""

    pass
class TestPrinting(BaseJSON, base.BasePrintingTests):
    """Combines ``BaseJSON`` with ``base.BasePrintingTests``; nothing is overridden."""

    pass

View File

@@ -0,0 +1,245 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import string
import numpy as np
import pytest
import pandas as pd
from pandas import Categorical
from pandas.api.types import CategoricalDtype
from pandas.tests.extension import base
import pandas.util.testing as tm
def make_data():
    """Return 100 random ASCII letters whose first two entries differ.

    The draw is repeated until that condition holds.
    """
    while True:
        letters = np.random.choice(list(string.ascii_letters), size=100)
        # ensure we meet the requirements
        # 1. first two not null
        # 2. first and second are different
        if letters[0] != letters[1]:
            return letters
@pytest.fixture
def dtype():
    """Return a ``CategoricalDtype`` built with default arguments."""
    return CategoricalDtype()
@pytest.fixture
def data():
    """Length-100 array for this type.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal
    """
    return Categorical(make_data())
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]: ``[np.nan, "A"]``."""
    return Categorical([np.nan, "A"])
@pytest.fixture
def data_for_sorting():
    """Return an ordered Categorical ``["A", "B", "C"]`` with categories ``["C", "A", "B"]``."""
    return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True)
@pytest.fixture
def data_missing_for_sorting():
    """Return an ordered Categorical ``["A", None, "B"]`` with categories ``["B", "A"]``."""
    return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True)
@pytest.fixture
def na_value():
    """Return the missing-value scalar used by these tests: ``np.nan``."""
    return np.nan
@pytest.fixture
def data_for_grouping():
    """Return a length-8 Categorical: ``["a", "a", None, None, "b", "b", "a", "c"]``."""
    return Categorical(["a", "a", None, None, "b", "b", "a", "c"])
class TestDtype(base.BaseDtypeTests):
    """Subclass of ``base.BaseDtypeTests`` with nothing overridden."""

    pass
class TestInterface(base.BaseInterfaceTests):
    """Interface tests for Categorical; ``test_memory_usage`` is skipped."""

    @pytest.mark.skip(reason="Memory usage doesn't match")
    def test_memory_usage(self, data):
        # Is this deliberate?
        super().test_memory_usage(data)
class TestConstructors(base.BaseConstructorsTests):
    """Run the inherited constructor tests unchanged."""
class TestReshaping(base.BaseReshapingTests):
    """Inherited reshaping tests, with ``test_ravel`` replaced."""

    def test_ravel(self, data):
        # GH#27199 Categorical.ravel returns self until after deprecation cycle
        # Only the FutureWarning is checked here; the return value is ignored.
        with tm.assert_produces_warning(FutureWarning):
            data.ravel()
class TestGetitem(base.BaseGetitemTests):
    """Inherited getitem/take tests; most take-related tests are skipped."""

    # Shared skip mark for the take tests affected by GH-20664.
    skip_take = pytest.mark.skip(reason="GH-20664.")

    @pytest.mark.skip(reason="Backwards compatibility")
    def test_getitem_scalar(self, data):
        # CategoricalDtype.type isn't "correct" since it should
        # be a parent of the elements (object). But don't want
        # to break things by changing.
        super().test_getitem_scalar(data)

    @skip_take
    def test_take(self, data, na_value, na_cmp):
        # TODO remove this once Categorical.take is fixed
        super().test_take(data, na_value, na_cmp)

    @skip_take
    def test_take_negative(self, data):
        super().test_take_negative(data)

    @skip_take
    def test_take_pandas_style_negative_raises(self, data, na_value):
        super().test_take_pandas_style_negative_raises(data, na_value)

    @skip_take
    def test_take_non_na_fill_value(self, data_missing):
        super().test_take_non_na_fill_value(data_missing)

    @skip_take
    def test_take_out_of_bounds_raises(self, data, allow_fill):
        return super().test_take_out_of_bounds_raises(data, allow_fill)

    @pytest.mark.skip(reason="GH-20747. Unobserved categories.")
    def test_take_series(self, data):
        super().test_take_series(data)

    @skip_take
    def test_reindex_non_na_fill_value(self, data_missing):
        super().test_reindex_non_na_fill_value(data_missing)

    @pytest.mark.skip(reason="Categorical.take buggy")
    def test_take_empty(self, data, na_value, na_cmp):
        super().test_take_empty(data, na_value, na_cmp)

    @pytest.mark.skip(reason="test not written correctly for categorical")
    def test_reindex(self, data, na_value):
        super().test_reindex(data, na_value)
class TestSetitem(base.BaseSetitemTests):
    """Run the inherited setitem tests unchanged."""
class TestMissing(base.BaseMissingTests):
    """Inherited missing-data tests; the limited-fill tests are skipped."""

    @pytest.mark.skip(reason="Not implemented")
    def test_fillna_limit_pad(self, data_missing):
        super().test_fillna_limit_pad(data_missing)

    @pytest.mark.skip(reason="Not implemented")
    def test_fillna_limit_backfill(self, data_missing):
        super().test_fillna_limit_backfill(data_missing)
class TestReduce(base.BaseNoReduceTests):
    """Run the inherited ``BaseNoReduceTests`` suite unchanged."""
class TestMethods(base.BaseMethodsTests):
    """Inherited method tests with Categorical-specific overrides and skips."""

    @pytest.mark.skip(reason="Unobserved categories included")
    def test_value_counts(self, all_data, dropna):
        return super().test_value_counts(all_data, dropna)

    def test_combine_add(self, data_repeated):
        # GH 20825
        # When adding categoricals in combine, result is a string
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 + x2)
        expected = pd.Series(
            ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))])
        )
        self.assert_series_equal(result, expected)

        # Same check with a scalar as the second operand.
        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 + x2)
        expected = pd.Series([a + val for a in list(orig_data1)])
        self.assert_series_equal(result, expected)

    @pytest.mark.skip(reason="Not Applicable")
    def test_fillna_length_mismatch(self, data_missing):
        super().test_fillna_length_mismatch(data_missing)

    def test_searchsorted(self, data_for_sorting):
        # NOTE(review): this override never calls the inherited
        # test_searchsorted, so with ordered data it passes without
        # checking anything -- confirm that is intended.
        if not data_for_sorting.ordered:
            # pytest.skip raises by itself, and the message is passed
            # positionally so it works whether the parameter is named
            # ``msg`` or ``reason`` in the installed pytest.
            pytest.skip("searchsorted requires ordered data.")
class TestCasting(base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Inherited arithmetic tests, adjusted for Categorical's TypeErrors."""

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        op_name = all_arithmetic_operators
        if op_name == "__rmod__":
            pytest.skip("rmod never called when string is first argument")
        super().test_arith_series_with_scalar(data, op_name)

    def test_add_series_with_extension_array(self, data):
        as_series = pd.Series(data)
        with pytest.raises(TypeError, match="cannot perform"):
            as_series + data

    def test_divmod_series_array(self):
        # GH 23287
        # skipping because it is not implemented
        pass

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        # The incoming ``exc`` is ignored; TypeError is always expected.
        return super()._check_divmod_op(s, op, other, exc=TypeError)
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison tests: ``==``/``!=`` compare elementwise, the rest raise."""

    def _compare_other(self, s, data, op_name, other):
        op = self.get_op_from_name(op_name)
        # Scalar comparisons used to build the expected result.
        elementwise = {
            "__eq__": lambda x, y: x == y,
            "__ne__": lambda x, y: x != y,
        }
        if op_name not in elementwise:
            # Every other comparison must raise on the raw array.
            with pytest.raises(TypeError):
                op(data, other)
            return

        result = op(s, other)
        expected = s.combine(other, elementwise[op_name])
        assert (result == expected).all()
class TestParsing(base.BaseParsingTests):
    """Run the inherited parsing tests unchanged."""

View File

@@ -0,0 +1,81 @@
import numpy as np
import pytest
from pandas.core.dtypes import dtypes
from pandas.core.dtypes.common import is_extension_array_dtype
import pandas as pd
from pandas.core.arrays import ExtensionArray
import pandas.util.testing as tm
class DummyDtype(dtypes.ExtensionDtype):
    """Bare ExtensionDtype subclass used as the dtype of ``DummyArray``."""
class DummyArray(ExtensionArray):
    """Minimal ExtensionArray that wraps ``data`` and reports ``DummyDtype``."""

    def __init__(self, data):
        self.data = data

    def __array__(self, dtype=None):
        # ``dtype`` is optional so that conversions requesting no dtype
        # (e.g. ``np.asarray(arr)``) work instead of raising TypeError.
        # The wrapped data is returned as-is whatever dtype is requested.
        return self.data

    @property
    def dtype(self):
        return DummyDtype()

    def astype(self, dtype, copy=True):
        # we don't support anything but a single dtype
        if isinstance(dtype, DummyDtype):
            if copy:
                # New wrapper object around the same ``data``.
                return type(self)(self.data)
            return self

        return np.array(self, dtype=dtype, copy=copy)
class TestExtensionArrayDtype:
    """Checks of ``is_extension_array_dtype`` on several kinds of input."""

    @pytest.mark.parametrize(
        "values",
        [
            pd.Categorical([]),
            pd.Categorical([]).dtype,
            pd.Series(pd.Categorical([])),
            DummyDtype(),
            DummyArray(np.array([1, 2])),
        ],
    )
    def test_is_extension_array_dtype(self, values):
        # Arrays, dtypes and Series backed by an extension type are accepted.
        assert is_extension_array_dtype(values)

    @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))])
    def test_is_not_extension_array_dtype(self, values):
        # Plain ndarrays and ndarray-backed Series are rejected.
        assert not is_extension_array_dtype(values)
def test_astype():
    """``astype`` to object gives an object ndarray for both spellings."""
    arr = DummyArray(np.array([1, 2, 3]))
    expected = np.array([1, 2, 3], dtype=object)

    for target in (object, "object"):
        result = arr.astype(target)
        tm.assert_numpy_array_equal(result, expected)
def test_astype_no_copy():
    """``astype`` to the array's own dtype returns self only if copy=False."""
    arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))

    not_copied = arr.astype(arr.dtype, copy=False)
    assert arr is not_copied

    copied = arr.astype(arr.dtype)
    assert arr is not copied
@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()])
def test_is_extension_array_dtype(dtype):
    """Each parametrized dtype is an ExtensionDtype and is detected as one."""
    assert isinstance(dtype, dtypes.ExtensionDtype)
    assert is_extension_array_dtype(dtype)

View File

@@ -0,0 +1,230 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas.core.arrays import DatetimeArray
from pandas.tests.extension import base
@pytest.fixture(params=["US/Central"])
def dtype(request):
    """Fixture: nanosecond DatetimeTZDtype in the parametrized time zone."""
    timezone = request.param
    return DatetimeTZDtype(unit="ns", tz=timezone)
@pytest.fixture
def data(dtype):
    """Fixture: DatetimeArray of 100 periods starting at "2000" in ``dtype.tz``."""
    stamps = pd.date_range("2000", periods=100, tz=dtype.tz)
    return DatetimeArray(stamps, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """Fixture: length-2 DatetimeArray holding NaT followed by 2000-01-01."""
    raw = np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
@pytest.fixture
def data_for_sorting(dtype):
    """Fixture: three consecutive days in the order [second, third, first]."""
    first = pd.Timestamp("2000-01-01")
    second = pd.Timestamp("2000-01-02")
    third = pd.Timestamp("2000-01-03")
    raw = np.array([second, third, first], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Fixture: [later day, NaT, earlier day] as a DatetimeArray."""
    earlier = pd.Timestamp("2000-01-01")
    later = pd.Timestamp("2000-01-02")
    raw = np.array([later, "NaT", earlier], dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
@pytest.fixture
def data_for_grouping(dtype):
    """
    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing
    """
    day_a = pd.Timestamp("2000-01-01")
    day_b = pd.Timestamp("2000-01-02")
    day_c = pd.Timestamp("2000-01-03")
    missing = "NaT"
    pattern = [day_b, day_b, missing, missing, day_a, day_a, day_b, day_c]
    raw = np.array(pattern, dtype="datetime64[ns]")
    return DatetimeArray(raw, dtype=dtype)
@pytest.fixture
def na_cmp():
    """Fixture: comparator that is true only when both arguments are ``pd.NaT``."""
    return lambda a, b: a is pd.NaT and a is b
@pytest.fixture
def na_value():
    """Fixture: the missing-value scalar for these tests (``pd.NaT``)."""
    return pd.NaT
# ----------------------------------------------------------------------------
class BaseDatetimeTests:
    """Empty mixin listed first in the bases of the test classes below."""
# ----------------------------------------------------------------------------
# Tests
class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests):
    """Run the inherited dtype tests unchanged."""
class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests):
    """Run the inherited constructor tests unchanged."""
class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests):
    """Run the inherited getitem tests unchanged."""
class TestMethods(BaseDatetimeTests, base.BaseMethodsTests):
    """Inherited method tests; value_counts is skipped, combine_add is a no-op."""

    @pytest.mark.skip(reason="Incorrect expected")
    def test_value_counts(self, all_data, dropna):
        pass

    def test_combine_add(self, data_repeated):
        # Timestamp.__add__(Timestamp) not defined
        pass
class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
    """Inherited interface tests; array-interface is skipped for tz-aware data."""

    def test_array_interface(self, data):
        if data.tz:
            # np.asarray(DTA) is currently always tz-naive.
            pytest.skip("GH-23569")
        super().test_array_interface(data)
class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests):
    """Arithmetic tests: only subtraction is expected to succeed."""

    # Operator names checked with ``exc=None`` (i.e. expected to work).
    implements = {"__sub__", "__rsub__"}

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        if all_arithmetic_operators in self.implements:
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_add_series_with_extension_array(self, data):
        # Datetime + Datetime not implemented
        s = pd.Series(data)
        msg = "cannot add DatetimeArray and DatetimeArray"
        with pytest.raises(TypeError, match=msg):
            s + data

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        if all_arithmetic_operators in self.implements:
            # NOTE(review): this branch still uses a scalar operand
            # (s.iloc[0]), like the scalar test above -- confirm whether an
            # array operand was intended.
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest. Delegate to the *array* variant of the
            # parent test (previously this called the scalar variant, so the
            # parent's array test never ran for these operators).
            super().test_arith_series_with_array(data, all_arithmetic_operators)

    def test_error(self, data, all_arithmetic_operators):
        pass

    def test_divmod_series_array(self):
        # GH 23287
        # skipping because it is not implemented
        pass

    @pytest.mark.xfail(reason="different implementation", strict=False)
    def test_direct_arith_with_series_returns_not_implemented(self, data):
        # Right now, we have trouble with this. Returning NotImplemented
        # fails other tests like
        # tests/arithmetic/test_datetime64::TestTimestampSeriesArithmetic::
        # test_dt64_seris_add_intlike
        return super().test_direct_arith_with_series_returns_not_implemented(data)
class TestCasting(BaseDatetimeTests, base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests):
    """Comparison tests; the generic comparison check is disabled."""

    def _compare_other(self, s, data, op_name, other):
        # the base test is not appropriate for us. We raise on comparison
        # with (some) integers, depending on the value.
        pass

    @pytest.mark.xfail(reason="different implementation", strict=False)
    def test_direct_arith_with_series_returns_not_implemented(self, data):
        parent = super()
        return parent.test_direct_arith_with_series_returns_not_implemented(data)
class TestMissing(BaseDatetimeTests, base.BaseMissingTests):
    """Run the inherited missing-data tests unchanged."""
class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests):
    """Reshaping tests: concat is skipped and unstack is re-implemented."""

    @pytest.mark.skip(reason="We have DatetimeTZBlock")
    def test_concat(self, data, in_frame):
        pass

    def test_concat_mixed_dtypes(self, data):
        # concat(Series[datetimetz], Series[category]) uses a
        # plain np.array(values) on the DatetimeArray, which
        # drops the tz.
        super().test_concat_mixed_dtypes(data)

    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, obj):
        # GH-13287: can't use base test, since building the expected fails.
        data = DatetimeArray._from_sequence(
            ["2000", "2001", "2002", "2003"], tz="US/Central"
        )
        index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"])
        # Row index left over once level 0 ("a") moves to the columns.
        remaining_index = pd.Index(["a", "b"], name="b")

        if obj == "series":
            to_unstack = pd.Series(data, index=index)
            expected = pd.DataFrame(
                {"A": data.take([0, 1]), "B": data.take([2, 3])},
                index=remaining_index,
            )
            expected.columns.name = "a"
        else:
            to_unstack = pd.DataFrame({"A": data, "B": data}, index=index)
            expected_columns = {
                ("A", "A"): data.take([0, 1]),
                ("A", "B"): data.take([2, 3]),
                ("B", "A"): data.take([0, 1]),
                ("B", "B"): data.take([2, 3]),
            }
            expected = pd.DataFrame(expected_columns, index=remaining_index)
            expected.columns.names = [None, "a"]

        result = to_unstack.unstack(0)
        self.assert_equal(result, expected)
class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests):
    """Run the inherited setitem tests unchanged."""
class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests):
    """Run the inherited groupby tests unchanged."""
class TestPrinting(BaseDatetimeTests, base.BasePrintingTests):
    """Run the inherited printing tests unchanged."""

View File

@@ -0,0 +1,74 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.internals import BlockManager, SingleBlockManager
from pandas.core.internals.blocks import Block, NonConsolidatableMixIn
class CustomBlock(NonConsolidatableMixIn, Block):
    """Externally defined Block subclass with custom formatting and concat."""

    _holder = np.ndarray

    def formatting_values(self):
        # Prefix every stored value with "Val: ".
        labelled = list(map("Val: {}".format, self.values))
        return np.array(labelled)

    def concat_same_type(self, to_concat, placement=None):
        """
        Always concatenate disregarding self.ndim as the values are
        always 1D in this custom Block
        """
        values = np.concatenate([blk.values for blk in to_concat])
        if not placement:
            # Default placement spans all concatenated values.
            placement = slice(0, len(values), 1)
        return self.make_block_same_class(values, placement=placement)
@pytest.fixture
def df():
    """Fixture: DataFrame with an ordinary column "a" plus a CustomBlock "b"."""
    plain = pd.DataFrame({"a": [1, 2, 3]})
    custom_block = CustomBlock(np.arange(3, dtype="int64"), placement=slice(1, 2))
    all_blocks = plain._data.blocks + (custom_block,)
    axes = [pd.Index(["a", "b"]), plain.index]
    return pd.DataFrame(BlockManager(all_blocks, axes))
def test_custom_repr():
    """The repr of a Series/DataFrame uses CustomBlock.formatting_values."""
    # NOTE(review): the expected strings below contain single spaces where a
    # column-aligned repr would normally have padding; they may have had runs
    # of whitespace collapsed -- confirm against the actual repr output.
    values = np.arange(3, dtype="int64")

    # series
    block = CustomBlock(values, placement=slice(0, 3))

    s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3)))
    assert repr(s) == "0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64"

    # dataframe
    block = CustomBlock(values, placement=slice(0, 1))
    blk_mgr = BlockManager([block], [["col"], range(3)])
    df = pd.DataFrame(blk_mgr)
    assert repr(df) == " col\n0 Val: 0\n1 Val: 1\n2 Val: 2"
def test_concat_series():
    """Concatenating two CustomBlock-backed Series keeps the CustomBlock."""
    # GH17728
    block = CustomBlock(np.arange(3, dtype="int64"), placement=slice(0, 3))
    series = pd.Series(block, pd.RangeIndex(3), fastpath=True)

    combined = pd.concat([series, series])
    first_block = combined._data.blocks[0]
    assert isinstance(first_block, CustomBlock)
def test_concat_dataframe(df):
    """Row-wise concat of the fixture frame keeps a CustomBlock at blocks[1]."""
    # GH17728
    combined = pd.concat([df, df])
    second_block = combined._data.blocks[1]
    assert isinstance(second_block, CustomBlock)
def test_concat_axis1(df):
    """Column-wise concat with a float frame keeps a CustomBlock at blocks[1]."""
    # GH17954
    floats = pd.DataFrame({"c": [0.1, 0.2, 0.3]})
    combined = pd.concat([df, floats], axis=1)
    second_block = combined._data.blocks[1]
    assert isinstance(second_block, CustomBlock)

View File

@@ -0,0 +1,237 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import numpy as np
import pytest
from pandas.core.dtypes.common import is_extension_array_dtype
import pandas as pd
from pandas.core.arrays import integer_array
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.tests.extension import base
def make_data():
    """Return a 100-element list of ints with NaN at positions 8 and 97."""
    return [*range(1, 9), np.nan, *range(10, 98), np.nan, 99, 100]
@pytest.fixture(
    params=[
        Int8Dtype,
        Int16Dtype,
        Int32Dtype,
        Int64Dtype,
        UInt8Dtype,
        UInt16Dtype,
        UInt32Dtype,
        UInt64Dtype,
    ]
)
def dtype(request):
    """Fixture: an instance of each signed and unsigned integer dtype class."""
    dtype_class = request.param
    return dtype_class()
@pytest.fixture
def data(dtype):
    """Fixture: integer array built from ``make_data()`` in the given dtype."""
    raw_values = make_data()
    return integer_array(raw_values, dtype=dtype)
@pytest.fixture
def data_for_twos(dtype):
    """Fixture: integer array of 100 twos in the given dtype."""
    twos = np.ones(100) * 2
    return integer_array(twos, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
    """Fixture: length-2 integer array holding NaN followed by 1."""
    na_then_valid = [np.nan, 1]
    return integer_array(na_then_valid, dtype=dtype)
@pytest.fixture
def data_for_sorting(dtype):
    """Fixture: integer array [1, 2, 0] in the given dtype."""
    unsorted = [1, 2, 0]
    return integer_array(unsorted, dtype=dtype)
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Fixture: integer array [1, NaN, 0] in the given dtype."""
    unsorted = [1, np.nan, 0]
    return integer_array(unsorted, dtype=dtype)
@pytest.fixture
def na_cmp():
    """Fixture: comparator that is true only when both arguments are NaN."""
    # we are np.nan
    def both_nan(x, y):
        return np.isnan(x) and np.isnan(y)

    return both_nan
@pytest.fixture
def na_value():
    """Fixture: the missing-value scalar for these tests (``np.nan``)."""
    return np.nan
@pytest.fixture
def data_for_grouping(dtype):
    """Fixture: [1, 1, NaN, NaN, 0, 0, 1, 2] as an integer array."""
    low, mid, high = 0, 1, 2
    missing = np.nan
    pattern = [mid, mid, missing, missing, low, low, mid, high]
    return integer_array(pattern, dtype=dtype)
class TestDtype(base.BaseDtypeTests):
    """Inherited dtype tests; the unboxing test is skipped."""

    @pytest.mark.skip(reason="using multiple dtypes")
    def test_is_dtype_unboxes_dtype(self):
        # we have multiple dtypes, so skip
        pass
class TestArithmeticOps(base.BaseArithmeticOpsTests):
    """Arithmetic tests for integer arrays, whose operators do not raise."""

    def check_opname(self, s, op_name, other, exc=None):
        # overwriting to indicate ops don't raise an error
        super().check_opname(s, op_name, other, exc=None)

    def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
        if exc is not None:
            # An exception is expected: nothing else to compare.
            with pytest.raises(exc):
                op(s, other)
            return

        unsigned = s.dtype.is_unsigned_integer
        if unsigned and (op_name == "__rsub__"):
            # TODO see https://github.com/pandas-dev/pandas/issues/22023
            pytest.skip("unsigned subtraction gives negative values")

        if (
            hasattr(other, "dtype")
            and not is_extension_array_dtype(other.dtype)
            and pd.api.types.is_integer_dtype(other.dtype)
        ):
            # other is np.int64 and would therefore always result in
            # upcasting, so keeping other as same numpy_dtype
            other = other.astype(s.dtype.numpy_dtype)

        result = op(s, other)
        expected = s.combine(other, op)

        if op_name in ("__rtruediv__", "__truediv__", "__div__"):
            expected = expected.astype(float)
            if op_name == "__rtruediv__":
                # TODO reverse operators result in object dtype
                result = result.astype(float)
        elif op_name.startswith("__r"):
            # TODO reverse operators result in object dtype
            # see https://github.com/pandas-dev/pandas/issues/22024
            expected = expected.astype(s.dtype)
            result = result.astype(s.dtype)
        else:
            # combine method result in 'biggest' (int64) dtype
            expected = expected.astype(s.dtype)

        if (op_name == "__rpow__") and isinstance(other, pd.Series):
            # TODO pow on Int arrays gives different result with NA
            # see https://github.com/pandas-dev/pandas/issues/22022
            result = result.fillna(1)

        self.assert_series_equal(result, expected)

    def _check_divmod_op(self, s, op, other, exc=None):
        super()._check_divmod_op(s, op, other, None)

    @pytest.mark.skip(reason="intNA does not error on ops")
    def test_error(self, data, all_arithmetic_operators):
        # other specific errors tested in the integer array specific tests
        pass
class TestComparisonOps(base.BaseComparisonOpsTests):
    """Comparison tests for integer arrays; no operator is expected to raise."""

    def check_opname(self, s, op_name, other, exc=None):
        # The incoming ``exc`` is ignored; None is always forwarded.
        super().check_opname(s, op_name, other, exc=None)

    def _compare_other(self, s, data, op_name, other):
        self.check_opname(s, op_name, other)
class TestInterface(base.BaseInterfaceTests):
    """Run the inherited interface tests unchanged."""
class TestConstructors(base.BaseConstructorsTests):
    """Run the inherited constructor tests unchanged."""
class TestReshaping(base.BaseReshapingTests):
    """Run the inherited reshaping tests unchanged."""

    pass

    # for test_concat_mixed_dtypes test
    # concat of an Integer and Int coerces to object dtype
    # TODO(jreback) once integrated this would
    # NOTE(review): the TODO above ends mid-sentence in the original.
class TestGetitem(base.BaseGetitemTests):
    """Run the inherited getitem tests unchanged."""
class TestSetitem(base.BaseSetitemTests):
    """Run the inherited setitem tests unchanged."""
class TestMissing(base.BaseMissingTests):
    """Run the inherited missing-data tests unchanged."""
class TestMethods(base.BaseMethodsTests):
    """Inherited method tests with a custom value_counts check."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        # Only the first ten elements are compared.
        all_data = all_data[:10]
        # With dropna the reference is an ndarray of the non-missing values.
        other = np.array(all_data[~all_data.isna()]) if dropna else all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
        expected.index = expected.index.astype(all_data.dtype)

        self.assert_series_equal(result, expected)
class TestCasting(base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
class TestGroupby(base.BaseGroupbyTests):
    """Run the inherited groupby tests unchanged."""
class TestNumericReduce(base.BaseNumericReduceTests):
    """Run the inherited numeric-reduction tests unchanged."""
class TestBooleanReduce(base.BaseBooleanReduceTests):
    """Run the inherited boolean-reduction tests unchanged."""
class TestPrinting(base.BasePrintingTests):
    """Run the inherited printing tests unchanged."""
class TestParsing(base.BaseParsingTests):
    """Run the inherited parsing tests unchanged."""

View File

@@ -0,0 +1,161 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas import Interval
from pandas.core.arrays import IntervalArray
from pandas.tests.extension import base
def make_data():
    """Return 100 Intervals with cumulative-sum left edges and random widths."""
    count = 100
    starts = np.random.uniform(size=count).cumsum()
    stops = starts + np.random.uniform(size=count)
    # Pair each start with its stop, in order.
    return list(map(Interval, starts, stops))
@pytest.fixture
def dtype():
    """Fixture: a default-constructed IntervalDtype."""
    interval_dtype = IntervalDtype()
    return interval_dtype
@pytest.fixture
def data():
    """Length-100 IntervalArray for semantics test."""
    intervals = make_data()
    return IntervalArray(intervals)
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    na_then_valid = [None, (0, 1)]
    return IntervalArray.from_tuples(na_then_valid)
@pytest.fixture
def data_for_sorting():
    """Fixture: IntervalArray of (1, 2), (2, 3), (0, 1), in that order."""
    unsorted = [(1, 2), (2, 3), (0, 1)]
    return IntervalArray.from_tuples(unsorted)
@pytest.fixture
def data_missing_for_sorting():
    """Fixture: IntervalArray of (1, 2), NA, (0, 1), in that order."""
    unsorted = [(1, 2), None, (0, 1)]
    return IntervalArray.from_tuples(unsorted)
@pytest.fixture
def na_value():
    """Fixture: the missing-value scalar for these tests (``np.nan``)."""
    return np.nan
@pytest.fixture
def data_for_grouping():
    """Fixture: intervals in the pattern [b, b, NA, NA, a, a, b, c]."""
    low, mid, high = (0, 1), (1, 2), (2, 3)
    pattern = [mid, mid, None, None, low, low, mid, high]
    return IntervalArray.from_tuples(pattern)
class BaseInterval:
    """Empty mixin listed first in the bases of most test classes below."""
class TestDtype(BaseInterval, base.BaseDtypeTests):
    """Run the inherited dtype tests unchanged."""
class TestCasting(BaseInterval, base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
class TestConstructors(BaseInterval, base.BaseConstructorsTests):
    """Run the inherited constructor tests unchanged."""
class TestGetitem(BaseInterval, base.BaseGetitemTests):
    """Run the inherited getitem tests unchanged."""
class TestGrouping(BaseInterval, base.BaseGroupbyTests):
    """Run the inherited groupby tests unchanged."""
class TestInterface(BaseInterval, base.BaseInterfaceTests):
    """Run the inherited interface tests unchanged."""
class TestReduce(base.BaseNoReduceTests):
    """Run the inherited ``BaseNoReduceTests`` suite unchanged."""
class TestMethods(BaseInterval, base.BaseMethodsTests):
    """Inherited method tests; two inapplicable tests are skipped."""

    @pytest.mark.skip(reason="addition is not defined for intervals")
    def test_combine_add(self, data_repeated):
        pass

    @pytest.mark.skip(reason="Not Applicable")
    def test_fillna_length_mismatch(self, data_missing):
        pass
class TestMissing(BaseInterval, base.BaseMissingTests):
    """Inherited missing-data tests; non-scalar fills are skipped."""

    # Index.fillna only accepts scalar `value`, so we have to skip all
    # non-scalar fill tests.
    unsupported_fill = pytest.mark.skip("Unsupported fillna option.")

    @unsupported_fill
    def test_fillna_limit_pad(self):
        pass

    @unsupported_fill
    def test_fillna_series_method(self):
        pass

    @unsupported_fill
    def test_fillna_limit_backfill(self):
        pass

    @unsupported_fill
    def test_fillna_series(self):
        pass

    def test_non_scalar_raises(self, data_missing):
        # A list fill value must be rejected with a TypeError.
        msg = "Got a 'list' instead."
        with pytest.raises(TypeError, match=msg):
            data_missing.fillna([1, 1])
class TestReshaping(BaseInterval, base.BaseReshapingTests):
    """Run the inherited reshaping tests unchanged."""
class TestSetitem(BaseInterval, base.BaseSetitemTests):
    """Run the inherited setitem tests unchanged."""
class TestPrinting(BaseInterval, base.BasePrintingTests):
    """Inherited printing tests; the array repr test is skipped."""

    @pytest.mark.skip(reason="custom repr")
    def test_array_repr(self, data, size):
        pass
class TestParsing(BaseInterval, base.BaseParsingTests):
    """Parsing tests: the inherited test is expected to raise NotImplementedError."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        # The inherited test must fail with the "must implement
        # _from_sequence_of_strings" message for both engines.
        expected_msg = r".*must implement _from_sequence_of_strings.*"
        with pytest.raises(NotImplementedError, match=expected_msg):
            super().test_EA_types(engine, data)

View File

@@ -0,0 +1,392 @@
import numpy as np
import pytest
from pandas.compat.numpy import _np_version_under1p16
import pandas as pd
from pandas.core.arrays.numpy_ import PandasArray, PandasDtype
import pandas.util.testing as tm
from . import base
@pytest.fixture(params=["float", "object"])
def dtype(request):
    """Fixture: PandasDtype wrapping the parametrized NumPy dtype."""
    numpy_dtype = np.dtype(request.param)
    return PandasDtype(numpy_dtype)
@pytest.fixture
def allow_in_pandas(monkeypatch):
    """
    A monkeypatch that tells pandas to let us in.

    By default, passing a PandasArray to an index / series / frame
    constructor will unbox that PandasArray to an ndarray, and treat
    it as a non-EA column. We don't want people using EAs without
    reason.

    The mechanism for this is a check against ABCPandasArray
    in each constructor.

    But, for testing, we need to allow them in pandas. So we patch
    the _typ of PandasArray, so that we evade the ABCPandasArray
    check.
    """
    # The patch is active only while the requesting test runs: the context
    # manager exits after the ``yield``.
    with monkeypatch.context() as m:
        m.setattr(PandasArray, "_typ", "extension")
        yield
@pytest.fixture
def data(allow_in_pandas, dtype):
    """Fixture: 100 one-element tuples for object dtype, else the values 1..100."""
    if dtype.numpy_dtype == "object":
        nested = [(i,) for i in range(100)]
        return pd.Series(nested).array
    numbers = np.arange(1, 101, dtype=dtype._dtype)
    return PandasArray(numbers)
@pytest.fixture
def data_missing(allow_in_pandas, dtype):
    """Fixture: length-2 array of NaN followed by a valid value."""
    if dtype.numpy_dtype != "object":
        return PandasArray(np.array([np.nan, 1.0]))

    # For NumPy <1.16, np.array([np.nan, (1,)]) raises
    # ValueError: setting an array element with a sequence.
    if _np_version_under1p16:
        pytest.skip("Skipping for NumPy <1.16")
    return PandasArray(np.array([np.nan, (1,)]))
@pytest.fixture
def na_value():
    """Fixture: the missing-value scalar for these tests (``np.nan``)."""
    return np.nan
@pytest.fixture
def na_cmp():
    """Fixture: comparator that is true only when both arguments are NaN."""
    return lambda a, b: np.isnan(a) and np.isnan(b)
@pytest.fixture
def data_for_sorting(allow_in_pandas, dtype):
    """Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C
    """
    if dtype.numpy_dtype == "object":
        # Use an empty tuple for first element, then remove,
        # to disable np.array's shape inference.
        values = np.array([(), (2,), (3,), (1,)])[1:]
    else:
        values = np.array([1, 2, 0])
    return PandasArray(values)
@pytest.fixture
def data_missing_for_sorting(allow_in_pandas, dtype):
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    if dtype.numpy_dtype == "object":
        values = np.array([(1,), np.nan, (0,)])
    else:
        values = np.array([1, np.nan, 0])
    return PandasArray(values)
@pytest.fixture
def data_for_grouping(allow_in_pandas, dtype):
    """Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing
    """
    if dtype.numpy_dtype == "object":
        low, mid, high = (1,), (2,), (3,)
    else:
        low, mid, high = np.arange(3)
    pattern = [mid, mid, np.nan, np.nan, low, low, mid, high]
    return PandasArray(np.array(pattern))
@pytest.fixture
def skip_numpy_object(dtype):
    """
    Skip the requesting test when the ``dtype`` fixture is the object dtype.

    Nested data in a PandasArray is unusual: users typically won't create
    it via `pd.array`, but it can show up through `.array` on a Series with
    nested data. Many of the base tests fail on it, as they aren't
    appropriate for nested data.

    Use this fixture through a usefixtures marker on an individual test or
    on a whole test class.
    """
    is_object_dtype = dtype == "object"
    if is_object_dtype:
        pytest.skip("Skipping for object dtype.")
# Marker that applies the ``skip_numpy_object`` fixture to a test or class.
skip_nested = pytest.mark.usefixtures("skip_numpy_object")
class BaseNumPyTests:
    """Empty mixin listed first in the bases of the test classes below."""
class TestCasting(BaseNumPyTests, base.BaseCastingTests):
    """Inherited casting tests; astype(str) is skipped for object dtype."""

    @skip_nested
    def test_astype_str(self, data):
        # ValueError: setting an array element with a sequence
        super().test_astype_str(data)
class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
    """Inherited constructor tests with one skip and one object-dtype skip."""

    @pytest.mark.skip(reason="We don't register our dtype")
    # We don't want to register. This test should probably be split in two.
    def test_from_dtype(self, data):
        pass

    @skip_nested
    def test_array_from_scalars(self, data):
        # ValueError: PandasArray must be 1-dimensional.
        super().test_array_from_scalars(data)
class TestDtype(BaseNumPyTests, base.BaseDtypeTests):
    """Inherited dtype tests; ``test_check_dtype`` is skipped."""

    @pytest.mark.skip(reason="Incorrect expected.")
    # we unsurprisingly clash with a NumPy name.
    def test_check_dtype(self, data):
        pass
class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
    """Inherited getitem tests with object-dtype skips and one xfail."""

    @skip_nested
    def test_getitem_scalar(self, data):
        # AssertionError
        super().test_getitem_scalar(data)

    @skip_nested
    def test_take_series(self, data):
        # ValueError: PandasArray must be 1-dimensional.
        super().test_take_series(data)

    @pytest.mark.xfail(reason="astype doesn't recognize data.dtype")
    def test_loc_iloc_frame_single_dtype(self, data):
        super().test_loc_iloc_frame_single_dtype(data)
class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests):
    """Inherited groupby tests; the apply test is skipped for object dtype."""

    @skip_nested
    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
        # ValueError: Names should be list-like for a MultiIndex
        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
class TestInterface(BaseNumPyTests, base.BaseInterfaceTests):
    """Inherited interface tests; array-interface is skipped for object dtype."""

    @skip_nested
    def test_array_interface(self, data):
        # NumPy array shape inference
        super().test_array_interface(data)
class TestMethods(BaseNumPyTests, base.BaseMethodsTests):
    """Inherited method tests; many are skipped for object dtype."""

    @pytest.mark.skip(reason="TODO: remove?")
    def test_value_counts(self, all_data, dropna):
        pass

    @pytest.mark.skip(reason="Incorrect expected")
    # We have a bool dtype, so the result is an ExtensionArray
    # but expected is not
    def test_combine_le(self, data_repeated):
        super().test_combine_le(data_repeated)

    @skip_nested
    def test_combine_add(self, data_repeated):
        # Not numeric
        super().test_combine_add(data_repeated)

    @skip_nested
    def test_shift_fill_value(self, data):
        # np.array shape inference. Shift implementation fails.
        super().test_shift_fill_value(data)

    @skip_nested
    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        # Fails creating expected
        super().test_unique(data, box, method)

    @skip_nested
    def test_fillna_copy_frame(self, data_missing):
        # The "scalar" for this array isn't a scalar.
        super().test_fillna_copy_frame(data_missing)

    @skip_nested
    def test_fillna_copy_series(self, data_missing):
        # The "scalar" for this array isn't a scalar.
        super().test_fillna_copy_series(data_missing)

    @skip_nested
    def test_hash_pandas_object_works(self, data, as_frame):
        # ndarray of tuples not hashable
        super().test_hash_pandas_object_works(data, as_frame)

    @skip_nested
    def test_searchsorted(self, data_for_sorting, as_series):
        # Test setup fails.
        super().test_searchsorted(data_for_sorting, as_series)

    @skip_nested
    def test_where_series(self, data, na_value, as_frame):
        # Test setup fails.
        super().test_where_series(data, na_value, as_frame)

    @skip_nested
    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
    def test_repeat(self, data, repeats, as_series, use_numpy):
        # Fails creating expected
        super().test_repeat(data, repeats, as_series, use_numpy)
@skip_nested
class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
    """Arithmetic tests (skipped for object dtype); no operator should raise."""

    # Expected exception for each family of checks: None means "must succeed".
    divmod_exc = None
    series_scalar_exc = None
    frame_scalar_exc = None
    series_array_exc = None

    def test_divmod_series_array(self, data):
        s = pd.Series(data)
        self._check_divmod_op(s, divmod, data, exc=None)

    @pytest.mark.skip("We implement ops")
    def test_error(self, data, all_arithmetic_operators):
        pass

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        # Plain delegation to the inherited test.
        super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        # Plain delegation to the inherited test.
        super().test_arith_series_with_array(data, all_arithmetic_operators)
class TestPrinting(BaseNumPyTests, base.BasePrintingTests):
    """Run the inherited printing tests unchanged."""
@skip_nested
class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests):
    """Numeric reduction tests (skipped for object dtype)."""

    def check_reduce(self, s, op_name, skipna):
        reduce_extension = getattr(s, op_name)
        result = reduce_extension(skipna=skipna)

        # avoid coercing int -> float. Just cast to the actual numpy type.
        as_numpy = s.astype(s.dtype._dtype)
        expected = getattr(as_numpy, op_name)(skipna=skipna)

        tm.assert_almost_equal(result, expected)
@skip_nested
class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests):
    """Run the inherited boolean-reduction tests (skipped for object dtype)."""
class TestMissing(BaseNumPyTests, base.BaseMissingTests):
    """Inherited missing-data tests; fillna tests are skipped for object dtype."""

    @skip_nested
    def test_fillna_scalar(self, data_missing):
        # Non-scalar "scalar" values.
        super().test_fillna_scalar(data_missing)

    @skip_nested
    def test_fillna_series_method(self, data_missing, fillna_method):
        # Non-scalar "scalar" values.
        super().test_fillna_series_method(data_missing, fillna_method)

    @skip_nested
    def test_fillna_series(self, data_missing):
        # Non-scalar "scalar" values.
        super().test_fillna_series(data_missing)

    @skip_nested
    def test_fillna_frame(self, data_missing):
        # Non-scalar "scalar" values.
        super().test_fillna_frame(data_missing)
class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
    """Inherited reshaping tests; merge tests are skipped for object dtype."""

    @pytest.mark.skip("Incorrect parent test")
    # not actually a mixed concat, since we concat int and int.
    def test_concat_mixed_dtypes(self, data):
        super().test_concat_mixed_dtypes(data)

    @skip_nested
    def test_merge(self, data, na_value):
        # Fails creating expected
        super().test_merge(data, na_value)

    @skip_nested
    def test_merge_on_extension_array(self, data):
        # Fails creating expected
        super().test_merge_on_extension_array(data)

    @skip_nested
    def test_merge_on_extension_array_duplicates(self, data):
        # Fails creating expected
        super().test_merge_on_extension_array_duplicates(data)
class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
    """Setitem tests; each override adds skip_nested and records the noted failure."""

    @skip_nested
    def test_setitem_scalar_series(self, data, box_in_series):
        """Parent test, marked skip_nested. Noted failure: AssertionError."""
        super().test_setitem_scalar_series(data, box_in_series)

    @skip_nested
    def test_setitem_sequence(self, data, box_in_series):
        """Parent test, marked skip_nested.

        Noted failure: ValueError: shape mismatch: value array of shape (2,1)
        could not be broadcast to indexing result of shape (2,)
        """
        super().test_setitem_sequence(data, box_in_series)

    @skip_nested
    def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
        """Parent test, marked skip_nested.

        Noted failure: ValueError: PandasArray must be 1-dimensional.
        """
        super().test_setitem_sequence_mismatched_length_raises(data, as_array)

    @skip_nested
    def test_setitem_sequence_broadcasts(self, data, box_in_series):
        """Parent test, marked skip_nested.

        Noted failure: ValueError: cannot set using a list-like indexer with a
        different length than the value
        """
        super().test_setitem_sequence_broadcasts(data, box_in_series)

    @skip_nested
    def test_setitem_loc_scalar_mixed(self, data):
        """Parent test, marked skip_nested. Noted failure: AssertionError."""
        super().test_setitem_loc_scalar_mixed(data)

    @skip_nested
    def test_setitem_loc_scalar_multiple_homogoneous(self, data):
        """Parent test, marked skip_nested. Noted failure: AssertionError."""
        super().test_setitem_loc_scalar_multiple_homogoneous(data)

    @skip_nested
    def test_setitem_iloc_scalar_mixed(self, data):
        """Parent test, marked skip_nested. Noted failure: AssertionError."""
        super().test_setitem_iloc_scalar_mixed(data)

    @skip_nested
    def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
        """Parent test, marked skip_nested. Noted failure: AssertionError."""
        super().test_setitem_iloc_scalar_multiple_homogoneous(data)

    @skip_nested
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter):
        """Parent test, marked skip_nested and parametrized over ``setter``.

        Noted failure: ValueError: cannot set using a list-like indexer with a
        different length than the value
        """
        super().test_setitem_mask_broadcast(data, setter)

    @skip_nested
    def test_setitem_scalar_key_sequence_raise(self, data):
        """Parent test, marked skip_nested.

        Noted failure: Failed: DID NOT RAISE <class 'ValueError'>
        """
        super().test_setitem_scalar_key_sequence_raise(data)
@skip_nested
class TestParsing(BaseNumPyTests, base.BaseParsingTests):
    """Parsing tests, inherited without overrides."""

View File

@@ -0,0 +1,161 @@
import numpy as np
import pytest
from pandas._libs.tslib import iNaT
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
from pandas.core.arrays import PeriodArray
from pandas.tests.extension import base
@pytest.fixture
def dtype():
    """Return a PeriodDtype constructed with freq="D"."""
    daily = PeriodDtype(freq="D")
    return daily
@pytest.fixture
def data(dtype):
    """Return a PeriodArray built from the 100 integers 1970..2069."""
    values = np.arange(1970, 2070)
    return PeriodArray(values, freq=dtype.freq)
@pytest.fixture
def data_for_twos(dtype):
    """Return a PeriodArray built from 100 values that are all 2."""
    twos = np.ones(100) * 2
    return PeriodArray(twos, freq=dtype.freq)
@pytest.fixture
def data_for_sorting(dtype):
    """Return a three-element PeriodArray built from [2018, 2019, 2017]."""
    unsorted_values = [2018, 2019, 2017]
    return PeriodArray(unsorted_values, freq=dtype.freq)
@pytest.fixture
def data_missing(dtype):
    """Return a two-element PeriodArray: iNaT followed by 2017."""
    values = [iNaT, 2017]
    return PeriodArray(values, freq=dtype.freq)
@pytest.fixture
def data_missing_for_sorting(dtype):
    """Return a three-element PeriodArray built from [2018, iNaT, 2017]."""
    values = [2018, iNaT, 2017]
    return PeriodArray(values, freq=dtype.freq)
@pytest.fixture
def data_for_grouping(dtype):
    """Return an eight-element PeriodArray in the pattern [B, B, NA, NA, A, A, B, C].

    A, B and C are 2017, 2018 and 2019; NA is iNaT.
    """
    low, mid, high, missing = 2017, 2018, 2019, iNaT
    pattern = [mid, mid, missing, missing, low, low, mid, high]
    return PeriodArray(pattern, freq=dtype.freq)
@pytest.fixture
def na_value():
    """Return pd.NaT as the missing-value marker."""
    missing = pd.NaT
    return missing
class BasePeriodTests:
    """Empty mixin placed first in the bases of every Period test class below."""
class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests):
    """Dtype tests, inherited without overrides."""
class TestConstructors(BasePeriodTests, base.BaseConstructorsTests):
    """Constructor tests, inherited without overrides."""
class TestGetitem(BasePeriodTests, base.BaseGetitemTests):
    """Getitem tests, inherited without overrides."""
class TestMethods(BasePeriodTests, base.BaseMethodsTests):
    """Method tests; test_combine_add is replaced by a no-op."""

    def test_combine_add(self, data_repeated):
        """No-op override: Period + Period is not defined."""
class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
    """Interface tests, inherited without overrides."""
class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests for PeriodArray.

    Only the operators named in ``implements`` are checked with ``exc=None``;
    all other operators are delegated to the matching parent test.
    """

    # Operator names that are checked with exc=None.
    implements = {"__sub__", "__rsub__"}

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Check Series-with-scalar arithmetic for one operator name."""
        # we implement subtraction...
        if all_arithmetic_operators in self.implements:
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest.
            super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Check Series-with-array arithmetic for one operator name."""
        if all_arithmetic_operators in self.implements:
            s = pd.Series(data)
            self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None)
        else:
            # ... but not the rest.  Delegate to the parent *array* test; the
            # previous code called the scalar variant here by mistake, so the
            # parent array test never ran.
            super().test_arith_series_with_array(data, all_arithmetic_operators)

    def _check_divmod_op(self, s, op, other, exc=NotImplementedError):
        """Forward to the parent check, always with ``exc=TypeError``.

        The ``exc`` argument passed by the caller is ignored.
        """
        super()._check_divmod_op(s, op, other, exc=TypeError)

    def test_add_series_with_extension_array(self, data):
        """Assert that Series + PeriodArray raises TypeError with the expected message."""
        # we don't implement + for Period
        s = pd.Series(data)
        msg = (
            r"unsupported operand type\(s\) for \+: "
            r"\'PeriodArray\' and \'PeriodArray\'"
        )
        with pytest.raises(TypeError, match=msg):
            s + data

    def test_error(self):
        """No-op override of the parent ``test_error``."""
        pass

    def test_direct_arith_with_series_returns_not_implemented(self, data):
        """Assert that ``data.__sub__(Series)`` returns NotImplemented."""
        # Override to use __sub__ instead of __add__
        other = pd.Series(data)
        result = data.__sub__(other)
        assert result is NotImplemented
class TestCasting(BasePeriodTests, base.BaseCastingTests):
    """Casting tests, inherited without overrides."""
class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests):
    """Comparison-operator tests with the comparison helper disabled."""

    def _compare_other(self, s, data, op_name, other):
        """No-op override: the base test is not appropriate for us.

        We raise on comparison with (some) integers, depending on the value.
        """
class TestMissing(BasePeriodTests, base.BaseMissingTests):
    """Missing-data tests, inherited without overrides."""
class TestReshaping(BasePeriodTests, base.BaseReshapingTests):
    """Reshaping tests, inherited without overrides."""
class TestSetitem(BasePeriodTests, base.BaseSetitemTests):
    """Setitem tests, inherited without overrides."""
class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):
    """Groupby tests, inherited without overrides."""
class TestPrinting(BasePeriodTests, base.BasePrintingTests):
    """Printing tests, inherited without overrides."""
class TestParsing(BasePeriodTests, base.BaseParsingTests):
    """Parsing tests; test_EA_types is parametrized over the parser engine."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Run the parent test once for each of the "c" and "python" engines."""
        parser_engine = engine
        super().test_EA_types(parser_engine, data)

View File

@@ -0,0 +1,370 @@
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import SparseArray, SparseDtype
from pandas.tests.extension import base
import pandas.util.testing as tm
def make_data(fill_value):
    """Build a random length-100 NumPy array with every third slot set to fill_value.

    Uniform floats are drawn when ``fill_value`` is NaN, otherwise integers
    from ``np.random.randint(1, 100)``.  The first two entries are made to
    differ, then positions 2, 5, 8, ... are overwritten with ``fill_value``.
    """
    values = (
        np.random.uniform(size=100)
        if np.isnan(fill_value)
        else np.random.randint(1, 100, size=100)
    )
    # Bump the first entry when it collides with the second one.
    if values[0] == values[1]:
        values[0] += 1
    values[2::3] = fill_value
    return values
@pytest.fixture
def dtype():
    """Return a SparseDtype constructed with no arguments."""
    default_sparse = SparseDtype()
    return default_sparse
@pytest.fixture(params=[0, np.nan])
def data(request):
    """Length-100 SparseArray for semantics test.

    Parametrized over the fill value (0 and NaN); the values come from
    ``make_data``.
    """
    fill = request.param
    return SparseArray(make_data(fill), fill_value=fill)
@pytest.fixture
def data_for_twos(request):
    """Return a SparseArray built from 100 values that are all 2."""
    twos = np.ones(100) * 2
    return SparseArray(twos)
@pytest.fixture(params=[0, np.nan])
def data_missing(request):
    """Length 2 array with [NA, Valid]"""
    na_then_valid = [np.nan, 1]
    return SparseArray(na_then_valid, fill_value=request.param)
@pytest.fixture(params=[0, np.nan])
def data_repeated(request):
    """Return different versions of data for count times"""
    fill = request.param

    def gen(count):
        # Each iteration calls make_data again, so every array is freshly drawn.
        produced = 0
        while produced < count:
            yield SparseArray(make_data(fill), fill_value=fill)
            produced += 1

    yield gen
@pytest.fixture(params=[0, np.nan])
def data_for_sorting(request):
    """Return a SparseArray built from [2, 3, 1] with the parametrized fill value."""
    unsorted_values = [2, 3, 1]
    return SparseArray(unsorted_values, fill_value=request.param)
@pytest.fixture(params=[0, np.nan])
def data_missing_for_sorting(request):
    """Return a SparseArray built from [2, NaN, 1] with the parametrized fill value."""
    values = [2, np.nan, 1]
    return SparseArray(values, fill_value=request.param)
@pytest.fixture
def na_value():
    """Return np.nan as the missing-value marker."""
    missing = np.nan
    return missing
@pytest.fixture
def na_cmp():
    """Return a two-argument callable that is truthy when both arguments are NA."""

    def both_missing(left, right):
        # Same short-circuit semantics as `pd.isna(left) and pd.isna(right)`.
        return pd.isna(left) and pd.isna(right)

    return both_missing
@pytest.fixture(params=[0, np.nan])
def data_for_grouping(request):
    """Return an eight-element SparseArray: [1, 1, NaN, NaN, 2, 2, 1, 3]."""
    pattern = [1, 1, np.nan, np.nan, 2, 2, 1, 3]
    return SparseArray(pattern, fill_value=request.param)
class BaseSparseTests:
    """Mixin shared by the sparse test classes below."""

    def _check_unsupported(self, data):
        """Skip the running test when ``data.dtype`` equals ``SparseDtype(int, 0)``."""
        is_int_zero_fill = data.dtype == SparseDtype(int, 0)
        if is_int_zero_fill:
            pytest.skip("Can't store nan in int array.")

    @pytest.mark.xfail(reason="SparseArray does not support setitem")
    def test_ravel(self, data):
        """Parent ravel test, marked as an expected failure."""
        super().test_ravel(data)
class TestDtype(BaseSparseTests, base.BaseDtypeTests):
    """Dtype tests with a sparse-specific array-type check."""

    def test_array_type_with_arg(self, data, dtype):
        """Assert that the dtype's array type is SparseArray."""
        array_type = dtype.construct_array_type()
        assert array_type is SparseArray
class TestInterface(BaseSparseTests, base.BaseInterfaceTests):
    """Interface tests with two sparse-specific overrides."""

    def test_no_values_attribute(self, data):
        """Always skipped at run time with the reason "We have values"."""
        pytest.skip("We have values")

    def test_copy(self, data):
        """Smoke-test ``copy`` only, because __setitem__ does not work."""
        data.copy()
class TestConstructors(BaseSparseTests, base.BaseConstructorsTests):
    """Constructor tests, inherited without overrides."""
class TestReshaping(BaseSparseTests, base.BaseReshapingTests):
    """Reshaping tests; most overrides first call ``_check_unsupported``."""

    def test_concat_mixed_dtypes(self, data):
        """Concat sparse, int and categorical frames and compare with object columns.

        https://github.com/pandas-dev/pandas/issues/20762
        This should be the same, aside from concat([sparse, float])
        """
        frames = [
            pd.DataFrame({"A": data[:3]}),
            pd.DataFrame({"A": [1, 2, 3]}),
            pd.DataFrame({"A": ["a", "b", "c"]}).astype("category"),
        ]

        def as_object(column):
            # Convert one column to an object-dtype ndarray.
            return np.asarray(column).astype(object)

        # dataframes
        result = pd.concat(frames)
        expected = pd.concat([frame.apply(as_object) for frame in frames])
        self.assert_frame_equal(result, expected)

    def test_concat_columns(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_concat_columns(data, na_value)

    def test_align(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_align(data, na_value)

    def test_align_frame(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_align_frame(data, na_value)

    def test_align_series_frame(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_align_series_frame(data, na_value)

    def test_merge(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_merge(data, na_value)
class TestGetitem(BaseSparseTests, base.BaseGetitemTests):
    """Getitem tests with sparse-specific ``get`` and ``reindex`` handling."""

    def test_get(self, data):
        """Check that label lookups via ``get`` agree with positional ``iloc``."""
        # Labels are the even numbers 0, 2, 4, ..., so label 2*k sits at position k.
        even_labels = list(range(0, 2 * len(data), 2))
        s = pd.Series(data, index=even_labels)
        if not np.isnan(s.values.fill_value):
            assert s.get(4) == s.iloc[2]
        else:
            # NaN never compares equal, so check both sides for NaN instead.
            assert np.isnan(s.get(4))
            assert np.isnan(s.iloc[2])
        assert s.get(2) == s.iloc[1]

    def test_reindex(self, data, na_value):
        """Parent test, preceded by the unsupported-dtype check."""
        self._check_unsupported(data)
        super().test_reindex(data, na_value)
# Skipping TestSetitem, since we don't implement it.
class TestMissing(BaseSparseTests, base.BaseMissingTests):
    """Missing-data tests for SparseArray.

    Several overrides wrap the parent test in
    ``tm.assert_produces_warning(PerformanceWarning)``.
    """

    def test_isna(self, data_missing):
        """Check ``isna`` on the array, on a Series, and on an emptied Series."""
        expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
        expected = SparseArray([True, False], dtype=expected_dtype)

        result = pd.isna(data_missing)
        self.assert_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=expected_dtype)
        self.assert_series_equal(result, expected)

    def test_fillna_limit_pad(self, data_missing):
        """Parent test, expected to emit a PerformanceWarning."""
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_limit_pad(data_missing)

    def test_fillna_limit_backfill(self, data_missing):
        """Parent test, expected to emit a PerformanceWarning."""
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_limit_backfill(data_missing)

    def test_fillna_series_method(self, data_missing, fillna_method):
        """Parent test, expected to emit a PerformanceWarning.

        The previous code called ``test_fillna_limit_backfill`` here by
        mistake, so the parent ``test_fillna_series_method`` never ran.
        """
        with tm.assert_produces_warning(PerformanceWarning):
            super().test_fillna_series_method(data_missing, fillna_method)

    @pytest.mark.skip(reason="Unsupported")
    def test_fillna_series(self):
        """Always skipped (reason: "Unsupported")."""
        # this one looks doable.
        pass

    def test_fillna_frame(self, data_missing):
        """Check DataFrame.fillna, allowing for a changed sparse fill value."""
        # Have to override to specify that fill_value will change.
        fill_value = data_missing[1]

        result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)

        # When the original fill value is NA, the expected dtype carries the
        # new fill value; otherwise the dtype is unchanged.
        if pd.isna(data_missing.fill_value):
            dtype = SparseDtype(data_missing.dtype, fill_value)
        else:
            dtype = data_missing.dtype

        expected = pd.DataFrame(
            {
                "A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype),
                "B": [1, 2],
            }
        )

        self.assert_frame_equal(result, expected)
class TestMethods(BaseSparseTests, base.BaseMethodsTests):
    """Method tests with sparse-specific expectations."""

    def test_combine_le(self, data_repeated):
        """Check ``Series.combine`` with ``<=`` against a Series and a scalar.

        We return a Series[SparseArray].__le__ returns a Series[Sparse[bool]]
        rather than Series[bool]
        """
        first, second = data_repeated(2)
        left = pd.Series(first)
        right = pd.Series(second)

        def less_equal(x1, x2):
            return x1 <= x2

        result = left.combine(right, less_equal)
        pairwise = [a <= b for a, b in zip(first, second)]
        expected = pd.Series(pd.SparseArray(pairwise, fill_value=False))
        self.assert_series_equal(result, expected)

        val = left.iloc[0]
        result = left.combine(val, less_equal)
        against_scalar = [a <= val for a in first]
        expected = pd.Series(pd.SparseArray(against_scalar, fill_value=False))
        self.assert_series_equal(result, expected)

    def test_fillna_copy_frame(self, data_missing):
        """Check identity relations between a frame, its fillna result and the source array."""
        arr = data_missing.take([1, 1])
        frame = pd.DataFrame({"A": arr})

        fill_with = frame.iloc[0, 0]
        filled = frame.fillna(fill_with)

        assert frame.values.base is not filled.values.base
        assert frame.A._values.to_dense() is arr.to_dense()

    def test_fillna_copy_series(self, data_missing):
        """Check identity relations between a series, its fillna result and the source array."""
        arr = data_missing.take([1, 1])
        series = pd.Series(arr)

        fill_with = series[0]
        filled = series.fillna(fill_with)

        assert series._values is not filled._values
        assert series._values.to_dense() is arr.to_dense()

    @pytest.mark.skip(reason="Not Applicable")
    def test_fillna_length_mismatch(self, data_missing):
        """Always skipped (reason: "Not Applicable")."""

    def test_where_series(self, data, na_value):
        """Check ``Series.where`` without and with an ``other`` argument."""
        assert data[0] != data[1]
        array_type = type(data)
        a, b = data[:2]

        ser = pd.Series(array_type._from_sequence([a, a, b, b], dtype=data.dtype))

        # Without `other`: masked positions are expected to hold na_value and
        # the expected dtype is SparseDtype("float", 0.0).
        keep_first_half = np.array([True, True, False, False])
        result = ser.where(keep_first_half)
        float_dtype = SparseDtype("float", 0.0)
        masked = array_type._from_sequence(
            [a, a, na_value, na_value], dtype=float_dtype
        )
        self.assert_series_equal(result, pd.Series(masked))

        # With `other`: masked positions are taken from `other`.
        other = array_type._from_sequence([a, b, a, b], dtype=data.dtype)
        keep_all_but_second = np.array([True, False, True, True])
        result = ser.where(keep_all_but_second, other)
        replaced = array_type._from_sequence([a, b, b, b], dtype=data.dtype)
        self.assert_series_equal(result, pd.Series(replaced))

    def test_combine_first(self, data):
        """Parent test, skipped at run time when the subtype compares equal to "int"."""
        subtype_is_int = data.dtype.subtype == "int"
        if subtype_is_int:
            # Right now this is upcasted to float, just like combine_first
            # for Series[int]
            pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.")
        super().test_combine_first(data)

    def test_searchsorted(self, data_for_sorting, as_series):
        """Parent test, expected to emit a PerformanceWarning."""
        expect_warning = tm.assert_produces_warning(PerformanceWarning)
        with expect_warning:
            super().test_searchsorted(data_for_sorting, as_series)
class TestCasting(BaseSparseTests, base.BaseCastingTests):
    """Casting tests, inherited without overrides."""
class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
    """Arithmetic-operator tests; every expected-exception attribute is None."""

    # Chained assignment binds all four attributes to None, in this order.
    series_scalar_exc = frame_scalar_exc = divmod_exc = series_array_exc = None

    def _skip_if_different_combine(self, data):
        """Skip the running test when ``data.fill_value`` equals 0."""
        if data.fill_value != 0:
            return
        # arith ops call on dtype.fill_value so that the sparsity
        # is maintained. Combine can't be called on a dtype in
        # general, so we can't make the expected. This is tested elsewhere
        pytest.skip("Incorrected expected from Series.combine")

    def test_error(self, data, all_arithmetic_operators):
        """No-op override of the parent ``test_error``."""

    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
        """Parent scalar test, preceded by the fill-value skip check."""
        self._skip_if_different_combine(data)
        op_name = all_arithmetic_operators
        super().test_arith_series_with_scalar(data, op_name)

    def test_arith_series_with_array(self, data, all_arithmetic_operators):
        """Parent array test, preceded by the fill-value skip check."""
        self._skip_if_different_combine(data)
        op_name = all_arithmetic_operators
        super().test_arith_series_with_array(data, op_name)
class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests):
    """Comparison-operator tests with a sparse-aware comparison helper."""

    def _compare_other(self, s, data, op_name, other):
        """Compare ``op(data, other)`` and ``op(Series(data), other)`` with a dense result.

        The incoming ``s`` is not read; a new Series is built from ``data``.
        """
        compare = self.get_op_from_name(op_name)

        # array
        result = pd.Series(compare(data, other))
        # hard to test the fill value, since we don't know what expected
        # is in general.
        # Rely on tests in `tests/sparse` to validate that.
        assert isinstance(result.dtype, SparseDtype)
        assert result.dtype.subtype == np.dtype("bool")

        # The expected fill value is taken from the result itself.
        with np.errstate(all="ignore"):
            dense = compare(np.asarray(data), np.asarray(other))
            sparse = pd.SparseArray(dense, fill_value=result.values.fill_value)
            expected = pd.Series(sparse)

        tm.assert_series_equal(result, expected)

        # series
        s = pd.Series(data)
        tm.assert_series_equal(compare(s, other), expected)
class TestPrinting(BaseSparseTests, base.BasePrintingTests):
    """Printing tests; the array repr test is a strict expected failure."""

    @pytest.mark.xfail(reason="Different repr", strict=True)
    def test_array_repr(self, data, size):
        """Parent repr test, marked xfail with strict=True."""
        repr_size = size
        super().test_array_repr(data, repr_size)
class TestParsing(BaseSparseTests, base.BaseParsingTests):
    """Parsing tests; the parent test is expected to raise NotImplementedError."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data):
        """Assert that the parent test raises NotImplementedError for each engine."""
        expect_not_implemented = pytest.raises(
            NotImplementedError,
            match=r".*must implement _from_sequence_of_strings.*",
        )
        with expect_not_implemented:
            super().test_EA_types(engine, data)