434 lines
15 KiB
Python
434 lines
15 KiB
Python
import operator
|
|
import re
|
|
|
|
import numpy as np
|
|
from numpy.random import randn
|
|
import pytest
|
|
|
|
from pandas.core.api import DataFrame
|
|
from pandas.core.computation import expressions as expr
|
|
import pandas.util.testing as tm
|
|
from pandas.util.testing import (
|
|
assert_almost_equal,
|
|
assert_frame_equal,
|
|
assert_series_equal,
|
|
)
|
|
|
|
from pandas.io.formats.printing import pprint_thing
|
|
|
|
_frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64")
|
|
_frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64")
|
|
_mixed = DataFrame(
|
|
{
|
|
"A": _frame["A"].copy(),
|
|
"B": _frame["B"].astype("float32"),
|
|
"C": _frame["C"].astype("int64"),
|
|
"D": _frame["D"].astype("int32"),
|
|
}
|
|
)
|
|
_mixed2 = DataFrame(
|
|
{
|
|
"A": _frame2["A"].copy(),
|
|
"B": _frame2["B"].astype("float32"),
|
|
"C": _frame2["C"].astype("int64"),
|
|
"D": _frame2["D"].astype("int32"),
|
|
}
|
|
)
|
|
_integer = DataFrame(
|
|
np.random.randint(1, 100, size=(10001, 4)), columns=list("ABCD"), dtype="int64"
|
|
)
|
|
_integer2 = DataFrame(
|
|
np.random.randint(1, 100, size=(101, 4)), columns=list("ABCD"), dtype="int64"
|
|
)
|
|
|
|
|
|
@pytest.mark.skipif(not expr._USE_NUMEXPR, reason="not using numexpr")
class TestExpressions:
    """
    Check that operations give the same answer with and without numexpr.

    Most tests evaluate an operation once with numexpr switched off (the
    expected value) and once with it switched on, then compare the two.
    The tests toggle module-level state on ``expr``; ``teardown_method``
    puts that state back so a failing test cannot affect later ones.
    """

    def setup_method(self, method):
        # Work on copies: some tests modify their frame in place
        # (see test_integer_with_zeros).
        self.frame = _frame.copy()
        self.frame2 = _frame2.copy()
        self.mixed = _mixed.copy()
        self.mixed2 = _mixed2.copy()
        self.integer = _integer.copy()
        # Remembered so teardown can undo the ``_MIN_ELEMENTS = 0`` overrides.
        self._MIN_ELEMENTS = expr._MIN_ELEMENTS

    def teardown_method(self, method):
        # Restore every piece of global ``expr`` state the tests touch, even
        # when a test failed half-way through a toggle sequence.
        expr._MIN_ELEMENTS = self._MIN_ELEMENTS
        # The class is skipped unless numexpr is in use, so "on" is the
        # state the tests started from.
        expr.set_use_numexpr(True)
        expr.set_numexpr_threads()
        # run_binary enables test mode; do not leak it into other tests.
        expr.set_test_mode(False)

    @staticmethod
    def _get_op(opname, test_flex):
        """
        Return a two-argument callable for ``opname``.

        With ``test_flex`` the callable invokes the method of that name on
        its first argument (e.g. ``x.add(y)``); otherwise it is the function
        of that name from the ``operator`` module.
        """
        if not test_flex:
            return getattr(operator, opname)

        def op(x, y):
            return getattr(x, opname)(y)

        # Mirror operator functions so failure messages can use __name__.
        op.__name__ = opname
        return op

    @staticmethod
    def _eval_without_numexpr(op, left, right):
        """Return ``op(left, right)`` computed with numexpr switched off."""
        expr.set_use_numexpr(False)
        try:
            return op(left, right)
        finally:
            # Re-enable even if ``op`` raised.
            expr.set_use_numexpr(True)

    @staticmethod
    def _run_with_numexpr_variants(testit):
        """
        Call ``testit`` three times: with numexpr off, with numexpr on and
        one thread, and with numexpr on after a bare
        ``set_numexpr_threads()`` call.
        """
        expr.set_use_numexpr(False)
        testit()
        expr.set_use_numexpr(True)
        expr.set_numexpr_threads(1)
        testit()
        expr.set_numexpr_threads()
        testit()

    def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=True):
        """
        Compare arithmetic results computed with and without numexpr.

        Parameters
        ----------
        df, other : operands of each operation
        assert_func : callable
            Called as ``assert_func(expected, result)``.
        check_dtype : bool, default False
            If True, additionally assert that the ``truediv`` result has a
            floating dtype.
        test_flex : bool, default True
            Use the flex method (``df.add(other)``) instead of the
            ``operator`` function.
        """
        expr._MIN_ELEMENTS = 0
        for arith in ["add", "sub", "mul", "mod", "truediv", "floordiv"]:
            op = self._get_op(arith, test_flex)
            expected = self._eval_without_numexpr(op, df, other)
            result = op(df, other)
            try:
                if check_dtype and arith == "truediv":
                    assert expected.dtype.kind == "f"
                assert_func(expected, result)
            except Exception:
                # Say which operator failed before re-raising.
                pprint_thing("Failed test with operator {op.__name__!r}".format(op=op))
                raise

    def test_integer_arithmetic(self):
        self.run_arithmetic(self.integer, self.integer, assert_frame_equal)
        self.run_arithmetic(
            self.integer.iloc[:, 0],
            self.integer.iloc[:, 0],
            assert_series_equal,
            check_dtype=True,
        )

    def run_binary(
        self,
        df,
        other,
        assert_func,
        test_flex=False,
        numexpr_ops=None,
    ):
        """
        tests solely that the result is the same whether or not numexpr is
        enabled.  Need to test whether the function does the correct thing
        elsewhere.

        Parameters
        ----------
        df, other : operands of each comparison
        assert_func : callable
            Called as ``assert_func(expected, result)``.
        test_flex : bool, default False
            Use the flex method (``df.gt(other)``) instead of the
            ``operator`` function.
        numexpr_ops : collection of str, optional
            Names of the comparisons expected to go through numexpr.
            Defaults to all six comparisons.
        """
        if numexpr_ops is None:
            numexpr_ops = {"gt", "lt", "ge", "le", "eq", "ne"}

        expr._MIN_ELEMENTS = 0
        # Test mode is switched off again in teardown_method.
        expr.set_test_mode(True)

        for arith in ["gt", "lt", "ge", "le", "eq", "ne"]:
            op = self._get_op(arith, test_flex)
            expected = self._eval_without_numexpr(op, df, other)
            # Return value deliberately discarded; presumably this resets the
            # record so the next read reflects only the call below.
            expr.get_test_result()
            result = op(df, other)
            used_numexpr = expr.get_test_result()
            try:
                if arith in numexpr_ops:
                    assert used_numexpr, "Did not use numexpr as expected."
                else:
                    assert not used_numexpr, "Used numexpr unexpectedly."
                assert_func(expected, result)
            except Exception:
                pprint_thing("Failed test with operation {arith!r}".format(arith=arith))
                pprint_thing("test_flex was {test_flex!r}".format(test_flex=test_flex))
                raise

    def run_frame(self, df, other, binary_comp=None, run_binary=True, **kwargs):
        """
        Run the arithmetic checks and, unless ``run_binary`` is False, the
        comparison checks on DataFrame operands, in both operator and flex
        form.

        ``binary_comp`` is the right-hand operand for the comparisons; it
        defaults to ``other + 1``.  ``kwargs`` are forwarded to both
        ``run_arithmetic`` and ``run_binary``.
        """
        for flex in (False, True):
            self.run_arithmetic(df, other, assert_frame_equal, test_flex=flex, **kwargs)

        if not run_binary:
            return

        if binary_comp is None:
            # other + 1, computed with numexpr off.
            binary_comp = self._eval_without_numexpr(operator.add, other, 1)

        for flex in (False, True):
            self.run_binary(
                df, binary_comp, assert_frame_equal, test_flex=flex, **kwargs
            )

    def run_series(self, ser, other, binary_comp=None, **kwargs):
        """
        Run the arithmetic checks on Series operands, in both operator and
        flex form.

        ``binary_comp`` is accepted for symmetry with ``run_frame`` but is
        unused: comparisons are not checked here.  Per the note this replaces,
        Series comparisons use vec_compare instead of numexpr.
        """
        self.run_arithmetic(ser, other, assert_series_equal, test_flex=False, **kwargs)
        self.run_arithmetic(ser, other, assert_almost_equal, test_flex=True, **kwargs)

    def test_integer_arithmetic_frame(self):
        self.run_frame(self.integer, self.integer)

    def test_integer_arithmetic_series(self):
        self.run_series(self.integer.iloc[:, 0], self.integer.iloc[:, 0])

    def test_float_arithemtic_frame(self):
        self.run_frame(self.frame2, self.frame2)

    def test_float_arithmetic_series(self):
        self.run_series(self.frame2.iloc[:, 0], self.frame2.iloc[:, 0])

    def test_mixed_arithmetic_frame(self):
        # TODO: FIGURE OUT HOW TO GET IT TO WORK...
        # can't do arithmetic because comparison methods try to do *entire*
        # frame instead of by-column
        self.run_frame(self.mixed2, self.mixed2, run_binary=False)

    def test_mixed_arithmetic_series(self):
        for col in self.mixed2.columns:
            self.run_series(self.mixed2[col], self.mixed2[col], binary_comp=4)

    def test_float_arithemtic(self):
        self.run_arithmetic(self.frame, self.frame, assert_frame_equal)
        self.run_arithmetic(
            self.frame.iloc[:, 0],
            self.frame.iloc[:, 0],
            assert_series_equal,
            check_dtype=True,
        )

    def test_mixed_arithmetic(self):
        self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal)
        for col in self.mixed.columns:
            self.run_arithmetic(self.mixed[col], self.mixed[col], assert_series_equal)

    def test_integer_with_zeros(self):
        # Multiply by a random 0/1 mask so the operands contain zeros.
        self.integer *= np.random.randint(0, 2, size=np.shape(self.integer))
        self.run_arithmetic(self.integer, self.integer, assert_frame_equal)
        self.run_arithmetic(
            self.integer.iloc[:, 0], self.integer.iloc[:, 0], assert_series_equal
        )

    def test_invalid(self):
        # no op
        result = expr._can_use_numexpr(
            operator.add, None, self.frame, self.frame, "evaluate"
        )
        assert not result

        # mixed
        result = expr._can_use_numexpr(
            operator.add, "+", self.mixed, self.frame, "evaluate"
        )
        assert not result

        # min elements
        result = expr._can_use_numexpr(
            operator.add, "+", self.frame2, self.frame2, "evaluate"
        )
        assert not result

        # ok, we only check on first part of expression
        result = expr._can_use_numexpr(
            operator.add, "+", self.frame, self.frame2, "evaluate"
        )
        assert result

    def test_binary_ops(self):
        # "pow" / "**" is not covered by this test.
        arithmetic_ops = [("add", "+"), ("sub", "-"), ("mul", "*"), ("truediv", "/")]

        def testit():
            for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]:
                for name, op_str in arithmetic_ops:
                    op = getattr(operator, name)

                    # Large frame: numexpr is usable unless the frame is
                    # mixed-type.
                    result = expr._can_use_numexpr(op, op_str, f, f, "evaluate")
                    assert result != f._is_mixed_type

                    result = expr.evaluate(op, op_str, f, f, use_numexpr=True)
                    expected = expr.evaluate(op, op_str, f, f, use_numexpr=False)

                    if isinstance(result, DataFrame):
                        tm.assert_frame_equal(result, expected)
                    else:
                        tm.assert_numpy_array_equal(result, expected.values)

                    # Small frame: numexpr must never be chosen.
                    result = expr._can_use_numexpr(op, op_str, f2, f2, "evaluate")
                    assert not result

        self._run_with_numexpr_variants(testit)

    def test_boolean_ops(self):
        comparison_ops = [
            ("gt", ">"),
            ("lt", "<"),
            ("ge", ">="),
            ("le", "<="),
            ("eq", "=="),
            ("ne", "!="),
        ]

        def testit():
            for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]:
                f11 = f
                f12 = f + 1

                f21 = f2
                f22 = f2 + 1

                for name, op_str in comparison_ops:
                    op = getattr(operator, name)

                    result = expr._can_use_numexpr(op, op_str, f11, f12, "evaluate")
                    assert result != f11._is_mixed_type

                    result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True)
                    expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False)
                    if isinstance(result, DataFrame):
                        tm.assert_frame_equal(result, expected)
                    else:
                        tm.assert_numpy_array_equal(result, expected.values)

                    result = expr._can_use_numexpr(op, op_str, f21, f22, "evaluate")
                    assert not result

        self._run_with_numexpr_variants(testit)

    def test_where(self):
        def testit():
            for f in [self.frame, self.frame2, self.mixed, self.mixed2]:
                for cond in [True, False]:
                    # All-True / all-False mask of the frame's shape.
                    c = np.empty(f.shape, dtype=np.bool_)
                    c.fill(cond)
                    result = expr.where(c, f.values, f.values + 1)
                    expected = np.where(c, f.values, f.values + 1)
                    tm.assert_numpy_array_equal(result, expected)

        self._run_with_numexpr_variants(testit)

    def test_bool_ops_raise_on_arithmetic(self):
        df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5})
        msg = "operator %r not implemented for bool dtypes"
        operand_pairs = [
            (df, df),
            (df.a, df.b),
            (df.a, True),
            (False, df.a),
            (False, df),
            (df, True),
        ]
        for op, name in [("/", "truediv"), ("//", "floordiv"), ("**", "pow")]:
            f = getattr(operator, name)
            err_msg = re.escape(msg % op)

            for left, right in operand_pairs:
                with pytest.raises(NotImplementedError, match=err_msg):
                    f(left, right)

    def test_bool_ops_warn_on_arithmetic(self):
        n = 10
        df = DataFrame({"a": np.random.rand(n) > 0.5, "b": np.random.rand(n) > 0.5})
        # Arithmetic operator -> logical operator it must give the same
        # result as ("+" ~ "|", "*" ~ "&").  "sub" ("-" ~ "^") is left out
        # on purpose: the original note says it raises TypeError.
        substitutes = [("add", "or_"), ("mul", "and_")]
        cases = [
            (df, df, tm.assert_frame_equal),
            (df.a, df.b, tm.assert_series_equal),
            (df.a, True, tm.assert_series_equal),
            (False, df.a, tm.assert_series_equal),
            (False, df, tm.assert_frame_equal),
            (df, True, tm.assert_frame_equal),
        ]
        for name, logical_name in substitutes:
            f = getattr(operator, name)
            fe = getattr(operator, logical_name)

            with tm.use_numexpr(True, min_elements=5):
                for left, right, assert_equal in cases:
                    with tm.assert_produces_warning(check_stacklevel=False):
                        r = f(left, right)
                        e = fe(left, right)
                        assert_equal(r, e)

    @pytest.mark.parametrize(
        "test_input,expected",
        [
            (
                DataFrame(
                    [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"]
                ),
                DataFrame([[False, False], [False, False]], columns=["a", "dtype"]),
            ),
            (
                DataFrame(
                    [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]],
                    columns=["a", "b", "c", "dtype"],
                ),
                DataFrame(
                    [[False, False], [False, False], [False, False]],
                    columns=["a", "dtype"],
                ),
            ),
        ],
    )
    def test_bool_ops_column_name_dtype(self, test_input, expected):
        # GH 22383 - .ne fails if columns containing column name 'dtype'
        subset = test_input.loc[:, ["a", "dtype"]]
        result = subset.ne(subset)
        assert_frame_equal(result, expected)