8th day of python challenges 111-117
962  venv/lib/python3.6/site-packages/pandas/tests/io/test_packers.py  Normal file

@@ -0,0 +1,962 @@
import datetime
import glob
from io import BytesIO
import os
from warnings import catch_warnings, filterwarnings

import numpy as np
import pytest

from pandas._libs.tslib import iNaT
from pandas.errors import PerformanceWarning

import pandas
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Interval,
    MultiIndex,
    NaT,
    Period,
    Series,
    Timestamp,
    bdate_range,
    date_range,
    period_range,
)
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_categorical_equal,
    assert_frame_equal,
    assert_index_equal,
    assert_series_equal,
    ensure_clean,
)

from pandas.io.packers import read_msgpack, to_msgpack

nan = np.nan

try:
    import blosc  # NOQA
except ImportError:
    _BLOSC_INSTALLED = False
else:
    _BLOSC_INSTALLED = True

try:
    import zlib  # NOQA
except ImportError:
    _ZLIB_INSTALLED = False
else:
    _ZLIB_INSTALLED = True
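
# Editorial note (added): the try/except/else blocks above record the optional
# compression dependencies once at import time; tests consult these flags and
# call pytest.skip() instead of failing with an ImportError mid-run, e.g.
#
#     if not _BLOSC_INSTALLED:
#         pytest.skip("no blosc")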


@pytest.fixture(scope="module")
def current_packers_data():
    # our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import create_msgpack_data

    return create_msgpack_data()


@pytest.fixture(scope="module")
def all_packers_data():
    # all of our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import create_data

    return create_data()


def check_arbitrary(a, b):

    if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
        assert len(a) == len(b)
        for a_, b_ in zip(a, b):
            check_arbitrary(a_, b_)
    elif isinstance(a, DataFrame):
        assert_frame_equal(a, b)
    elif isinstance(a, Series):
        assert_series_equal(a, b)
    elif isinstance(a, Index):
        assert_index_equal(a, b)
    elif isinstance(a, Categorical):
        # Temporary workaround:
        # Categorical.categories is changed from str to bytes in PY3
        # maybe the same as GH 13591
        if b.categories.inferred_type == "string":
            pass
        else:
            tm.assert_categorical_equal(a, b)
    elif a is NaT:
        assert b is NaT
    elif isinstance(a, Timestamp):
        assert a == b
        assert a.freq == b.freq
    else:
        assert a == b
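
# Editorial note (added): check_arbitrary dispatches on type so that nested
# containers round-tripped through msgpack can be compared element-wise; a
# hypothetical call would be
#
#     check_arbitrary((df, df["A"], None), (df_rec, df_rec["A"], None))
#
# which recurses into the tuple and picks the matching assert_* helper per item.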


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestPackers:
    def setup_method(self, method):
        self.path = "__{}__.msg".format(tm.rands(10))

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        with ensure_clean(self.path) as p:
            to_msgpack(p, x, compress=compress, **kwargs)
            return read_msgpack(p, **kwargs)
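
    # Editorial sketch (added, not part of the original suite): encode_decode
    # is a temp-file round-trip through the pre-1.0 pandas msgpack API, roughly
    #
    #     with ensure_clean(self.path) as p:  # temporary file, removed on exit
    #         to_msgpack(p, x, compress=compress, **kwargs)  # serialize
    #         x_rec = read_msgpack(p, **kwargs)              # read back
    #
    # Every test in this module funnels through it, so compress/encoding
    # keyword arguments are exercised on both the write and the read path.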


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestAPI(TestPackers):
    def test_string_io(self):

        df = DataFrame(np.random.randn(10, 2))
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(BytesIO(s))
        tm.assert_frame_equal(result, df)

        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        with ensure_clean(self.path) as p:

            s = df.to_msgpack()
            with open(p, "wb") as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):

        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        # GH10369
        class A:
            def __init__(self):
                self.read = 0

        msg = "Invalid file path or buffer object type: <class '{}'>"
        invalid_path = os.path.join("nonexistent_dir", "df.msgpack")
        with pytest.raises(ValueError, match=msg.format("NoneType")):
            read_msgpack(path_or_buf=None)
        with pytest.raises(ValueError, match=msg.format("dict")):
            read_msgpack(path_or_buf={})
        with pytest.raises(ValueError, match=msg.format(r".*\.A")):
            read_msgpack(path_or_buf=A())
        with pytest.raises(FileNotFoundError, match="does not exist"):
            read_msgpack(path_or_buf=invalid_path)
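
    # Editorial note (added): the test_string_io test above exercises every
    # input form read_msgpack accepts: raw bytes (df.to_msgpack(None) and
    # df.to_msgpack() both return bytes), a BytesIO buffer, and a file path.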


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestNumpy(TestPackers):
    def test_numpy_scalar_float(self):
        x = np.float32(np.random.rand())
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_numpy_scalar_complex(self):
        x = np.complex64(np.random.rand() + 1j * np.random.rand())
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_scalar_float(self):
        x = np.random.rand()
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_bool(self):
        x = np.bool_(1)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

        x = np.bool_(0)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_complex(self):
        x = np.random.rand() + 1j * np.random.rand()
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_numpy_float(self):
        x = [np.float32(np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, "complex128"):
            pytest.skip("numpy can not handle complex128")

        x = [np.float32(np.random.rand()) for i in range(5)] + [
            np.complex128(np.random.rand() + 1j * np.random.rand()) for i in range(5)
        ]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_float(self):
        x = [np.random.rand() for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_float_complex(self):
        x = [np.random.rand() for i in range(5)] + [
            (np.random.rand() + 1j * np.random.rand()) for i in range(5)
        ]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_dict_float(self):
        x = {"foo": 1.0, "bar": 2.0}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_complex(self):
        x = {"foo": 1.0 + 1.0j, "bar": 2.0 + 2.0j}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")

    def test_dict_numpy_float(self):
        x = {"foo": np.float32(1.0), "bar": np.float32(2.0)}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_numpy_complex(self):
        x = {"foo": np.complex128(1.0 + 1.0j), "bar": np.complex128(2.0 + 2.0j)}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)

        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")

    def test_numpy_array_float(self):

        # run multiple times
        for n in range(10):
            x = np.random.rand(10)
            for dtype in ["float32", "float64"]:
                x = x.astype(dtype)
                x_rec = self.encode_decode(x)
                tm.assert_almost_equal(x, x_rec)

    def test_numpy_array_complex(self):
        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
        x_rec = self.encode_decode(x)
        assert all(map(lambda x, y: x == y, x, x_rec)) and x.dtype == x_rec.dtype

    def test_list_mixed(self):
        x = [1.0, np.float32(3.5), np.complex128(4.25), "foo", np.bool_(1)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestBasic(TestPackers):
    def test_timestamp(self):

        for i in [
            Timestamp("20130101"),
            Timestamp("20130101", tz="US/Eastern"),
            Timestamp("201301010501"),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_nat(self):
        nat_rec = self.encode_decode(NaT)
        assert NaT is nat_rec

    def test_datetimes(self):

        for i in [
            datetime.datetime(2013, 1, 1),
            datetime.datetime(2013, 1, 1, 5, 1),
            datetime.date(2013, 1, 1),
            np.datetime64(datetime.datetime(2013, 1, 5, 2, 15)),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_timedeltas(self):

        for i in [
            datetime.timedelta(days=1),
            datetime.timedelta(days=1, seconds=10),
            np.timedelta64(1000000),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_periods(self):
        # GH 13463
        for i in [Period("2010-09", "M"), Period("2014-Q1", "Q")]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_intervals(self):
        # GH 19967
        for i in [Interval(0, 1), Interval(0, 1, "left"), Interval(10, 25.0, "right")]:
            i_rec = self.encode_decode(i)
            assert i == i_rec


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestIndex(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        self.d = {
            "string": tm.makeStringIndex(100),
            "date": tm.makeDateIndex(100),
            "int": tm.makeIntIndex(100),
            "rng": tm.makeRangeIndex(100),
            "float": tm.makeFloatIndex(100),
            "empty": Index([]),
            "tuple": Index(zip(["foo", "bar", "baz"], [1, 2, 3])),
            "period": Index(period_range("2012-1-1", freq="M", periods=3)),
            "date2": Index(date_range("2013-01-1", periods=10)),
            "bdate": Index(bdate_range("2013-01-02", periods=10)),
            "cat": tm.makeCategoricalIndex(100),
            "interval": tm.makeIntervalIndex(100),
            "timedelta": tm.makeTimedeltaIndex(100, "H"),
        }

        self.mi = {
            "reg": MultiIndex.from_tuples(
                [
                    ("bar", "one"),
                    ("baz", "two"),
                    ("foo", "two"),
                    ("qux", "one"),
                    ("qux", "two"),
                ],
                names=["first", "second"],
            )
        }

    def test_basic_index(self):

        for s, i in self.d.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

        # datetime with no freq (GH5506)
        i = Index([Timestamp("20130101"), Timestamp("20130103")])
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

        # datetime with timezone
        i = Index(
            [Timestamp("20130101 9:00:00"), Timestamp("20130103 11:00:00")]
        ).tz_localize("US/Eastern")
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_multi_index(self):

        for s, i in self.mi.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

    def test_unicode(self):
        i = tm.makeUnicodeIndex(100)

        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_categorical_index(self):
        # GH15487
        df = DataFrame(np.random.randn(10, 2))
        df = df.astype({0: "category"}).set_index(0)
        result = self.encode_decode(df)
        tm.assert_frame_equal(result, df)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestSeries(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        self.d = {}

        s = tm.makeStringSeries()
        s.name = "string"
        self.d["string"] = s

        s = tm.makeObjectSeries()
        s.name = "object"
        self.d["object"] = s

        s = Series(iNaT, dtype="M8[ns]", index=range(5))
        self.d["date"] = s

        data = {
            "A": [0.0, 1.0, 2.0, 3.0, np.nan],
            "B": [0, 1, 0, 1, 0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": date_range("1/1/2009", periods=5),
            "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
            "F": [Timestamp("20130102", tz="US/Eastern")] * 2
            + [Timestamp("20130603", tz="CET")] * 3,
            "G": [Timestamp("20130102", tz="US/Eastern")] * 5,
            "H": Categorical([1, 2, 3, 4, 5]),
            "I": Categorical([1, 2, 3, 4, 5], ordered=True),
            "J": (np.bool_(1), 2, 3, 4, 5),
        }

        self.d["float"] = Series(data["A"])
        self.d["int"] = Series(data["B"])
        self.d["mixed"] = Series(data["E"])
        self.d["dt_tz_mixed"] = Series(data["F"])
        self.d["dt_tz"] = Series(data["G"])
        self.d["cat_unordered"] = Series(data["H"])
        self.d["cat_ordered"] = Series(data["I"])
        self.d["numpy_bool_mixed"] = Series(data["J"])

    def test_basic(self):

        # run multiple times here
        for n in range(10):
            for s, i in self.d.items():
                i_rec = self.encode_decode(i)
                assert_series_equal(i, i_rec)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestCategorical(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        self.d = {}

        self.d["plain_str"] = Categorical(["a", "b", "c", "d", "e"])
        self.d["plain_str_ordered"] = Categorical(
            ["a", "b", "c", "d", "e"], ordered=True
        )

        self.d["plain_int"] = Categorical([5, 6, 7, 8])
        self.d["plain_int_ordered"] = Categorical([5, 6, 7, 8], ordered=True)

    def test_basic(self):

        # run multiple times here
        for n in range(10):
            for s, i in self.d.items():
                i_rec = self.encode_decode(i)
                assert_categorical_equal(i, i_rec)


@pytest.mark.filterwarnings("ignore:msgpack:FutureWarning")
class TestNDFrame(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        data = {
            "A": [0.0, 1.0, 2.0, 3.0, np.nan],
            "B": [0, 1, 0, 1, 0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": date_range("1/1/2009", periods=5),
            "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
            "F": [Timestamp("20130102", tz="US/Eastern")] * 5,
            "G": [Timestamp("20130603", tz="CET")] * 5,
            "H": Categorical(["a", "b", "c", "d", "e"]),
            "I": Categorical(["a", "b", "c", "d", "e"], ordered=True),
        }

        self.frame = {
            "float": DataFrame(dict(A=data["A"], B=Series(data["A"]) + 1)),
            "int": DataFrame(dict(A=data["B"], B=Series(data["B"]) + 1)),
            "mixed": DataFrame(data),
        }

    def test_basic_frame(self):

        for s, i in self.frame.items():
            i_rec = self.encode_decode(i)
            assert_frame_equal(i, i_rec)

    def test_multi(self):

        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

        packed_items = tuple(
            [self.frame["float"], self.frame["float"].A, self.frame["float"].B, None]
        )
        l_rec = self.encode_decode(packed_items)
        check_arbitrary(packed_items, l_rec)

        # this is an oddity in that packed lists will be returned as tuples
        packed_items = [
            self.frame["float"],
            self.frame["float"].A,
            self.frame["float"].B,
            None,
        ]
        l_rec = self.encode_decode(packed_items)
        assert isinstance(l_rec, tuple)
        check_arbitrary(packed_items, l_rec)

    def test_iterator(self):

        packed_items = [
            self.frame["float"],
            self.frame["float"].A,
            self.frame["float"].B,
            None,
        ]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *packed_items)
            for i, packed in enumerate(read_msgpack(path, iterator=True)):
                check_arbitrary(packed, packed_items[i])

    def test_datetimeindex_freq_issue(self):

        # GH 5947
        # inferring freq on the datetimeindex
        df = DataFrame([1, 2, 3], index=date_range("1/1/2013", "1/3/2013"))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

        df = DataFrame([1, 2], index=date_range("1/1/2013", "1/2/2013"))
        result = self.encode_decode(df)
        assert_frame_equal(result, df)

    def test_dataframe_duplicate_column_names(self):

        # GH 9618
        expected_1 = DataFrame(columns=["a", "a"])
        expected_2 = DataFrame(columns=[1] * 100)
        expected_2.loc[0] = np.random.randn(100)
        expected_3 = DataFrame(columns=[1, 1])
        expected_3.loc[0] = ["abc", np.nan]

        result_1 = self.encode_decode(expected_1)
        result_2 = self.encode_decode(expected_2)
        result_3 = self.encode_decode(expected_3)

        assert_frame_equal(result_1, expected_1)
        assert_frame_equal(result_2, expected_2)
        assert_frame_equal(result_3, expected_3)


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestSparse(TestPackers):
    def _check_roundtrip(self, obj, comparator, **kwargs):

        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        msg = r"msgpack sparse (series|frame) is not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            self.encode_decode(obj)

    def test_sparse_series(self):

        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True)

        ss2 = s.to_sparse(kind="integer")
        self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True)

    def test_sparse_frame(self):

        s = tm.makeDataFrame()
        s.loc[3:5, 1:3] = np.nan
        s.loc[8:10, -2] = np.nan
        ss = s.to_sparse()

        self._check_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True)

        ss2 = s.to_sparse(kind="integer")
        self._check_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestCompression(TestPackers):
    """See https://github.com/pandas-dev/pandas/pull/9783"""

    def setup_method(self, method):
        try:
            from sqlalchemy import create_engine

            self._create_sql_engine = create_engine
        except ImportError:
            self._SQLALCHEMY_INSTALLED = False
        else:
            self._SQLALCHEMY_INSTALLED = True

        super().setup_method(method)
        data = {
            "A": np.arange(1000, dtype=np.float64),
            "B": np.arange(1000, dtype=np.int32),
            "C": list(100 * "abcdefghij"),
            "D": date_range(datetime.datetime(2015, 4, 1), periods=1000),
            "E": [datetime.timedelta(days=x) for x in range(1000)],
        }
        self.frame = {
            "float": DataFrame({k: data[k] for k in ["A", "A"]}),
            "int": DataFrame({k: data[k] for k in ["B", "B"]}),
            "mixed": DataFrame(data),
        }

    def test_plain(self):
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

    def _test_compression(self, compress):
        i_rec = self.encode_decode(self.frame, compress=compress)
        for k in self.frame.keys():
            value = i_rec[k]
            expected = self.frame[k]
            assert_frame_equal(value, expected)
            # make sure that we can write to the new frames
            for block in value._data.blocks:
                assert block.values.flags.writeable

    def test_compression_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        self._test_compression("zlib")

    def test_compression_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        self._test_compression("blosc")

    def _test_compression_warns_when_decompress_caches(self, monkeypatch, compress):
        not_garbage = []
        control = []  # copied data

        compress_module = globals()[compress]
        real_decompress = compress_module.decompress

        def decompress(ob):
            """mock decompress function that delegates to the real
            decompress but caches the result and a copy of the result.
            """
            res = real_decompress(ob)
            not_garbage.append(res)  # hold a reference to this bytes object
            control.append(bytearray(res))  # copy the data here to check later
            return res

        # types mapped to values to add in place.
        rhs = {
            np.dtype("float64"): 1.0,
            np.dtype("int32"): 1,
            np.dtype("object"): "a",
            np.dtype("datetime64[ns]"): np.timedelta64(1, "ns"),
            np.dtype("timedelta64[ns]"): np.timedelta64(1, "ns"),
        }

        with monkeypatch.context() as m, tm.assert_produces_warning(
            PerformanceWarning
        ) as ws:
            m.setattr(compress_module, "decompress", decompress)

            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                i_rec = self.encode_decode(self.frame, compress=compress)
            for k in self.frame.keys():

                value = i_rec[k]
                expected = self.frame[k]
                assert_frame_equal(value, expected)
                # make sure that we can write to the new frames even though
                # we needed to copy the data
                for block in value._data.blocks:
                    assert block.values.flags.writeable
                    # mutate the data in some way
                    block.values[0] += rhs[block.dtype]

        for w in ws:
            # check the messages from our warnings
            assert str(w.message) == (
                "copying data after decompressing; "
                "this may mean that decompress is "
                "caching its result"
            )

        for buf, control_buf in zip(not_garbage, control):
            # make sure none of our mutations above affected the
            # original buffers
            assert buf == control_buf

    def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        self._test_compression_warns_when_decompress_caches(monkeypatch, "zlib")

    def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        self._test_compression_warns_when_decompress_caches(monkeypatch, "blosc")
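
    # Editorial note (added): the helper above detects a decompress() that
    # caches its output by holding a reference to every returned buffer
    # (not_garbage) plus a byte-for-byte copy (control); after the unpacked
    # frames are mutated in place, each buffer must still equal its copy,
    # showing that pandas copied rather than aliased the decompressed memory.
    # The PerformanceWarning text asserted above signals that this copy
    # happened.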

    def _test_small_strings_no_warn(self, compress):
        empty = np.array([], dtype="uint8")
        with tm.assert_produces_warning(None):
            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                empty_unpacked = self.encode_decode(empty, compress=compress)

        tm.assert_numpy_array_equal(empty_unpacked, empty)
        assert empty_unpacked.flags.writeable

        char = np.array([ord(b"a")], dtype="uint8")
        with tm.assert_produces_warning(None):
            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                char_unpacked = self.encode_decode(char, compress=compress)

        tm.assert_numpy_array_equal(char_unpacked, char)
        assert char_unpacked.flags.writeable
        # if this test fails I am sorry because the interpreter is now in a
        # bad state where b'a' points to 98 == ord(b'b').
        char_unpacked[0] = ord(b"b")

        # we compare the ord of bytes b'a' with unicode 'a' because they should
        # always be the same (unless we were able to mutate the shared
        # character singleton, in which case ord(b'a') == ord(b'b')).
        assert ord(b"a") == ord("a")
        tm.assert_numpy_array_equal(char_unpacked, np.array([ord(b"b")], dtype="uint8"))

    def test_small_strings_no_warn_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        self._test_small_strings_no_warn("zlib")

    def test_small_strings_no_warn_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        self._test_small_strings_no_warn("blosc")

    def test_readonly_axis_blosc(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        df1 = DataFrame({"A": list("abcd")})
        df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0])
        assert 1 in self.encode_decode(df1["A"], compress="blosc")
        assert 1.0 in self.encode_decode(df2["A"], compress="blosc")

    def test_readonly_axis_zlib(self):
        # GH11880
        df1 = DataFrame({"A": list("abcd")})
        df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0])
        assert 1 in self.encode_decode(df1["A"], compress="zlib")
        assert 1.0 in self.encode_decode(df2["A"], compress="zlib")

    def test_readonly_axis_blosc_to_sql(self):
        # GH11880
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip("no sqlalchemy")
        expected = DataFrame({"A": list("abcd")})
        df = self.encode_decode(expected, compress="blosc")
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql("test", eng, if_exists="append")
        result = pandas.read_sql_table("test", eng, index_col="index")
        result.index.names = [None]
        assert_frame_equal(expected, result)

    def test_readonly_axis_zlib_to_sql(self):
        # GH11880
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        if not self._SQLALCHEMY_INSTALLED:
            pytest.skip("no sqlalchemy")
        expected = DataFrame({"A": list("abcd")})
        df = self.encode_decode(expected, compress="zlib")
        eng = self._create_sql_engine("sqlite:///:memory:")
        df.to_sql("test", eng, if_exists="append")
        result = pandas.read_sql_table("test", eng, index_col="index")
        result.index.names = [None]
        assert_frame_equal(expected, result)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestEncoding(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)
        data = {
            "A": ["\u2019"] * 1000,
            "B": np.arange(1000, dtype=np.int32),
            "C": list(100 * "abcdefghij"),
            "D": date_range(datetime.datetime(2015, 4, 1), periods=1000),
            "E": [datetime.timedelta(days=x) for x in range(1000)],
            "G": [400] * 1000,
        }
        self.frame = {
            "float": DataFrame({k: data[k] for k in ["A", "A"]}),
            "int": DataFrame({k: data[k] for k in ["B", "B"]}),
            "mixed": DataFrame(data),
        }
        self.utf_encodings = ["utf8", "utf16", "utf32"]

    def test_utf(self):
        # GH10581
        for encoding in self.utf_encodings:
            for frame in self.frame.values():
                result = self.encode_decode(frame, encoding=encoding)
                assert_frame_equal(result, frame)

    def test_default_encoding(self):
        for frame in self.frame.values():
            result = frame.to_msgpack()
            expected = frame.to_msgpack(encoding="utf8")
            assert result == expected
            result = self.encode_decode(frame)
            assert_frame_equal(result, frame)


files = glob.glob(
    os.path.join(os.path.dirname(__file__), "data", "legacy_msgpack", "*", "*.msgpack")
)


@pytest.fixture(params=files)
def legacy_packer(request, datapath):
    return datapath(request.param)


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestMsgpack:
    """
    How to add msgpack tests:

    1. Install the pandas version intended to output the msgpack.
    2. Execute "generate_legacy_storage_files.py" to create the msgpack:
    $ python generate_legacy_storage_files.py <output_dir> msgpack

    3. Move the created msgpack to the "data/legacy_msgpack/<version>" directory.
    """
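
    # Editorial sketch (added): after step 3 the data directory is assumed to
    # look roughly like
    #
    #     data/legacy_msgpack/0.20.3/0.20.3_x86_64_linux_3.6.1.msgpack
    #
    # (version directory and file name are illustrative only); the module-level
    # glob above collects every such file for the legacy_packer fixture.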

    minimum_structure = {
        "series": ["float", "int", "mixed", "ts", "mi", "dup"],
        "frame": ["float", "int", "mixed", "mi"],
        "index": ["int", "date", "period"],
        "mi": ["reg2"],
    }

    def check_min_structure(self, data, version):
        for typ, v in self.minimum_structure.items():

            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
            for kind in v:
                msg = '"{0}" not found in data["{1}"]'.format(kind, typ)
                assert kind in data[typ], msg

    def compare(self, current_data, all_data, vf, version):
        data = read_msgpack(vf)

        self.check_min_structure(data, version)
        for typ, dv in data.items():
            assert typ in all_data, 'unpacked data contains extra key "{0}"'.format(
                typ
            )
            for dt, result in dv.items():
                assert (
                    dt in current_data[typ]
                ), 'data["{0}"] contains extra key "{1}"'.format(typ, dt)
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # use a specific comparator if available
                comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt)
                comparator = getattr(self, comp_method, None)
                if comparator is not None:
                    comparator(result, expected, typ, version)
                else:
                    check_arbitrary(result, expected)

        return data
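
    # Editorial note (added): comparators are resolved by naming convention,
    # "compare_{typ}_{dt}"; e.g. typ="series", dt="dt_tz" picks
    # compare_series_dt_tz below, and anything without a matching method falls
    # back to the generic check_arbitrary.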

    def compare_series_dt_tz(self, result, expected, typ, version):
        tm.assert_series_equal(result, expected)

    def compare_frame_dt_mixed_tzs(self, result, expected, typ, version):
        tm.assert_frame_equal(result, expected)

    def test_msgpacks_legacy(
        self, current_packers_data, all_packers_data, legacy_packer, datapath
    ):

        version = os.path.basename(os.path.dirname(legacy_packer))

        try:
            with catch_warnings(record=True):
                self.compare(
                    current_packers_data, all_packers_data, legacy_packer, version
                )
        except ImportError:
            # blosc not installed
            pass

    def test_msgpack_period_freq(self):
        # https://github.com/pandas-dev/pandas/issues/24135
        s = Series(np.random.rand(5), index=date_range("20130101", periods=5))
        r = read_msgpack(s.to_msgpack())
        repr(r)