import datetime
import glob
from io import BytesIO
import os
from warnings import catch_warnings, filterwarnings

import numpy as np
import pytest

from pandas._libs.tslib import iNaT
from pandas.errors import PerformanceWarning

import pandas
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Interval,
    MultiIndex,
    NaT,
    Period,
    Series,
    Timestamp,
    bdate_range,
    date_range,
    period_range,
)
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_categorical_equal,
    assert_frame_equal,
    assert_index_equal,
    assert_series_equal,
    ensure_clean,
)

from pandas.io.packers import read_msgpack, to_msgpack

nan = np.nan

try:
    import blosc  # NOQA
except ImportError:
    _BLOSC_INSTALLED = False
else:
    _BLOSC_INSTALLED = True

try:
    import zlib  # NOQA
except ImportError:
    _ZLIB_INSTALLED = False
else:
    _ZLIB_INSTALLED = True


@pytest.fixture(scope="module")
def current_packers_data():
    # our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import create_msgpack_data

    return create_msgpack_data()


@pytest.fixture(scope="module")
def all_packers_data():
    # all of our current version packers data
    from pandas.tests.io.generate_legacy_storage_files import create_data

    return create_data()


def check_arbitrary(a, b):
    if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
        assert len(a) == len(b)
        for a_, b_ in zip(a, b):
            check_arbitrary(a_, b_)
    elif isinstance(a, DataFrame):
        assert_frame_equal(a, b)
    elif isinstance(a, Series):
        assert_series_equal(a, b)
    elif isinstance(a, Index):
        assert_index_equal(a, b)
    elif isinstance(a, Categorical):
        # Temp,
        # Categorical.categories is changed from str to bytes in PY3
        # maybe the same as GH 13591
        if b.categories.inferred_type == "string":
            pass
        else:
            tm.assert_categorical_equal(a, b)
    elif a is NaT:
        assert b is NaT
    elif isinstance(a, Timestamp):
        assert a == b
        assert a.freq == b.freq
    else:
        assert a == b


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestPackers:
    def setup_method(self, method):
        self.path = "__{}__.msg".format(tm.rands(10))

    def teardown_method(self, method):
        pass

    def encode_decode(self, x, compress=None, **kwargs):
        # round-trip x through a temporary msgpack file
        with ensure_clean(self.path) as p:
            to_msgpack(p, x, compress=compress, **kwargs)
            return read_msgpack(p, **kwargs)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestAPI(TestPackers):
    def test_string_io(self):

        df = DataFrame(np.random.randn(10, 2))
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(BytesIO(s))
        tm.assert_frame_equal(result, df)

        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        with ensure_clean(self.path) as p:
            s = df.to_msgpack()
            with open(p, "wb") as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_path_localpath(self):
        df = tm.makeDataFrame()
        result = tm.round_trip_localpath(df.to_msgpack, read_msgpack)
        tm.assert_frame_equal(df, result)

    def test_iterator_with_string_io(self):

        dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)]
        s = to_msgpack(None, *dfs)
        for i, result in enumerate(read_msgpack(s, iterator=True)):
            tm.assert_frame_equal(result, dfs[i])

    def test_invalid_arg(self):
        # GH10369
        class A:
            def __init__(self):
                self.read = 0

        msg = "Invalid file path or buffer object type: <class '{}'>"
        invalid_path = os.path.join("nonexistent_dir", "df.msgpack")
        with pytest.raises(ValueError, match=msg.format("NoneType")):
            read_msgpack(path_or_buf=None)
        with pytest.raises(ValueError, match=msg.format("dict")):
            read_msgpack(path_or_buf={})
        with pytest.raises(ValueError, match=msg.format(r".*\.A")):
            read_msgpack(path_or_buf=A())
        with pytest.raises(FileNotFoundError, match="does not exist"):
            read_msgpack(path_or_buf=invalid_path)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestNumpy(TestPackers):
    def test_numpy_scalar_float(self):
        x = np.float32(np.random.rand())
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_numpy_scalar_complex(self):
        x = np.complex64(np.random.rand() + 1j * np.random.rand())
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_scalar_float(self):
        x = np.random.rand()
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_bool(self):
        x = np.bool_(1)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

        x = np.bool_(0)
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_scalar_complex(self):
        x = np.random.rand() + 1j * np.random.rand()
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_numpy_float(self):
        x = [np.float32(np.random.rand()) for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_numpy_float_complex(self):
        if not hasattr(np, "complex128"):
            pytest.skip("numpy can not handle complex128")

        x = [np.float32(np.random.rand()) for i in range(5)] + [
            np.complex128(np.random.rand() + 1j * np.random.rand()) for i in range(5)
        ]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_list_float(self):
        x = [np.random.rand() for i in range(5)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)

    def test_list_float_complex(self):
        x = [np.random.rand() for i in range(5)] + [
            (np.random.rand() + 1j * np.random.rand()) for i in range(5)
        ]
        x_rec = self.encode_decode(x)
        assert np.allclose(x, x_rec)

    def test_dict_float(self):
        x = {"foo": 1.0, "bar": 2.0}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_complex(self):
        x = {"foo": 1.0 + 1.0j, "bar": 2.0 + 2.0j}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)
        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="complex value")

    def test_dict_numpy_float(self):
        x = {"foo": np.float32(1.0), "bar": np.float32(2.0)}
        x_rec = self.encode_decode(x)
        tm.assert_almost_equal(x, x_rec)

    def test_dict_numpy_complex(self):
        x = {"foo": np.complex128(1.0 + 1.0j), "bar": np.complex128(2.0 + 2.0j)}
        x_rec = self.encode_decode(x)
        tm.assert_dict_equal(x, x_rec)
        for key in x:
            tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128")

    def test_numpy_array_float(self):

        # run multiple times
        for n in range(10):
            x = np.random.rand(10)
            for dtype in ["float32", "float64"]:
                x = x.astype(dtype)
                x_rec = self.encode_decode(x)
                tm.assert_almost_equal(x, x_rec)

    def test_numpy_array_complex(self):
        x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128)
        x_rec = self.encode_decode(x)
        assert all(map(lambda x, y: x == y, x, x_rec)) and x.dtype == x_rec.dtype

    def test_list_mixed(self):
        x = [1.0, np.float32(3.5), np.complex128(4.25), "foo", np.bool_(1)]
        x_rec = self.encode_decode(x)
        # current msgpack cannot distinguish list/tuple
        tm.assert_almost_equal(tuple(x), x_rec)

        x_rec = self.encode_decode(tuple(x))
        tm.assert_almost_equal(tuple(x), x_rec)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestBasic(TestPackers):
    def test_timestamp(self):

        for i in [
            Timestamp("20130101"),
            Timestamp("20130101", tz="US/Eastern"),
            Timestamp("201301010501"),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_nat(self):
        nat_rec = self.encode_decode(NaT)
        assert NaT is nat_rec

    def test_datetimes(self):

        for i in [
            datetime.datetime(2013, 1, 1),
            datetime.datetime(2013, 1, 1, 5, 1),
            datetime.date(2013, 1, 1),
            np.datetime64(datetime.datetime(2013, 1, 5, 2, 15)),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_timedeltas(self):

        for i in [
            datetime.timedelta(days=1),
            datetime.timedelta(days=1, seconds=10),
            np.timedelta64(1000000),
        ]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_periods(self):
        # 13463
        for i in [Period("2010-09", "M"), Period("2014-Q1", "Q")]:
            i_rec = self.encode_decode(i)
            assert i == i_rec

    def test_intervals(self):
        # 19967
        for i in [Interval(0, 1), Interval(0, 1, "left"), Interval(10, 25.0, "right")]:
            i_rec = self.encode_decode(i)
            assert i == i_rec


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestIndex(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        self.d = {
            "string": tm.makeStringIndex(100),
            "date": tm.makeDateIndex(100),
            "int": tm.makeIntIndex(100),
            "rng": tm.makeRangeIndex(100),
            "float": tm.makeFloatIndex(100),
            "empty": Index([]),
            "tuple": Index(zip(["foo", "bar", "baz"], [1, 2, 3])),
            "period": Index(period_range("2012-1-1", freq="M", periods=3)),
            "date2": Index(date_range("2013-01-1", periods=10)),
            "bdate": Index(bdate_range("2013-01-02", periods=10)),
            "cat": tm.makeCategoricalIndex(100),
            "interval": tm.makeIntervalIndex(100),
            "timedelta": tm.makeTimedeltaIndex(100, "H"),
        }

        self.mi = {
            "reg": MultiIndex.from_tuples(
                [
                    ("bar", "one"),
                    ("baz", "two"),
                    ("foo", "two"),
                    ("qux", "one"),
                    ("qux", "two"),
                ],
                names=["first", "second"],
            )
        }

    def test_basic_index(self):

        for s, i in self.d.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

        # datetime with no freq (GH5506)
        i = Index([Timestamp("20130101"), Timestamp("20130103")])
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

        # datetime with timezone
        i = Index(
            [Timestamp("20130101 9:00:00"), Timestamp("20130103 11:00:00")]
        ).tz_localize("US/Eastern")
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_multi_index(self):

        for s, i in self.mi.items():
            i_rec = self.encode_decode(i)
            tm.assert_index_equal(i, i_rec)

    def test_unicode(self):
        i = tm.makeUnicodeIndex(100)
        i_rec = self.encode_decode(i)
        tm.assert_index_equal(i, i_rec)

    def test_categorical_index(self):
        # GH15487
        df = DataFrame(np.random.randn(10, 2))
        df = df.astype({0: "category"}).set_index(0)
        result = self.encode_decode(df)
        tm.assert_frame_equal(result, df)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestSeries(TestPackers):
    def setup_method(self, method):
        super().setup_method(method)

        self.d = {}

        s = tm.makeStringSeries()
        s.name = "string"
        self.d["string"] = s

        s = tm.makeObjectSeries()
        s.name = "object"
        self.d["object"] = s

        s = Series(iNaT, dtype="M8[ns]", index=range(5))
        self.d["date"] = s

        data = {
            "A": [0.0, 1.0, 2.0, 3.0, np.nan],
            "B": [0, 1, 0, 1, 0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": date_range("1/1/2009", periods=5),
            "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
Timestamp("20100101"), "foo", 2.0], "F": [Timestamp("20130102", tz="US/Eastern")] * 2 + [Timestamp("20130603", tz="CET")] * 3, "G": [Timestamp("20130102", tz="US/Eastern")] * 5, "H": Categorical([1, 2, 3, 4, 5]), "I": Categorical([1, 2, 3, 4, 5], ordered=True), "J": (np.bool_(1), 2, 3, 4, 5), } self.d["float"] = Series(data["A"]) self.d["int"] = Series(data["B"]) self.d["mixed"] = Series(data["E"]) self.d["dt_tz_mixed"] = Series(data["F"]) self.d["dt_tz"] = Series(data["G"]) self.d["cat_ordered"] = Series(data["H"]) self.d["cat_unordered"] = Series(data["I"]) self.d["numpy_bool_mixed"] = Series(data["J"]) def test_basic(self): # run multiple times here for n in range(10): for s, i in self.d.items(): i_rec = self.encode_decode(i) assert_series_equal(i, i_rec) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCategorical(TestPackers): def setup_method(self, method): super().setup_method(method) self.d = {} self.d["plain_str"] = Categorical(["a", "b", "c", "d", "e"]) self.d["plain_str_ordered"] = Categorical( ["a", "b", "c", "d", "e"], ordered=True ) self.d["plain_int"] = Categorical([5, 6, 7, 8]) self.d["plain_int_ordered"] = Categorical([5, 6, 7, 8], ordered=True) def test_basic(self): # run multiple times here for n in range(10): for s, i in self.d.items(): i_rec = self.encode_decode(i) assert_categorical_equal(i, i_rec) @pytest.mark.filterwarnings("ignore:msgpack:FutureWarning") class TestNDFrame(TestPackers): def setup_method(self, method): super().setup_method(method) data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], "B": [0, 1, 0, 1, 0], "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], "D": date_range("1/1/2009", periods=5), "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], "F": [Timestamp("20130102", tz="US/Eastern")] * 5, "G": [Timestamp("20130603", tz="CET")] * 5, "H": Categorical(["a", "b", "c", "d", "e"]), "I": Categorical(["a", "b", "c", "d", "e"], ordered=True), } self.frame = { "float": DataFrame(dict(A=data["A"], B=Series(data["A"]) + 1)), "int": DataFrame(dict(A=data["B"], B=Series(data["B"]) + 1)), "mixed": DataFrame(data), } def test_basic_frame(self): for s, i in self.frame.items(): i_rec = self.encode_decode(i) assert_frame_equal(i, i_rec) def test_multi(self): i_rec = self.encode_decode(self.frame) for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) packed_items = tuple( [self.frame["float"], self.frame["float"].A, self.frame["float"].B, None] ) l_rec = self.encode_decode(packed_items) check_arbitrary(packed_items, l_rec) # this is an oddity in that packed lists will be returned as tuples packed_items = [ self.frame["float"], self.frame["float"].A, self.frame["float"].B, None, ] l_rec = self.encode_decode(packed_items) assert isinstance(l_rec, tuple) check_arbitrary(packed_items, l_rec) def test_iterator(self): packed_items = [ self.frame["float"], self.frame["float"].A, self.frame["float"].B, None, ] with ensure_clean(self.path) as path: to_msgpack(path, *packed_items) for i, packed in enumerate(read_msgpack(path, iterator=True)): check_arbitrary(packed, packed_items[i]) def tests_datetimeindex_freq_issue(self): # GH 5947 # inferring freq on the datetimeindex df = DataFrame([1, 2, 3], index=date_range("1/1/2013", "1/3/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) df = DataFrame([1, 2], index=date_range("1/1/2013", "1/2/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) def test_dataframe_duplicate_column_names(self): # GH 9618 expected_1 = DataFrame(columns=["a", "a"]) expected_2 = 
        expected_2 = DataFrame(columns=[1] * 100)
        expected_2.loc[0] = np.random.randn(100)
        expected_3 = DataFrame(columns=[1, 1])
        expected_3.loc[0] = ["abc", np.nan]

        result_1 = self.encode_decode(expected_1)
        result_2 = self.encode_decode(expected_2)
        result_3 = self.encode_decode(expected_3)

        assert_frame_equal(result_1, expected_1)
        assert_frame_equal(result_2, expected_2)
        assert_frame_equal(result_3, expected_3)


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestSparse(TestPackers):
    def _check_roundtrip(self, obj, comparator, **kwargs):

        # currently these are not implemented
        # i_rec = self.encode_decode(obj)
        # comparator(obj, i_rec, **kwargs)
        msg = r"msgpack sparse (series|frame) is not implemented"
        with pytest.raises(NotImplementedError, match=msg):
            self.encode_decode(obj)

    def test_sparse_series(self):

        s = tm.makeStringSeries()
        s[3:5] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True)

        ss2 = s.to_sparse(kind="integer")
        self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True)

    def test_sparse_frame(self):

        s = tm.makeDataFrame()
        s.loc[3:5, 1:3] = np.nan
        s.loc[8:10, -2] = np.nan
        ss = s.to_sparse()
        self._check_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True)

        ss2 = s.to_sparse(kind="integer")
        self._check_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True)

        ss3 = s.to_sparse(fill_value=0)
        self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True)


@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning")
class TestCompression(TestPackers):
    """See https://github.com/pandas-dev/pandas/pull/9783
    """

    def setup_method(self, method):
        try:
            from sqlalchemy import create_engine

            self._create_sql_engine = create_engine
        except ImportError:
            self._SQLALCHEMY_INSTALLED = False
        else:
            self._SQLALCHEMY_INSTALLED = True

        super().setup_method(method)
        data = {
            "A": np.arange(1000, dtype=np.float64),
            "B": np.arange(1000, dtype=np.int32),
            "C": list(100 * "abcdefghij"),
            "D": date_range(datetime.datetime(2015, 4, 1), periods=1000),
            "E": [datetime.timedelta(days=x) for x in range(1000)],
        }
        self.frame = {
            "float": DataFrame({k: data[k] for k in ["A", "A"]}),
            "int": DataFrame({k: data[k] for k in ["B", "B"]}),
            "mixed": DataFrame(data),
        }

    def test_plain(self):
        i_rec = self.encode_decode(self.frame)
        for k in self.frame.keys():
            assert_frame_equal(self.frame[k], i_rec[k])

    def _test_compression(self, compress):
        i_rec = self.encode_decode(self.frame, compress=compress)
        for k in self.frame.keys():
            value = i_rec[k]
            expected = self.frame[k]
            assert_frame_equal(value, expected)
            # make sure that we can write to the new frames
            for block in value._data.blocks:
                assert block.values.flags.writeable

    def test_compression_zlib(self):
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        self._test_compression("zlib")

    def test_compression_blosc(self):
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        self._test_compression("blosc")

    def _test_compression_warns_when_decompress_caches(self, monkeypatch, compress):
        not_garbage = []
        control = []  # copied data

        compress_module = globals()[compress]
        real_decompress = compress_module.decompress

        def decompress(ob):
            """mock decompress function that delegates to the real
            decompress but caches the result and a copy of the result.
            """
            res = real_decompress(ob)
            not_garbage.append(res)  # hold a reference to this bytes object
            control.append(bytearray(res))  # copy the data here to check later
            return res

        # types mapped to values to add in place.
        rhs = {
            np.dtype("float64"): 1.0,
            np.dtype("int32"): 1,
            np.dtype("object"): "a",
            np.dtype("datetime64[ns]"): np.timedelta64(1, "ns"),
            np.dtype("timedelta64[ns]"): np.timedelta64(1, "ns"),
        }

        with monkeypatch.context() as m, tm.assert_produces_warning(
            PerformanceWarning
        ) as ws:
            m.setattr(compress_module, "decompress", decompress)
            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                i_rec = self.encode_decode(self.frame, compress=compress)
            for k in self.frame.keys():
                value = i_rec[k]
                expected = self.frame[k]
                assert_frame_equal(value, expected)
                # make sure that we can write to the new frames even though
                # we needed to copy the data
                for block in value._data.blocks:
                    assert block.values.flags.writeable
                    # mutate the data in some way
                    block.values[0] += rhs[block.dtype]

        for w in ws:
            # check the messages from our warnings
            assert str(w.message) == (
                "copying data after decompressing; "
                "this may mean that decompress is "
                "caching its result"
            )

        for buf, control_buf in zip(not_garbage, control):
            # make sure none of our mutations above affected the
            # original buffers
            assert buf == control_buf

    def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch):
        if not _ZLIB_INSTALLED:
            pytest.skip("no zlib")
        self._test_compression_warns_when_decompress_caches(monkeypatch, "zlib")

    def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch):
        if not _BLOSC_INSTALLED:
            pytest.skip("no blosc")
        self._test_compression_warns_when_decompress_caches(monkeypatch, "blosc")

    def _test_small_strings_no_warn(self, compress):
        empty = np.array([], dtype="uint8")
        with tm.assert_produces_warning(None):
            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                empty_unpacked = self.encode_decode(empty, compress=compress)

        tm.assert_numpy_array_equal(empty_unpacked, empty)
        assert empty_unpacked.flags.writeable

        char = np.array([ord(b"a")], dtype="uint8")
        with tm.assert_produces_warning(None):
            with catch_warnings():
                filterwarnings("ignore", category=FutureWarning)
                char_unpacked = self.encode_decode(char, compress=compress)

        tm.assert_numpy_array_equal(char_unpacked, char)
        assert char_unpacked.flags.writeable
        # if this test fails I am sorry because the interpreter is now in a
        # bad state where b'a' points to 98 == ord(b'b').
        char_unpacked[0] = ord(b"b")

        # we compare the ord of bytes b'a' with unicode 'a' because they should
        # always be the same (unless we were able to mutate the shared
        # character singleton, in which case ord(b'a') == ord(b'b')).
assert ord(b"a") == ord("a") tm.assert_numpy_array_equal(char_unpacked, np.array([ord(b"b")], dtype="uint8")) def test_small_strings_no_warn_zlib(self): if not _ZLIB_INSTALLED: pytest.skip("no zlib") self._test_small_strings_no_warn("zlib") def test_small_strings_no_warn_blosc(self): if not _BLOSC_INSTALLED: pytest.skip("no blosc") self._test_small_strings_no_warn("blosc") def test_readonly_axis_blosc(self): # GH11880 if not _BLOSC_INSTALLED: pytest.skip("no blosc") df1 = DataFrame({"A": list("abcd")}) df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) assert 1 in self.encode_decode(df1["A"], compress="blosc") assert 1.0 in self.encode_decode(df2["A"], compress="blosc") def test_readonly_axis_zlib(self): # GH11880 df1 = DataFrame({"A": list("abcd")}) df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) assert 1 in self.encode_decode(df1["A"], compress="zlib") assert 1.0 in self.encode_decode(df2["A"], compress="zlib") def test_readonly_axis_blosc_to_sql(self): # GH11880 if not _BLOSC_INSTALLED: pytest.skip("no blosc") if not self._SQLALCHEMY_INSTALLED: pytest.skip("no sqlalchemy") expected = DataFrame({"A": list("abcd")}) df = self.encode_decode(expected, compress="blosc") eng = self._create_sql_engine("sqlite:///:memory:") df.to_sql("test", eng, if_exists="append") result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) def test_readonly_axis_zlib_to_sql(self): # GH11880 if not _ZLIB_INSTALLED: pytest.skip("no zlib") if not self._SQLALCHEMY_INSTALLED: pytest.skip("no sqlalchemy") expected = DataFrame({"A": list("abcd")}) df = self.encode_decode(expected, compress="zlib") eng = self._create_sql_engine("sqlite:///:memory:") df.to_sql("test", eng, if_exists="append") result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestEncoding(TestPackers): def setup_method(self, method): super().setup_method(method) data = { "A": ["\u2019"] * 1000, "B": np.arange(1000, dtype=np.int32), "C": list(100 * "abcdefghij"), "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), "E": [datetime.timedelta(days=x) for x in range(1000)], "G": [400] * 1000, } self.frame = { "float": DataFrame({k: data[k] for k in ["A", "A"]}), "int": DataFrame({k: data[k] for k in ["B", "B"]}), "mixed": DataFrame(data), } self.utf_encodings = ["utf8", "utf16", "utf32"] def test_utf(self): # GH10581 for encoding in self.utf_encodings: for frame in self.frame.values(): result = self.encode_decode(frame, encoding=encoding) assert_frame_equal(result, frame) def test_default_encoding(self): for frame in self.frame.values(): result = frame.to_msgpack() expected = frame.to_msgpack(encoding="utf8") assert result == expected result = self.encode_decode(frame) assert_frame_equal(result, frame) files = glob.glob( os.path.join(os.path.dirname(__file__), "data", "legacy_msgpack", "*", "*.msgpack") ) @pytest.fixture(params=files) def legacy_packer(request, datapath): return datapath(request.param) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestMsgpack: """ How to add msgpack tests: 1. Install pandas version intended to output the msgpack. 2. Execute "generate_legacy_storage_files.py" to create the msgpack. $ python generate_legacy_storage_files.py msgpack 3. Move the created pickle to "data/legacy_msgpack/" directory. 
""" minimum_structure = { "series": ["float", "int", "mixed", "ts", "mi", "dup"], "frame": ["float", "int", "mixed", "mi"], "index": ["int", "date", "period"], "mi": ["reg2"], } def check_min_structure(self, data, version): for typ, v in self.minimum_structure.items(): assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: msg = '"{0}" not found in data["{1}"]'.format(kind, typ) assert kind in data[typ], msg def compare(self, current_data, all_data, vf, version): data = read_msgpack(vf) self.check_min_structure(data, version) for typ, dv in data.items(): assert typ in all_data, "unpacked data contains " 'extra key "{0}"'.format( typ ) for dt, result in dv.items(): assert ( dt in current_data[typ] ), 'data["{0}"] contains extra ' 'key "{1}"'.format(typ, dt) try: expected = current_data[typ][dt] except KeyError: continue # use a specific comparator # if available comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt) comparator = getattr(self, comp_method, None) if comparator is not None: comparator(result, expected, typ, version) else: check_arbitrary(result, expected) return data def compare_series_dt_tz(self, result, expected, typ, version): tm.assert_series_equal(result, expected) def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): tm.assert_frame_equal(result, expected) def test_msgpacks_legacy( self, current_packers_data, all_packers_data, legacy_packer, datapath ): version = os.path.basename(os.path.dirname(legacy_packer)) try: with catch_warnings(record=True): self.compare( current_packers_data, all_packers_data, legacy_packer, version ) except ImportError: # blosc not installed pass def test_msgpack_period_freq(self): # https://github.com/pandas-dev/pandas/issues/24135 s = Series(np.random.rand(5), index=date_range("20130101", periods=5)) r = read_msgpack(s.to_msgpack()) repr(r)