import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
class TestSparseGroupBy:
    """Check groupby on a sparse frame against the same groupby on its dense source."""

    def setup_method(self, method):
        """Build the dense fixture frame and its sparse conversion before each test."""
        columns = {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
            "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan],
        }
        self.dense = pd.DataFrame(columns)
        self.sparse = self.dense.to_sparse()

    def test_first_last_nth(self):
        """first / last / nth on the sparse groupby match the dense results."""
        # (method name, positional args) for every selection being compared.
        selections = (("first", ()), ("last", ()), ("nth", (1,)))

        by_a_sparse = self.sparse.groupby("A")
        by_a_dense = self.dense.groupby("A")

        # All sparse results are computed first, then all dense expectations,
        # and only afterwards are the pairs compared.
        actual = [getattr(by_a_sparse, name)(*args) for name, args in selections]
        wanted = [
            pd.DataFrame(getattr(by_a_dense, name)(*args).to_sparse())
            for name, args in selections
        ]

        for got, want in zip(actual, wanted):
            tm.assert_frame_equal(got, want)

    def test_aggfuncs(self):
        """mean and count on the sparse groupby match the dense results."""
        by_a_sparse = self.sparse.groupby("A")
        by_a_dense = self.dense.groupby("A")

        # TODO: sum is not compared here because the sparse sum includes
        # the str column.
        for agg_name in ("mean", "count"):
            result = getattr(by_a_sparse, agg_name)().to_sparse()
            expected = getattr(by_a_dense, agg_name)().to_sparse()
            tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
def test_groupby_includes_fill_value(fill_value):
    """Summing after grouping on a column that contains the fill value.

    The sparse result is compared with the dense groupby sum converted to
    sparse using the same ``fill_value``.
    """
    # https://github.com/pandas-dev/pandas/issues/5078
    dense = pd.DataFrame(
        {key: [fill_value, 1, fill_value, fill_value] for key in ("a", "b")}
    )
    sparse = dense.to_sparse(fill_value=fill_value)

    result = sparse.groupby("a").sum()

    dense_sum = dense.groupby("a").sum()
    expected = pd.DataFrame(dense_sum.to_sparse(fill_value=fill_value))

    tm.assert_frame_equal(result, expected, check_index_type=False)