from datetime import timedelta
import operator
from string import ascii_lowercase
import warnings

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    isna,
    notna,
    to_datetime,
    to_timedelta,
)
import pandas.core.algorithms as algorithms
import pandas.core.nanops as nanops
import pandas.util.testing as tm


def assert_stat_op_calc(
    opname,
    alternative,
    frame,
    has_skipna=True,
    check_dtype=True,
    check_dates=False,
    check_less_precise=False,
    skipna_alternative=None,
):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default False
        Whether opname should be tested on a Datetime Series
    check_less_precise : bool, default False
        Whether results should only be compared approximately;
        passed on to tm.assert_series_equal
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """
    f = getattr(frame, opname)

    if check_dates:
        df = DataFrame({"b": date_range("1/1/2001", periods=2)})
        result = getattr(df, opname)()
        assert isinstance(result, Series)

        df["a"] = range(len(df))
        result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:

        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(
            result0,
            frame.apply(wrapper),
            check_dtype=check_dtype,
            check_less_precise=check_less_precise,
        )
        # HACK: win32
        tm.assert_series_equal(
            result1,
            frame.apply(wrapper, axis=1),
            check_dtype=False,
            check_less_precise=check_less_precise,
        )
    else:
        skipna_wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(
        result0,
        frame.apply(skipna_wrapper),
        check_dtype=check_dtype,
        check_less_precise=check_less_precise,
    )

    if opname in ["sum", "prod"]:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(
            result1, expected, check_dtype=False, check_less_precise=check_less_precise
        )

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname in ["sum", "prod"]:
            unit = 1 if opname == "prod" else 0  # result for empty sum/prod
            expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)


def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False):
    """
    Check that API for operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    float_frame : DataFrame
        DataFrame with columns of type float
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_numeric_only : bool, default False
        Whether the method "opname" has the kwarg "numeric_only"
    """
    # make sure works on mixed-type frame
    getattr(float_string_frame, opname)(axis=0)
    getattr(float_string_frame, opname)(axis=1)

    if has_numeric_only:
        getattr(float_string_frame, opname)(axis=0, numeric_only=True)
        getattr(float_string_frame, opname)(axis=1, numeric_only=True)
        getattr(float_frame, opname)(axis=0, numeric_only=False)
        getattr(float_frame, opname)(axis=1, numeric_only=False)


def assert_bool_op_calc(opname, alternative, frame, has_skipna=True):
    """
    Check that bool operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    """
    f = getattr(frame, opname)

    if has_skipna:

        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(result0, frame.apply(wrapper))
        tm.assert_series_equal(
            result1, frame.apply(wrapper, axis=1), check_dtype=False
        )  # HACK: win32
    else:
        skipna_wrapper = alternative
        wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
    tm.assert_series_equal(
        result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
    )

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname == "any":
            assert not r0.any()
            assert not r1.any()
        else:
            assert r0.all()
            assert r1.all()


def assert_bool_op_api(
    opname, bool_frame_with_na, float_string_frame, has_bool_only=False
):
    """
    Check that API for boolean operator opname works as advertised on frame

    Parameters
    ----------
    opname : string
        Name of the operator to test on frame
    bool_frame_with_na : DataFrame
        DataFrame with columns of type bool and some NA entries
    float_string_frame : DataFrame
        DataFrame with both float and string columns
    has_bool_only : bool, default False
        Whether the method "opname" has the kwarg "bool_only"
    """
    # make sure op works on mixed-type frame
    mixed = float_string_frame
    mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5
    getattr(mixed, opname)(axis=0)
    getattr(mixed, opname)(axis=1)

    if has_bool_only:
        getattr(mixed, opname)(axis=0, bool_only=True)
        getattr(mixed, opname)(axis=1, bool_only=True)
        getattr(bool_frame_with_na, opname)(axis=0, bool_only=False)
        getattr(bool_frame_with_na, opname)(axis=1, bool_only=False)


class TestDataFrameAnalytics:

    # ---------------------------------------------------------------------
    # Correlation and covariance

    @td.skip_if_no_scipy
    def test_corr_pearson(self, float_frame):
        float_frame["A"][:5] = np.nan
        float_frame["B"][5:10] = np.nan

        self._check_method(float_frame, "pearson")

    @td.skip_if_no_scipy
    def test_corr_kendall(self, float_frame):
        float_frame["A"][:5] = np.nan
        float_frame["B"][5:10] = np.nan

        self._check_method(float_frame, "kendall")

    @td.skip_if_no_scipy
    def test_corr_spearman(self, float_frame):
        float_frame["A"][:5] = np.nan
        float_frame["B"][5:10] = np.nan

        self._check_method(float_frame, "spearman")

    def _check_method(self, frame, method="pearson"):
        correls = frame.corr(method=method)
        expected = frame["A"].corr(frame["C"], method=method)
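        # the full correlation matrix and the pairwise Series.corr result
        # should agree for the "A"/"C" column pair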
tm.assert_almost_equal(correls["A"]["C"], expected) @td.skip_if_no_scipy def test_corr_non_numeric(self, float_frame, float_string_frame): float_frame["A"][:5] = np.nan float_frame["B"][5:10] = np.nan # exclude non-numeric types result = float_string_frame.corr() expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) def test_corr_nooverlap(self, meth): # nothing in common df = DataFrame( { "A": [1, 1.5, 1, np.nan, np.nan, np.nan], "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], } ) rs = df.corr(meth) assert isna(rs.loc["A", "B"]) assert isna(rs.loc["B", "A"]) assert rs.loc["A", "A"] == 1 assert rs.loc["B", "B"] == 1 assert isna(rs.loc["C", "C"]) @td.skip_if_no_scipy @pytest.mark.parametrize("meth", ["pearson", "spearman"]) def test_corr_constant(self, meth): # constant --> all NA df = DataFrame( { "A": [1, 1, 1, np.nan, np.nan, np.nan], "B": [np.nan, np.nan, np.nan, 1, 1, 1], } ) rs = df.corr(meth) assert isna(rs.values).all() def test_corr_int(self): # dtypes other than float64 #1761 df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) df3.cov() df3.corr() @td.skip_if_no_scipy def test_corr_int_and_boolean(self): # when dtypes of pandas series are different # then ndarray will have dtype=object, # so it need to be properly handled df = DataFrame({"a": [True, False], "b": [1, 0]}) expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) for meth in ["pearson", "kendall", "spearman"]: with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) result = df.corr(meth) tm.assert_frame_equal(result, expected) def test_corr_cov_independent_index_column(self): # GH 14617 df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) for method in ["cov", "corr"]: result = getattr(df, method)() assert result.index is not result.columns assert result.index.equals(result.columns) def test_corr_invalid_method(self): # GH 22298 df = pd.DataFrame(np.random.normal(size=(10, 2))) msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " with pytest.raises(ValueError, match=msg): df.corr(method="____") def test_cov(self, float_frame, float_string_frame): # min_periods no NAs (corner case) expected = float_frame.cov() result = float_frame.cov(min_periods=len(float_frame)) tm.assert_frame_equal(expected, result) result = float_frame.cov(min_periods=len(float_frame) + 1) assert isna(result.values).all() # with NAs frame = float_frame.copy() frame["A"][:5] = np.nan frame["B"][5:10] = np.nan result = float_frame.cov(min_periods=len(float_frame) - 8) expected = float_frame.cov() expected.loc["A", "B"] = np.nan expected.loc["B", "A"] = np.nan # regular float_frame["A"][:5] = np.nan float_frame["B"][:10] = np.nan cov = float_frame.cov() tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) # exclude non-numeric types result = float_string_frame.cov() expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() tm.assert_frame_equal(result, expected) # Single column frame df = DataFrame(np.linspace(0.0, 1.0, 10)) result = df.cov() expected = DataFrame( np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns ) tm.assert_frame_equal(result, expected) df.loc[0] = np.nan result = df.cov() expected = DataFrame( np.cov(df.values[1:].T).reshape((1, 1)), index=df.columns, columns=df.columns, ) 
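        # after setting the first observation to NaN, the covariance should be
        # computed from the remaining rows only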
tm.assert_frame_equal(result, expected) def test_corrwith(self, datetime_frame): a = datetime_frame noise = Series(np.random.randn(len(a)), index=a.index) b = datetime_frame.add(noise, axis=0) # make sure order does not matter b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) del b["B"] colcorr = a.corrwith(b, axis=0) tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) rowcorr = a.corrwith(b, axis=1) tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) dropped = a.corrwith(b, axis=0, drop=True) tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) assert "B" not in dropped dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index # non time-series data index = ["a", "b", "c", "d", "e"] columns = ["one", "two", "three", "four"] df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) correls = df1.corrwith(df2, axis=1) for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) def test_corrwith_with_objects(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() cols = ["A", "B", "C", "D"] df1["obj"] = "foo" df2["obj"] = "bar" result = df1.corrwith(df2) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) result = df1.corrwith(df2, axis=1) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) tm.assert_series_equal(result, expected) def test_corrwith_series(self, datetime_frame): result = datetime_frame.corrwith(datetime_frame["A"]) expected = datetime_frame.apply(datetime_frame["A"].corr) tm.assert_series_equal(result, expected) def test_corrwith_matches_corrcoef(self): df1 = DataFrame(np.arange(10000), columns=["a"]) df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) c1 = df1.corrwith(df2)["a"] c2 = np.corrcoef(df1["a"], df2["a"])[0][1] tm.assert_almost_equal(c1, c2) assert c1 < 1 def test_corrwith_mixed_dtypes(self): # GH 18570 df = pd.DataFrame( {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} ) s = pd.Series([0, 6, 7, 3]) result = df.corrwith(s) corrs = [df["a"].corr(s), df["b"].corr(s)] expected = pd.Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) def test_corrwith_index_intersection(self): df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=True).index.sort_values() expected = df1.columns.intersection(df2.columns).sort_values() tm.assert_index_equal(result, expected) def test_corrwith_index_union(self): df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=False).index.sort_values() expected = df1.columns.union(df2.columns).sort_values() tm.assert_index_equal(result, expected) def test_corrwith_dup_cols(self): # GH 21925 df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) df2 = df1.copy() df2 = pd.concat((df2, df2[0]), axis=1) result = df1.corrwith(df2) expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_corrwith_spearman(self): # GH 21925 df = pd.DataFrame(np.random.random(size=(100, 3))) result = df.corrwith(df ** 2, method="spearman") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_corrwith_kendall(self): # GH 21925 df = 
pd.DataFrame(np.random.random(size=(100, 3))) result = df.corrwith(df ** 2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) # --------------------------------------------------------------------- # Describe def test_bool_describe_in_mixed_frame(self): df = DataFrame( { "string_data": ["a", "b", "c", "d", "e"], "bool_data": [True, True, False, False, False], "int_data": [10, 20, 30, 40, 50], } ) # Integer data are included in .describe() output, # Boolean and string data are not. result = df.describe() expected = DataFrame( {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) # Top value is a boolean value that is False result = df.describe(include=["bool"]) expected = DataFrame( {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] ) tm.assert_frame_equal(result, expected) def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 df = pd.DataFrame({"A": [None, None]}, dtype=object) result = df.describe() expected = pd.DataFrame( {"A": [0, 0, np.nan, np.nan]}, dtype=object, index=["count", "unique", "top", "freq"], ) tm.assert_frame_equal(result, expected) result = df.iloc[:0].describe() tm.assert_frame_equal(result, expected) def test_describe_bool_frame(self): # GH 13891 df = pd.DataFrame( { "bool_data_1": [False, False, True, True], "bool_data_2": [False, True, True, True], } ) result = df.describe() expected = DataFrame( {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, index=["count", "unique", "top", "freq"], ) tm.assert_frame_equal(result, expected) df = pd.DataFrame( { "bool_data": [False, False, True, True, False], "int_data": [0, 1, 2, 3, 4], } ) result = df.describe() expected = DataFrame( {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) tm.assert_frame_equal(result, expected) df = pd.DataFrame( {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} ) result = df.describe() expected = DataFrame( {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, index=["count", "unique", "top", "freq"], ) tm.assert_frame_equal(result, expected) def test_describe_categorical(self): df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) df["value_group"] = pd.cut( df.value, range(0, 10500, 500), right=False, labels=cat_labels ) cat = df # Categoricals should not show up together with numerical columns result = cat.describe() assert len(result.columns) == 1 # In a frame, describe() for the cat should be the same as for string # arrays (count, unique, top, freq) cat = Categorical( ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True ) s = Series(cat) result = s.describe() expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) cat = Series(Categorical(["a", "b", "c", "c"])) df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) result = df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) def test_describe_empty_categorical_column(self): # GH 26397 # Ensure the index of an an empty categorical DataFrame column # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": 
Categorical([])}) result = df.describe() expected = DataFrame( {"empty_col": [0, 0, np.nan, np.nan]}, index=["count", "unique", "top", "freq"], dtype="object", ) tm.assert_frame_equal(result, expected) # ensure NaN, not None assert np.isnan(result.iloc[2, 0]) assert np.isnan(result.iloc[3, 0]) def test_describe_categorical_columns(self): # GH 11558 columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") df = DataFrame( { "int1": [10, 20, 30, 40, 50], "int2": [10, 20, 30, 40, 50], "obj": ["A", 0, None, "X", 1], }, columns=columns, ) result = df.describe() exp_columns = pd.CategoricalIndex( ["int1", "int2"], categories=["int1", "int2", "obj"], ordered=True, name="XXX", ) expected = DataFrame( { "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], }, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], columns=exp_columns, ) tm.assert_frame_equal(result, expected) tm.assert_categorical_equal(result.columns.values, expected.columns.values) def test_describe_datetime_columns(self): columns = pd.DatetimeIndex( ["2011-01-01", "2011-02-01", "2011-03-01"], freq="MS", tz="US/Eastern", name="XXX", ) df = DataFrame( { 0: [10, 20, 30, 40, 50], 1: [10, 20, 30, 40, 50], 2: ["A", 0, None, "X", 1], } ) df.columns = columns result = df.describe() exp_columns = pd.DatetimeIndex( ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" ) expected = DataFrame( { 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], }, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) expected.columns = exp_columns tm.assert_frame_equal(result, expected) assert result.columns.freq == "MS" assert result.columns.tz == expected.columns.tz def test_describe_timedelta_values(self): # GH 6145 t1 = pd.timedelta_range("1 days", freq="D", periods=5) t2 = pd.timedelta_range("1 hours", freq="H", periods=5) df = pd.DataFrame({"t1": t1, "t2": t2}) expected = DataFrame( { "t1": [ 5, pd.Timedelta("3 days"), df.iloc[:, 0].std(), pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days"), pd.Timedelta("4 days"), pd.Timedelta("5 days"), ], "t2": [ 5, pd.Timedelta("3 hours"), df.iloc[:, 1].std(), pd.Timedelta("1 hours"), pd.Timedelta("2 hours"), pd.Timedelta("3 hours"), pd.Timedelta("4 hours"), pd.Timedelta("5 hours"), ], }, index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], ) result = df.describe() tm.assert_frame_equal(result, expected) exp_repr = ( " t1 t2\n" "count 5 5\n" "mean 3 days 00:00:00 0 days 03:00:00\n" "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" "min 1 days 00:00:00 0 days 01:00:00\n" "25% 2 days 00:00:00 0 days 02:00:00\n" "50% 3 days 00:00:00 0 days 03:00:00\n" "75% 4 days 00:00:00 0 days 04:00:00\n" "max 5 days 00:00:00 0 days 05:00:00" ) assert repr(result) == exp_repr def test_describe_tz_values(self, tz_naive_fixture): # GH 21332 tz = tz_naive_fixture s1 = Series(range(5)) start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s2 = Series(date_range(start, end, tz=tz)) df = pd.DataFrame({"s1": s1, "s2": s2}) expected = DataFrame( { "s1": [ 5, np.nan, np.nan, np.nan, np.nan, np.nan, 2, 1.581139, 0, 1, 2, 3, 4, ], "s2": [ 5, 5, s2.value_counts().index[0], 1, start.tz_localize(tz), end.tz_localize(tz), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, ], }, index=[ "count", "unique", "top", "freq", "first", "last", "mean", "std", "min", "25%", "50%", "75%", "max", ], ) result = df.describe(include="all") 
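        # describe(include="all") unions the categorical/datetime summary rows
        # with the numeric ones; entries that do not apply to a column are NaN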
        tm.assert_frame_equal(result, expected)

    def test_describe_percentiles_integer_idx(self):
        # Issue 26660
        df = pd.DataFrame({"x": [1]})
        pct = np.linspace(0, 1, 10 + 1)
        result = df.describe(percentiles=pct)

        expected = DataFrame(
            {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]},
            index=[
                "count",
                "mean",
                "std",
                "min",
                "0%",
                "10%",
                "20%",
                "30%",
                "40%",
                "50%",
                "60%",
                "70%",
                "80%",
                "90%",
                "100%",
                "max",
            ],
        )
        tm.assert_frame_equal(result, expected)

    # ---------------------------------------------------------------------
    # Reductions

    def test_stat_op_api(self, float_frame, float_string_frame):
        assert_stat_op_api(
            "count", float_frame, float_string_frame, has_numeric_only=True
        )
        assert_stat_op_api(
            "sum", float_frame, float_string_frame, has_numeric_only=True
        )

        assert_stat_op_api("nunique", float_frame, float_string_frame)
        assert_stat_op_api("mean", float_frame, float_string_frame)
        assert_stat_op_api("product", float_frame, float_string_frame)
        assert_stat_op_api("median", float_frame, float_string_frame)
        assert_stat_op_api("min", float_frame, float_string_frame)
        assert_stat_op_api("max", float_frame, float_string_frame)
        assert_stat_op_api("mad", float_frame, float_string_frame)
        assert_stat_op_api("var", float_frame, float_string_frame)
        assert_stat_op_api("std", float_frame, float_string_frame)
        assert_stat_op_api("sem", float_frame, float_string_frame)
        assert_stat_op_api("median", float_frame, float_string_frame)

        try:
            from scipy.stats import skew, kurtosis  # noqa:F401

            assert_stat_op_api("skew", float_frame, float_string_frame)
            assert_stat_op_api("kurt", float_frame, float_string_frame)
        except ImportError:
            pass

    def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
        def count(s):
            return notna(s).sum()

        def nunique(s):
            return len(algorithms.unique1d(s.dropna()))

        def mad(x):
            return np.abs(x - x.mean()).mean()

        def var(x):
            return np.var(x, ddof=1)

        def std(x):
            return np.std(x, ddof=1)

        def sem(x):
            return np.std(x, ddof=1) / np.sqrt(len(x))

        def skewness(x):
            from scipy.stats import skew  # noqa:F811

            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)

        def kurt(x):
            from scipy.stats import kurtosis  # noqa:F811

            if len(x) < 4:
                return np.nan
            return kurtosis(x, bias=False)

        assert_stat_op_calc(
            "nunique",
            nunique,
            float_frame_with_na,
            has_skipna=False,
            check_dtype=False,
            check_dates=True,
        )

        # mixed types (with upcasting happening)
        assert_stat_op_calc(
            "sum",
            np.sum,
            mixed_float_frame.astype("float32"),
            check_dtype=False,
            check_less_precise=True,
        )

        assert_stat_op_calc(
            "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
        )
        assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
        assert_stat_op_calc("product", np.prod, float_frame_with_na)

        assert_stat_op_calc("mad", mad, float_frame_with_na)
        assert_stat_op_calc("var", var, float_frame_with_na)
        assert_stat_op_calc("std", std, float_frame_with_na)
        assert_stat_op_calc("sem", sem, float_frame_with_na)

        assert_stat_op_calc(
            "count",
            count,
            float_frame_with_na,
            has_skipna=False,
            check_dtype=False,
            check_dates=True,
        )

        try:
            from scipy.stats import skew, kurtosis  # noqa:F401

            assert_stat_op_calc("skew", skewness, float_frame_with_na)
            assert_stat_op_calc("kurt", kurt, float_frame_with_na)
        except ImportError:
            pass

    # TODO: Ensure warning isn't emitted in the first place
    @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning")
    def test_median(self, float_frame_with_na, int_frame):
        def wrapper(x):
            if isna(x).any():
                return np.nan
            return np.median(x)

        assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
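        # the median of an integer frame comes back as float, so the dtype
        # check is relaxed for int_frame below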
assert_stat_op_calc( "median", wrapper, int_frame, check_dtype=False, check_dates=True ) @pytest.mark.parametrize( "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] ) def test_stat_operators_attempt_obj_array(self, method): # GH#676 data = { "a": [ -0.00049987540199591344, -0.0016467257772919831, 0.00067695870775883013, ], "b": [-0, -0, 0.0], "c": [ 0.00031111847529610595, 0.0014902627951905339, -0.00094099200035979691, ], } df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O") df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) for df in [df1, df2]: assert df.values.dtype == np.object_ result = getattr(df, method)(1) expected = getattr(df.astype("f8"), method)(1) if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 df = DataFrame( { "int": [1, 2, 3, 4], "float": [1.0, 2.0, 3.0, 4.0], "str": ["a", "b", "c", "d"], } ) result = getattr(df, op)() assert len(result) == 2 with pd.option_context("use_bottleneck", False): result = getattr(df, op)() assert len(result) == 2 def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( { "bool_data": [True, True, False, False, False], "int_data": [10, 20, 30, 40, 50], "string_data": ["a", "b", "c", "d", "e"], } ) df.reindex(columns=["bool_data", "int_data", "string_data"]) test = df.sum(axis=0) tm.assert_numpy_array_equal( test.values, np.array([2, 150, "abcde"], dtype=object) ) tm.assert_series_equal(test, df.T.sum(axis=1)) def test_nunique(self): df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]}) tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) tm.assert_series_equal( df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) ) tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) tm.assert_series_equal( df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) ) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() expected = pd.Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_excludeds_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. 
df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() expected = pd.Series() tm.assert_series_equal(result, expected) def test_var_std(self, datetime_frame): result = datetime_frame.std(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4)) tm.assert_almost_equal(result, expected) result = datetime_frame.var(ddof=4) expected = datetime_frame.apply(lambda x: x.var(ddof=4)) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() with pd.option_context("use_bottleneck", False): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() @pytest.mark.parametrize("meth", ["sem", "var", "std"]) def test_numeric_only_flag(self, meth): # GH 9201 df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a number in str format df1.loc[0, "foo"] = "100" df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a non-number str df2.loc[0, "foo"] = "a" result = getattr(df1, meth)(axis=1, numeric_only=True) expected = getattr(df1[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) result = getattr(df2, meth)(axis=1, numeric_only=True) expected = getattr(df2[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" with pytest.raises(TypeError, match=msg): getattr(df1, meth)(axis=1, numeric_only=False) msg = "could not convert string to float: 'a'" with pytest.raises(TypeError, match=msg): getattr(df2, meth)(axis=1, numeric_only=False) def test_sem(self, datetime_frame): result = datetime_frame.sem(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nansem(arr, axis=0) assert not (result < 0).any() with pd.option_context("use_bottleneck", False): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() @td.skip_if_no_scipy def test_kurt(self): index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) df = DataFrame(np.random.randn(6, 3), index=index) kurt = df.kurt() kurt2 = df.kurt(level=0).xs("bar") tm.assert_series_equal(kurt, kurt2, check_names=False) assert kurt.name is None assert kurt2.name == "bar" @pytest.mark.parametrize( "dropna, expected", [ ( True, { "A": [12], "B": [10.0], "C": [1.0], "D": ["a"], "E": Categorical(["a"], categories=["a"]), "F": to_datetime(["2000-1-2"]), "G": to_timedelta(["1 days"]), }, ), ( False, { "A": [12], "B": [10.0], "C": [np.nan], "D": np.array([np.nan], dtype=object), "E": Categorical([np.nan], categories=["a"]), "F": [pd.NaT], "G": to_timedelta([pd.NaT]), }, ), ( True, { "H": [8, 9, np.nan, np.nan], "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), "M": to_timedelta(["1 days", "nan", "nan", "nan"]), "N": [0, 1, 2, 3], }, ), ( False, { "H": [8, 9, np.nan, np.nan], "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), "M": to_timedelta(["nan", "1 days", "nan", "nan"]), "N": [0, 1, 2, 3], }, ), ], ) def test_mode_dropna(self, dropna, 
expected):
        df = DataFrame(
            {
                "A": [12, 12, 19, 11],
                "B": [10, 10, np.nan, 3],
                "C": [1, np.nan, np.nan, np.nan],
                "D": [np.nan, np.nan, "a", np.nan],
                "E": Categorical([np.nan, np.nan, "a", np.nan]),
                "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
                "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
                "H": [8, 8, 9, 9],
                "I": [9, 9, 8, 8],
                "J": [1, 1, np.nan, np.nan],
                "K": Categorical(["a", np.nan, "a", np.nan]),
                "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
                "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
                "N": np.arange(4, dtype="int64"),
            }
        )

        result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
        expected = DataFrame(expected)
        tm.assert_frame_equal(result, expected)

    def test_mode_sortwarning(self):
        # Check for the warning that is raised when the mode
        # results cannot be sorted
        df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
        expected = DataFrame({"A": ["a", np.nan]})

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            result = df.mode(dropna=False)
            result = result.sort_values(by="A").reset_index(drop=True)

        tm.assert_frame_equal(result, expected)

    def test_operators_timedelta64(self):
        df = DataFrame(
            dict(
                A=date_range("2012-1-1", periods=3, freq="D"),
                B=date_range("2012-1-2", periods=3, freq="D"),
                C=Timestamp("20120101") - timedelta(minutes=5, seconds=5),
            )
        )

        diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"]))

        # min
        result = diffs.min()
        assert result[0] == diffs.loc[0, "A"]
        assert result[1] == diffs.loc[0, "B"]

        result = diffs.min(axis=1)
        assert (result == diffs.loc[0, "B"]).all()

        # max
        result = diffs.max()
        assert result[0] == diffs.loc[2, "A"]
        assert result[1] == diffs.loc[2, "B"]

        result = diffs.max(axis=1)
        assert (result == diffs["A"]).all()

        # abs
        result = diffs.abs()
        result2 = abs(diffs)
        expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"]))
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # mixed frame
        mixed = diffs.copy()
        mixed["C"] = "foo"
        mixed["D"] = 1
        mixed["E"] = 1.0
        mixed["F"] = Timestamp("20130101")

        # results in an object array
        result = mixed.min()
        expected = Series(
            [
                pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
                pd.Timedelta(timedelta(days=-1)),
                "foo",
                1,
                1.0,
                Timestamp("20130101"),
            ],
            index=mixed.columns,
        )
        tm.assert_series_equal(result, expected)

        # excludes numeric
        result = mixed.min(axis=1)
        expected = Series([1, 1, 1.0], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        # works when only those columns are selected
        result = mixed[["A", "B"]].min(1)
        expected = Series([timedelta(days=-1)] * 3)
        tm.assert_series_equal(result, expected)

        result = mixed[["A", "B"]].min()
        expected = Series(
            [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
        )
        tm.assert_series_equal(result, expected)

        # GH 3106
        df = DataFrame(
            {
                "time": date_range("20130102", periods=5),
                "time2": date_range("20130105", periods=5),
            }
        )
        df["off1"] = df["time2"] - df["time"]
        assert df["off1"].dtype == "timedelta64[ns]"

        df["off2"] = df["time"] - df["time2"]
        df._consolidate_inplace()
        assert df["off1"].dtype == "timedelta64[ns]"
        assert df["off2"].dtype == "timedelta64[ns]"

    def test_sum_corner(self):
        empty_frame = DataFrame()

        axis0 = empty_frame.sum(0)
        axis1 = empty_frame.sum(1)
        assert isinstance(axis0, Series)
        assert isinstance(axis1, Series)
        assert len(axis0) == 0
        assert len(axis1) == 0

    @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
    def test_sum_prod_nanops(self, method, unit):
        idx = ["a", "b", "c"]
        df = pd.DataFrame(
            {"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}
        )

        # The default
        result = getattr(df, method)()
        expected = pd.Series([unit, unit, unit], index=idx, dtype="float64")
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = getattr(df, method)(min_count=1)
        expected = pd.Series([unit, unit, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = getattr(df, method)(min_count=0)
        expected = pd.Series([unit, unit, unit], index=idx, dtype="float64")
        tm.assert_series_equal(result, expected)

        result = getattr(df.iloc[1:], method)(min_count=1)
        expected = pd.Series([unit, np.nan, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count > 1
        df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
        result = getattr(df, method)(min_count=5)
        expected = pd.Series([unit, unit], index=["A", "B"], dtype="float64")
        tm.assert_series_equal(result, expected)

        result = getattr(df, method)(min_count=6)
        expected = pd.Series([unit, np.nan], index=["A", "B"])
        tm.assert_series_equal(result, expected)

    def test_sum_nanops_timedelta(self):
        # prod isn't defined on timedeltas
        idx = ["a", "b", "c"]
        df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})

        df2 = df.apply(pd.to_timedelta)

        # 0 by default
        result = df2.sum()
        expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = df2.sum(min_count=0)
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = df2.sum(min_count=1)
        expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
        tm.assert_series_equal(result, expected)

    def test_sum_object(self, float_frame):
        values = float_frame.values.astype(int)
        frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
        deltas = frame * timedelta(1)
        deltas.sum()

    def test_sum_bool(self, float_frame):
        # ensure this works, bug report
        bools = np.isnan(float_frame)
        bools.sum(1)
        bools.sum(0)

    def test_mean_corner(self, float_frame, float_string_frame):
        # unit test when have object data
        the_mean = float_string_frame.mean(axis=0)
        the_sum = float_string_frame.sum(axis=0, numeric_only=True)
        tm.assert_index_equal(the_sum.index, the_mean.index)
        assert len(the_mean.index) < len(float_string_frame.columns)

        # xs sum mixed type, just want to know it works...
the_mean = float_string_frame.mean(axis=1) the_sum = float_string_frame.sum(axis=1, numeric_only=True) tm.assert_index_equal(the_sum.index, the_mean.index) # take mean of boolean column float_frame["bool"] = float_frame["A"] > 0 means = float_frame.mean(0) assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True df = pd.DataFrame( { "A": np.arange(3), "B": pd.date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), "D": pd.period_range("2016", periods=3, freq="A"), } ) result = df.mean(numeric_only=True) expected = pd.Series({"A": 1.0}) tm.assert_series_equal(result, expected) result = df.mean() expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) @pytest.mark.xfail( reason="casts to object-dtype and then tries to add timestamps", raises=TypeError, strict=True, ) def test_mean_datetimelike_numeric_only_false(self): df = pd.DataFrame( { "A": np.arange(3), "B": pd.date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), "D": pd.period_range("2016", periods=3, freq="A"), } ) result = df.mean(numeric_only=False) expected = pd.Series( {"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"], "D": df.loc[1, "D"]} ) tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): # don't blow up float_string_frame.std(1) float_string_frame.var(1) float_string_frame.mean(1) float_string_frame.skew(1) def test_sum_bools(self): df = DataFrame(index=range(1), columns=range(10)) bools = isna(df) assert bools.sum(axis=1)[0] == 10 # --------------------------------------------------------------------- # Cumulative Reductions - cumsum, cummax, ... 
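    # Illustrative sketch, not part of the original suite: the test name and
    # data below are additions meant only to show that cumulative reductions
    # skip NaN by default, so a leading NaN stays NaN while the running total
    # continues over the remaining values.
    def test_cumsum_skipna_sketch(self):
        df = DataFrame({"A": [np.nan, 1.0, 2.0], "B": [1.0, 2.0, 3.0]})
        result = df.cumsum()
        expected = DataFrame({"A": [np.nan, 1.0, 3.0], "B": [1.0, 3.0, 6.0]})
        tm.assert_frame_equal(result, expected)
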
def test_cumsum_corner(self): dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) # ?(wesm) result = dm.cumsum() # noqa def test_cumsum(self, datetime_frame): datetime_frame.loc[5:10, 0] = np.nan datetime_frame.loc[10:15, 1] = np.nan datetime_frame.loc[15:, 2] = np.nan # axis = 0 cumsum = datetime_frame.cumsum() expected = datetime_frame.apply(Series.cumsum) tm.assert_frame_equal(cumsum, expected) # axis = 1 cumsum = datetime_frame.cumsum(axis=1) expected = datetime_frame.apply(Series.cumsum, axis=1) tm.assert_frame_equal(cumsum, expected) # works df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cumsum() # noqa # fix issue cumsum_xs = datetime_frame.cumsum(axis=1) assert np.shape(cumsum_xs) == np.shape(datetime_frame) def test_cumprod(self, datetime_frame): datetime_frame.loc[5:10, 0] = np.nan datetime_frame.loc[10:15, 1] = np.nan datetime_frame.loc[15:, 2] = np.nan # axis = 0 cumprod = datetime_frame.cumprod() expected = datetime_frame.apply(Series.cumprod) tm.assert_frame_equal(cumprod, expected) # axis = 1 cumprod = datetime_frame.cumprod(axis=1) expected = datetime_frame.apply(Series.cumprod, axis=1) tm.assert_frame_equal(cumprod, expected) # fix issue cumprod_xs = datetime_frame.cumprod(axis=1) assert np.shape(cumprod_xs) == np.shape(datetime_frame) # ints df = datetime_frame.fillna(0).astype(int) df.cumprod(0) df.cumprod(1) # ints32 df = datetime_frame.fillna(0).astype(np.int32) df.cumprod(0) df.cumprod(1) def test_cummin(self, datetime_frame): datetime_frame.loc[5:10, 0] = np.nan datetime_frame.loc[10:15, 1] = np.nan datetime_frame.loc[15:, 2] = np.nan # axis = 0 cummin = datetime_frame.cummin() expected = datetime_frame.apply(Series.cummin) tm.assert_frame_equal(cummin, expected) # axis = 1 cummin = datetime_frame.cummin(axis=1) expected = datetime_frame.apply(Series.cummin, axis=1) tm.assert_frame_equal(cummin, expected) # it works df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummin() # noqa # fix issue cummin_xs = datetime_frame.cummin(axis=1) assert np.shape(cummin_xs) == np.shape(datetime_frame) def test_cummax(self, datetime_frame): datetime_frame.loc[5:10, 0] = np.nan datetime_frame.loc[10:15, 1] = np.nan datetime_frame.loc[15:, 2] = np.nan # axis = 0 cummax = datetime_frame.cummax() expected = datetime_frame.apply(Series.cummax) tm.assert_frame_equal(cummax, expected) # axis = 1 cummax = datetime_frame.cummax(axis=1) expected = datetime_frame.apply(Series.cummax, axis=1) tm.assert_frame_equal(cummax, expected) # it works df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummax() # noqa # fix issue cummax_xs = datetime_frame.cummax(axis=1) assert np.shape(cummax_xs) == np.shape(datetime_frame) # --------------------------------------------------------------------- # Miscellanea def test_count(self): # corner case frame = DataFrame() ct1 = frame.count(1) assert isinstance(ct1, Series) ct2 = frame.count(0) assert isinstance(ct2, Series) # GH#423 df = DataFrame(index=range(10)) result = df.count(1) expected = Series(0, index=df.index) tm.assert_series_equal(result, expected) df = DataFrame(columns=range(10)) result = df.count(0) expected = Series(0, index=df.columns) tm.assert_series_equal(result, expected) df = DataFrame() result = df.count() expected = Series(0, index=[]) tm.assert_series_equal(result, expected) def test_count_objects(self, float_string_frame): dm = DataFrame(float_string_frame._series) df = DataFrame(float_string_frame._series) tm.assert_series_equal(dm.count(), 
df.count()) tm.assert_series_equal(dm.count(1), df.count(1)) def test_pct_change(self): # GH#11150 pnl = DataFrame( [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] ).astype(np.float64) pnl.iat[1, 0] = np.nan pnl.iat[1, 1] = np.nan pnl.iat[2, 3] = 60 for axis in range(2): expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 result = pnl.pct_change(axis=axis, fill_method="pad") tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- # Index of max / min def test_idxmin(self, float_frame, int_frame): frame = float_frame frame.loc[5:10] = np.nan frame.loc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmin(axis=axis, skipna=skipna) expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) def test_idxmax(self, float_frame, int_frame): frame = float_frame frame.loc[5:10] = np.nan frame.loc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmax(axis=axis, skipna=skipna) expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) # ---------------------------------------------------------------------- # Logical reductions @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all(self, opname, bool_frame_with_na, float_string_frame): assert_bool_op_calc( opname, getattr(np, opname), bool_frame_with_na, has_skipna=True ) assert_bool_op_api( opname, bool_frame_with_na, float_string_frame, has_bool_only=True ) def test_any_all_extra(self): df = DataFrame( { "A": [True, False, False], "B": [True, True, False], "C": [True, True, True], }, index=["a", "b", "c"], ) result = df[["A", "B"]].any(1) expected = Series([True, True, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df[["A", "B"]].any(1, bool_only=True) tm.assert_series_equal(result, expected) result = df.all(1) expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df.all(1, bool_only=True) tm.assert_series_equal(result, expected) # Axis is None result = df.all(axis=None).item() assert result is False result = df.any(axis=None).item() assert result is True result = df[["C"]].all(axis=None).item() assert result is True def test_any_datetime(self): # GH 23070 float_data = [1, np.nan, 3, np.nan] datetime_data = [ pd.Timestamp("1960-02-15"), pd.Timestamp("1960-02-16"), pd.NaT, pd.NaT, ] df = DataFrame({"A": float_data, "B": datetime_data}) result = df.any(1) expected = Series([True, True, True, False]) tm.assert_series_equal(result, expected) def test_any_all_bool_only(self): # GH 25101 df = DataFrame( {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} ) result = df.all(bool_only=True) expected = Series(dtype=np.bool) tm.assert_series_equal(result, expected) df = DataFrame( { "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None], "col4": [False, False, True], } ) result = df.all(bool_only=True) expected = Series({"col4": False}) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "func, data, expected", [ (np.any, {}, False), (np.all, {}, True), (np.any, {"A": []}, False), (np.all, {"A": []}, 
True), (np.any, {"A": [False, False]}, False), (np.all, {"A": [False, False]}, False), (np.any, {"A": [True, False]}, True), (np.all, {"A": [True, False]}, False), (np.any, {"A": [True, True]}, True), (np.all, {"A": [True, True]}, True), (np.any, {"A": [False], "B": [False]}, False), (np.all, {"A": [False], "B": [False]}, False), (np.any, {"A": [False, False], "B": [False, True]}, True), (np.all, {"A": [False, False], "B": [False, True]}, False), # other types (np.all, {"A": pd.Series([0.0, 1.0], dtype="float")}, False), (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), pytest.param( np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), pytest.param( np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True, marks=[td.skip_if_np_lt("1.15")], ), (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), # # Mix # GH 21484 # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), ], ) def test_any_all_np_func(self, func, data, expected): # GH 19976 data = DataFrame(data) result = func(data) assert isinstance(result, np.bool_) assert result.item() is expected # method version result = getattr(DataFrame(data), func.__name__)(axis=None) assert isinstance(result, np.bool_) assert result.item() is expected def test_any_all_object(self): # GH 19976 result = np.all(DataFrame(columns=["a", "b"])).item() assert result is True result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): df = DataFrame( {"A": 1}, index=MultiIndex.from_product( [["A", "B"], ["a", "b"]], names=["out", "in"] ), ) xpr = "Must specify 'axis' when aggregating by level." 
with pytest.raises(ValueError, match=xpr): getattr(df, method)(axis=None, level="out") # ---------------------------------------------------------------------- # Isin def test_isin(self): # GH 4211 df = DataFrame( { "vals": [1, 2, 3, 4], "ids": ["a", "b", "f", "n"], "ids2": ["a", "n", "c", "n"], }, index=["foo", "bar", "baz", "qux"], ) other = ["a", "b", "c"] result = df.isin(other) expected = DataFrame([df.loc[s].isin(other) for s in df.index]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) def test_isin_empty(self, empty): # GH 16991 df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) expected = DataFrame(False, df.index, df.columns) result = df.isin(empty) tm.assert_frame_equal(result, expected) def test_isin_dict(self): df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) d = {"A": ["a"]} expected = DataFrame(False, df.index, df.columns) expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) # non unique columns df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) df.columns = ["A", "A"] expected = DataFrame(False, df.index, df.columns) expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) def test_isin_with_string_scalar(self): # GH 4763 df = DataFrame( { "vals": [1, 2, 3, 4], "ids": ["a", "b", "f", "n"], "ids2": ["a", "n", "c", "n"], }, index=["foo", "bar", "baz", "qux"], ) with pytest.raises(TypeError): df.isin("a") with pytest.raises(TypeError): df.isin("aaa") def test_isin_df(self): df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) expected = DataFrame(False, df1.index, df1.columns) result = df1.isin(df2) expected["A"].loc[[1, 3]] = True expected["B"].loc[[0, 2]] = True tm.assert_frame_equal(result, expected) # partial overlapping columns df2.columns = ["A", "C"] result = df1.isin(df2) expected["B"] = False tm.assert_frame_equal(result, expected) def test_isin_tuples(self): # GH 16394 df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) df["C"] = list(zip(df["A"], df["B"])) result = df["C"].isin([(1, "a")]) tm.assert_series_equal(result, Series([True, False, False], name="C")) def test_isin_df_dupe_values(self): df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) # just cols duped df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) with pytest.raises(ValueError): df1.isin(df2) # just index duped df2 = DataFrame( [[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["A", "B"], index=[0, 0, 1, 1], ) with pytest.raises(ValueError): df1.isin(df2) # cols and index: df2.columns = ["B", "B"] with pytest.raises(ValueError): df1.isin(df2) def test_isin_dupe_self(self): other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) result = df.isin(other) expected = DataFrame(False, index=df.index, columns=df.columns) expected.loc[0] = True expected.iloc[1, 1] = True tm.assert_frame_equal(result, expected) def test_isin_against_series(self): df = pd.DataFrame( {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] ) s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) expected["A"].loc["a"] = True expected.loc["d"] = True result = df.isin(s) tm.assert_frame_equal(result, expected) def test_isin_multiIndex(self): idx = MultiIndex.from_tuples( [ (0, "a", "foo"), (0, "a", "bar"), (0, "b", "bar"), (0, "b", 
"baz"), (2, "a", "foo"), (2, "a", "bar"), (2, "c", "bar"), (2, "c", "baz"), (1, "b", "foo"), (1, "b", "bar"), (1, "c", "bar"), (1, "c", "baz"), ] ) df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) df2 = DataFrame( { "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], } ) # against regular index expected = DataFrame(False, index=df1.index, columns=df1.columns) result = df1.isin(df2) tm.assert_frame_equal(result, expected) df2.index = idx expected = df2.values.astype(np.bool) expected[:, 1] = ~expected[:, 1] expected = DataFrame(expected, columns=["A", "B"], index=idx) result = df1.isin(df2) tm.assert_frame_equal(result, expected) def test_isin_empty_datetimelike(self): # GH 15473 df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) df2 = DataFrame({"date": []}) df3 = DataFrame() expected = DataFrame({"date": [False, False]}) result = df1_ts.isin(df2) tm.assert_frame_equal(result, expected) result = df1_ts.isin(df3) tm.assert_frame_equal(result, expected) result = df1_td.isin(df2) tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) # --------------------------------------------------------------------- # Rounding def test_round(self): # GH 2665 # Test that rounding an empty DataFrame does nothing df = DataFrame() tm.assert_frame_equal(df, df.round()) # Here's the test frame we'll be working with df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. decimals=0) expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) tm.assert_frame_equal(df.round(), expected_rounded) # Round with an integer decimals = 2 expected_rounded = DataFrame( {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} ) tm.assert_frame_equal(df.round(decimals), expected_rounded) # This should also work with np.round (since np.round dispatches to # df.round) tm.assert_frame_equal(np.round(df, decimals), expected_rounded) # Round with a list round_list = [1, 2] with pytest.raises(TypeError): df.round(round_list) # Round with a dictionary expected_rounded = DataFrame( {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} ) round_dict = {"col1": 1, "col2": 2} tm.assert_frame_equal(df.round(round_dict), expected_rounded) # Incomplete dict expected_partially_rounded = DataFrame( {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} ) partial_round_dict = {"col2": 1} tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) # Dict with unknown elements wrong_round_dict = {"col3": 2, "col2": 1} tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) # float input to `decimals` non_int_round_dict = {"col1": 1, "col2": 0.5} with pytest.raises(TypeError): df.round(non_int_round_dict) # String input non_int_round_dict = {"col1": 1, "col2": "foo"} with pytest.raises(TypeError): df.round(non_int_round_dict) non_int_round_Series = Series(non_int_round_dict) with pytest.raises(TypeError): df.round(non_int_round_Series) # List input non_int_round_dict = {"col1": 1, "col2": [1, 2]} with pytest.raises(TypeError): df.round(non_int_round_dict) non_int_round_Series = Series(non_int_round_dict) with pytest.raises(TypeError): df.round(non_int_round_Series) # Non integer Series inputs non_int_round_Series = Series(non_int_round_dict) with pytest.raises(TypeError): df.round(non_int_round_Series) non_int_round_Series = 
Series(non_int_round_dict) with pytest.raises(TypeError): df.round(non_int_round_Series) # Negative numbers negative_round_dict = {"col1": -1, "col2": -2} big_df = df * 100 expected_neg_rounded = DataFrame( {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} ) tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) # nan in Series round nan_round_Series = Series({"col1": np.nan, "col2": 1}) # TODO(wesm): unused? expected_nan_round = DataFrame( # noqa {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} ) with pytest.raises(TypeError): df.round(nan_round_Series) # Make sure this doesn't break existing Series.round tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) # named columns # GH 11986 decimals = 2 expected_rounded = DataFrame( {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} ) df.columns.name = "cols" expected_rounded.columns.name = "cols" tm.assert_frame_equal(df.round(decimals), expected_rounded) # interaction of named columns & series tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) def test_numpy_round(self): # GH 12600 df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) out = np.round(df, decimals=0) expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) tm.assert_frame_equal(out, expected) msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.round(df, decimals=0, out=df) def test_numpy_round_nan(self): # See gh-14197 df = Series([1.53, np.nan, 0.06]).to_frame() with tm.assert_produces_warning(None): result = df.round() expected = Series([2.0, np.nan, 0.0]).to_frame() tm.assert_frame_equal(result, expected) def test_round_mixed_type(self): # GH 11885 df = DataFrame( { "col1": [1.1, 2.2, 3.3, 4.4], "col2": ["1", "a", "c", "f"], "col3": date_range("20111111", periods=4), } ) round_0 = DataFrame( { "col1": [1.0, 2.0, 3.0, 4.0], "col2": ["1", "a", "c", "f"], "col3": date_range("20111111", periods=4), } ) tm.assert_frame_equal(df.round(), round_0) tm.assert_frame_equal(df.round(1), df) tm.assert_frame_equal(df.round({"col1": 1}), df) tm.assert_frame_equal(df.round({"col1": 0}), round_0) tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) tm.assert_frame_equal(df.round({"col3": 1}), df) def test_round_issue(self): # GH 11611 df = pd.DataFrame( np.random.random([3, 3]), columns=["A", "B", "C"], index=["first", "second", "third"], ) dfs = pd.concat((df, df), axis=1) rounded = dfs.round() tm.assert_index_equal(rounded.index, dfs.index) decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) msg = "Index of decimals must be unique" with pytest.raises(ValueError, match=msg): df.round(decimals) def test_built_in_round(self): # GH 11763 # Here's the test frame we'll be working with df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. 
    def test_built_in_round(self):
        # GH 11763
        # Here's the test frame we'll be working with
        df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]})

        # Default round to integer (i.e. decimals=0)
        expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(round(df), expected_rounded)

    def test_round_nonunique_categorical(self):
        # See GH21809
        idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3)
        df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc"))

        expected = df.round(3)
        expected.index = idx

        df_categorical = df.copy().set_index(idx)
        assert df_categorical.shape == (6, 3)
        result = df_categorical.round(3)
        assert result.shape == (6, 3)

        tm.assert_frame_equal(result, expected)

    # ---------------------------------------------------------------------
    # Clip

    def test_clip(self, float_frame):
        median = float_frame.median().median()
        original = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            capped = float_frame.clip_upper(median)
        assert not (capped.values > median).any()

        with tm.assert_produces_warning(FutureWarning):
            floored = float_frame.clip_lower(median)
        assert not (floored.values < median).any()

        double = float_frame.clip(upper=median, lower=median)
        assert not (double.values != median).any()

        # Verify that float_frame was not changed inplace
        assert (float_frame.values == original.values).all()

    def test_inplace_clip(self, float_frame):
        # GH 15388
        median = float_frame.median().median()
        frame_copy = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_upper(median, inplace=True)
        assert not (frame_copy.values > median).any()

        frame_copy = float_frame.copy()

        with tm.assert_produces_warning(FutureWarning):
            frame_copy.clip_lower(median, inplace=True)
        assert not (frame_copy.values < median).any()

        frame_copy = float_frame.copy()
        frame_copy.clip(upper=median, lower=median, inplace=True)
        assert not (frame_copy.values != median).any()

    def test_dataframe_clip(self):
        # GH 2747
        df = DataFrame(np.random.randn(1000, 2))

        for lb, ub in [(-1, 1), (1, -1)]:
            clipped_df = df.clip(lb, ub)

            lb, ub = min(lb, ub), max(ub, lb)
            lb_mask = df.values <= lb
            ub_mask = df.values >= ub
            mask = ~lb_mask & ~ub_mask
            assert (clipped_df.values[lb_mask] == lb).all()
            assert (clipped_df.values[ub_mask] == ub).all()
            assert (clipped_df.values[mask] == df.values[mask]).all()

    def test_clip_mixed_numeric(self):
        # TODO(jreback)
        # clip on mixed integer or floats
        # with integer clippers coerces to float
        df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
        result = df.clip(1, 2)
        expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
        tm.assert_frame_equal(result, expected, check_like=True)

        # GH 24162, clipping now preserves numeric types per column
        df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
        expected = df.dtypes
        result = df.clip(upper=3).dtypes
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    def test_clip_against_series(self, inplace):
        # GH 6966
        df = DataFrame(np.random.randn(1000, 2))
        lb = Series(np.random.randn(1000))
        ub = lb + 1

        original = df.copy()
        clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)

        if inplace:
            clipped_df = df

        for i in range(2):
            lb_mask = original.iloc[:, i] <= lb
            ub_mask = original.iloc[:, i] >= ub
            mask = ~lb_mask & ~ub_mask

            result = clipped_df.loc[lb_mask, i]
            tm.assert_series_equal(result, lb[lb_mask], check_names=False)
            assert result.name == i

            result = clipped_df.loc[ub_mask, i]
            tm.assert_series_equal(result, ub[ub_mask], check_names=False)
            assert result.name == i

            tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
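    # Sketch of the alignment checked below (assuming the simple_frame fixture
    # is the 3x3 frame [[1, 2, 3], [4, 5, 6], [7, 8, 9]]): with axis=0 the
    # list-like bounds pair off with the rows, so row i is clamped to
    # [lower[i], upper[i]]; with axis=1 they pair off with the columns instead.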
    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
    @pytest.mark.parametrize(
        "axis,res",
        [
            (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
            (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
        ],
    )
    def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res):
        # GH 15390
        original = simple_frame.copy(deep=True)

        result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)

        expected = pd.DataFrame(res, columns=original.columns, index=original.index)
        if inplace:
            result = original
        tm.assert_frame_equal(result, expected, check_exact=True)

    @pytest.mark.parametrize("axis", [0, 1, None])
    def test_clip_against_frame(self, axis):
        df = DataFrame(np.random.randn(1000, 2))
        lb = DataFrame(np.random.randn(1000, 2))
        ub = lb + 1

        clipped_df = df.clip(lb, ub, axis=axis)

        lb_mask = df <= lb
        ub_mask = df >= ub
        mask = ~lb_mask & ~ub_mask

        tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
        tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
        tm.assert_frame_equal(clipped_df[mask], df[mask])

    def test_clip_against_unordered_columns(self):
        # GH 20911
        df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"])
        df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"])
        df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
        result_upper = df1.clip(lower=0, upper=df2)
        expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
        result_lower = df1.clip(lower=df3, upper=3)
        expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
        result_lower_upper = df1.clip(lower=df3, upper=df2)
        expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
        tm.assert_frame_equal(result_upper, expected_upper)
        tm.assert_frame_equal(result_lower, expected_lower)
        tm.assert_frame_equal(result_lower_upper, expected_lower_upper)

    def test_clip_with_na_args(self, float_frame):
        """Should process np.nan argument as None"""
        # GH 17276
        tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
        tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)

        # GH 19992
        df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})

        result = df.clip(lower=[4, 5, np.nan], axis=0)
        expected = DataFrame(
            {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]}
        )
        tm.assert_frame_equal(result, expected)

        result = df.clip(lower=[4, 5, np.nan], axis=1)
        expected = DataFrame(
            {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]}
        )
        tm.assert_frame_equal(result, expected)

    # ---------------------------------------------------------------------
    # Matrix-like

    def test_dot(self):
        a = DataFrame(
            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )
        b = DataFrame(
            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
        )
        result = a.dot(b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # Check alignment
        b1 = b.reindex(index=reversed(b.index))
        result = a.dot(b1)
        tm.assert_frame_equal(result, expected)

        # Check series argument
        result = a.dot(b["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        result = a.dot(b1["one"])
        tm.assert_series_equal(result, expected["one"], check_names=False)
        assert result.name is None

        # can pass correct-length arrays
        row = a.iloc[0].values

        result = a.dot(row)
        expected = a.dot(a.iloc[0])
        tm.assert_series_equal(result, expected)

        with pytest.raises(ValueError, match="Dot product shape mismatch"):
            a.dot(row[:-1])

        a = np.random.rand(1, 5)
        b = np.random.rand(5, 1)
        A = DataFrame(a)  # TODO(wesm): unused
        B = DataFrame(b)  # noqa

        # it works
        result = A.dot(b)

        # unaligned
        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match="aligned"):
            df.dot(df2)
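    # For reference: the matmul checks below mirror test_dot, since the ``@``
    # operator is expected to route through DataFrame.__matmul__/__rmatmul__
    # and show the same alignment and shape-mismatch behaviour as .dot.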
    def test_matmul(self):
        # matmul test is for GH 10259
        a = DataFrame(
            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
        )
        b = DataFrame(
            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
        )

        # DataFrame @ DataFrame
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # DataFrame @ Series
        result = operator.matmul(a, b.one)
        expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

        # np.array @ DataFrame
        result = operator.matmul(a.values, b)
        assert isinstance(result, DataFrame)
        assert result.columns.equals(b.columns)
        assert result.index.equals(pd.Index(range(3)))
        expected = np.dot(a.values, b.values)
        tm.assert_almost_equal(result.values, expected)

        # nested list @ DataFrame (__rmatmul__)
        result = operator.matmul(a.values.tolist(), b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_almost_equal(result.values, expected.values)

        # mixed dtype DataFrame @ DataFrame
        a["q"] = a.q.round().astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # different dtypes DataFrame @ DataFrame
        a = a.astype(int)
        result = operator.matmul(a, b)
        expected = DataFrame(
            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
        )
        tm.assert_frame_equal(result, expected)

        # unaligned
        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])

        with pytest.raises(ValueError, match="aligned"):
            operator.matmul(df, df2)


@pytest.fixture
def df_duplicates():
    return pd.DataFrame(
        {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]},
        index=[0, 0, 1, 1, 1],
    )


@pytest.fixture
def df_strings():
    return pd.DataFrame(
        {
            "a": np.random.permutation(10),
            "b": list(ascii_lowercase[:10]),
            "c": np.random.permutation(10).astype("float64"),
        }
    )


@pytest.fixture
def df_main_dtypes():
    return pd.DataFrame(
        {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": list("abc"),
            "category_string": pd.Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": pd.date_range("20130101", periods=3),
            "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        },
        columns=[
            "group",
            "int",
            "float",
            "string",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ],
    )
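# Minimal sketch (not collected by pytest) of the contract exercised by the
# class below; the frame and column names here are made up for illustration.
def _nlargest_nsmallest_example():
    df = pd.DataFrame({"a": [1, 10, 8], "b": [4, 5, 6]})
    top2 = df.nlargest(2, "a")      # rows whose "a" values are 10 and 8
    bottom2 = df.nsmallest(2, "a")  # rows whose "a" values are 1 and 8
    return top2, bottom2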
["b", "c", "c"], ], ) @pytest.mark.parametrize("n", range(1, 11)) def test_n(self, df_strings, nselect_method, n, order): # GH 10393 df = df_strings if "b" in order: error_msg = self.dtype_error_msg_template.format( column="b", method=nselect_method, dtype="object" ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(n, order) else: ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "columns", [["group", "category_string"], ["group", "string"]] ) def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes col = columns[1] error_msg = self.dtype_error_msg_template.format( column=col, method=nselect_method, dtype=df[col].dtype ) # escape some characters that may be in the repr error_msg = ( error_msg.replace("(", "\\(") .replace(")", "\\)") .replace("[", "\\[") .replace("]", "\\]") ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes df.nsmallest(2, list(set(df) - {"category_string", "string"})) df.nlargest(2, list(set(df) - {"category_string", "string"})) @pytest.mark.parametrize( "method,expected", [ ( "nlargest", pd.DataFrame( {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] ), ), ( "nsmallest", pd.DataFrame( {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] ), ), ], ) def test_duplicates_on_starter_columns(self, method, expected): # regression test for #22752 df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) result = getattr(df, method)(4, columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_n_identical_values(self): # GH 15297 df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) result = df.nlargest(3, "a") expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) result = df.nsmallest(3, "a") expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "order", [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 df = df_duplicates result = df.nsmallest(n, order) expected = df.sort_values(order).head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) tm.assert_frame_equal(result, expected) def test_duplicate_keep_all_ties(self): # GH 16818 df = pd.DataFrame( {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} ) result = df.nlargest(4, "a", keep="all") expected = pd.DataFrame( { "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, } ) tm.assert_frame_equal(result, expected) result = df.nsmallest(2, "a", keep="all") expected = pd.DataFrame( { "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, } ) tm.assert_frame_equal(result, expected) def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 df = DataFrame([1.0, 1.0, 1.0]) df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]}) s = Series([1, 1, 1]) s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): with tm.assert_produces_warning(FutureWarning): df_nan.clip_lower(s, axis=0) for op in ["lt", "le", "gt", 
"ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) def test_series_nat_conversion(self): # GH 18521 # Check rank does not mutate DataFrame df = DataFrame(np.random.randn(10, 3), dtype="float64") expected = df.copy() df.rank() result = df tm.assert_frame_equal(result, expected) def test_multiindex_column_lookup(self): # Check whether tuples are correctly treated as multi-level lookups. # GH 23033 df = pd.DataFrame( columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], ) # nsmallest result = df.nsmallest(3, ("x", "a")) expected = df.iloc[[2, 0, 3]] tm.assert_frame_equal(result, expected) # nlargest result = df.nlargest(3, ("x", "b")) expected = df.iloc[[3, 2, 1]] tm.assert_frame_equal(result, expected)