8th day of python challenges 111-117

abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions


@@ -0,0 +1,4 @@
"""
Tests for reductions where we want to test for matching behavior across
Array, Index, Series, and DataFrame methods.
"""

File diff suppressed because it is too large


@@ -0,0 +1,270 @@
"""
Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
"""
import inspect

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import DataFrame, Series
from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
import pandas.util.testing as tm


class TestDatetimeLikeStatReductions:
    @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
    def test_dt64_mean(self, tz_naive_fixture, box):
        tz = tz_naive_fixture

        dti = pd.date_range("2001-01-01", periods=11, tz=tz)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
        dtarr = dti._data

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
        assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)

        # dtarr[-2] will be the first date 2001-01-01
        dtarr[-2] = pd.NaT

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
        assert obj.mean(skipna=False) is pd.NaT

    @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
    def test_period_mean(self, box):
        # GH#24757
        dti = pd.date_range("2001-01-01", periods=11)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])

        # use hourly frequency to avoid rounding errors in expected results
        # TODO: flesh this out with different frequencies
        parr = dti._data.to_period("H")
        obj = box(parr)
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

        # parr[-2] will be the first date 2001-01-01
        parr[-2] = pd.NaT
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

    @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
    def test_td64_mean(self, box):
        tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

        tdarr = tdi._data
        obj = box(tdarr)

        result = obj.mean()
        expected = np.array(tdarr).mean()
        assert result == expected

        tdarr[0] = pd.NaT
        assert obj.mean(skipna=False) is pd.NaT

        result2 = obj.mean(skipna=True)
        assert result2 == tdi[1:].mean()

        # exact equality fails by 1 nanosecond
        assert result2.round("us") == (result * 11.0 / 10).round("us")


class TestSeriesStatReductions:
    # Note: the name TestSeriesStatReductions indicates these tests
    # were moved from a series-specific test file, _not_ that these tests are
    # intended long-term to be series-specific
    def _check_stat_op(
        self, name, alternate, string_series_, check_objects=False, check_allna=False
    ):
        with pd.option_context("use_bottleneck", False):
            f = getattr(Series, name)

            # add some NaNs
            string_series_[5:15] = np.NaN

            # mean, idxmax, idxmin, min, and max are valid for dates
            if name not in ["max", "min", "mean"]:
                ds = Series(pd.date_range("1/1/2001", periods=10))
                with pytest.raises(TypeError):
                    f(ds)

            # skipna or no
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            allna = string_series_ * np.nan
            if check_allna:
                assert np.isnan(f(allna))

            # dtype=object with None, it works!
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(range(2 ** 40, 2 ** 40 + 1000))
            s = Series(items, dtype="int64")
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range("1/1/2000", periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ["sum", "min", "max"]:
                with pytest.raises(TypeError):
                    f(Series(list("abc")))

            # Invalid axis.
            with pytest.raises(ValueError):
                f(string_series_, axis=1)

            # Unimplemented numeric_only parameter.
            if "numeric_only" in inspect.getfullargspec(f).args:
                with pytest.raises(NotImplementedError, match=name):
                    f(string_series_, numeric_only=True)

    def test_sum(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("sum", np.sum, string_series, check_allna=False)

    def test_mean(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("mean", np.mean, string_series)

    def test_median(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("median", np.median, string_series)

        # test with integers, test failure
        int_ts = Series(np.ones(10, dtype=int), index=range(10))
        tm.assert_almost_equal(np.median(int_ts), int_ts.median())

    def test_prod(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("prod", np.prod, string_series)

    def test_min(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("min", np.min, string_series, check_objects=True)

    def test_max(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("max", np.max, string_series, check_objects=True)

    def test_var_std(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1)
        self._check_stat_op("std", alt, string_series)

        alt = lambda x: np.var(x, ddof=1)
        self._check_stat_op("var", alt, string_series)

        result = datetime_series.std(ddof=4)
        expected = np.std(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        result = datetime_series.var(ddof=4)
        expected = np.var(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.var(ddof=1)
        assert pd.isna(result)

        result = s.std(ddof=1)
        assert pd.isna(result)

    def test_sem(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
        self._check_stat_op("sem", alt, string_series)

        result = datetime_series.sem(ddof=4)
        expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
            len(datetime_series.values)
        )
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.sem(ddof=1)
        assert pd.isna(result)

    @td.skip_if_no_scipy
    def test_skew(self):
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: skew(x, bias=False)
        self._check_stat_op("skew", alt, string_series)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.skew())
                assert np.isnan(df.skew()).all()
            else:
                assert 0 == s.skew()
                assert (df.skew() == 0).all()

    @td.skip_if_no_scipy
    def test_kurt(self):
        from scipy.stats import kurtosis

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op("kurt", alt, string_series)

        index = pd.MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
        )
        s = Series(np.random.randn(6), index=index)
        tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"])

        # test corner cases, kurt() returns NaN unless there's at least 4
        # values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.kurt())
                assert np.isnan(df.kurt()).all()
            else:
                assert 0 == s.kurt()
                assert (df.kurt() == 0).all()
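
The corner cases in test_skew and test_kurt above come down to minimum sample sizes: pandas returns NaN for a sample skewness with fewer than 3 observations and for a sample kurtosis with fewer than 4, rather than raising. A standalone sketch of that behavior (not part of this diff):

import numpy as np
import pandas as pd

# Below the minimum count the bias-corrected estimator is undefined -> NaN.
assert np.isnan(pd.Series([1.0, 2.0]).skew())                 # n < 3
assert np.isnan(pd.Series([1.0, 2.0, 3.0]).kurt())            # n < 4

# At or above the minimum count a finite number comes back.
assert not np.isnan(pd.Series([1.0, 2.0, 4.0]).skew())        # n == 3
assert not np.isnan(pd.Series([1.0, 2.0, 4.0, 8.0]).kurt())   # n == 4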