8th day of python challenges 111-117

2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions
--- a/venv/lib/python3.6/site-packages/pandas/tests/io/sas/init.py
+++ b/venv/lib/python3.6/site-packages/pandas/tests/io/sas/init.py
--- a/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_sas.py
+++ b/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_sas.py
@@ -0,0 +1,26 @@
+from io import StringIO
+
+import pytest
+
+from pandas import read_sas
+import pandas.util.testing as tm
+
+
+class TestSas:
+    def test_sas_buffer_format(self):
+        # see gh-14947
+        b = StringIO("")
+
+        msg = (
+            "If this is a buffer object rather than a string "
+            "name, you must specify a format string"
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_sas(b)
+
+    def test_sas_read_no_format_or_extension(self):
+        # see gh-24548
+        msg = "unable to infer format of SAS file"
+        with tm.ensure_clean("test_file_no_extension") as path:
+            with pytest.raises(ValueError, match=msg):
+                read_sas(path)
--- a/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_sas7bdat.py
+++ b/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_sas7bdat.py
@@ -0,0 +1,219 @@
+import io
+import os
+
+import numpy as np
+import pytest
+
+from pandas.errors import EmptyDataError
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+# https://github.com/cython/cython/issues/1720
+@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning")
+class TestSAS7BDAT:
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        self.dirpath = datapath("io", "sas", "data")
+        self.data = []
+        self.test_ix = [list(range(1, 16)), [16]]
+        for j in 1, 2:
+            fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j))
+            df = pd.read_csv(fname)
+            epoch = pd.datetime(1960, 1, 1)
+            t1 = pd.to_timedelta(df["Column4"], unit="d")
+            df["Column4"] = epoch + t1
+            t2 = pd.to_timedelta(df["Column12"], unit="d")
+            df["Column12"] = epoch + t2
+            for k in range(df.shape[1]):
+                col = df.iloc[:, k]
+                if col.dtype == np.int64:
+                    df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
+            self.data.append(df)
+
+    def test_from_file(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+                df = pd.read_sas(fname, encoding="utf-8")
+                tm.assert_frame_equal(df, df0)
+
+    def test_from_buffer(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+                with open(fname, "rb") as f:
+                    byts = f.read()
+                buf = io.BytesIO(byts)
+                rdr = pd.read_sas(
+                    buf, format="sas7bdat", iterator=True, encoding="utf-8"
+                )
+                df = rdr.read()
+                tm.assert_frame_equal(df, df0, check_exact=False)
+                rdr.close()
+
+    def test_from_iterator(self):
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+                rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+                df = rdr.read(2)
+                tm.assert_frame_equal(df, df0.iloc[0:2, :])
+                df = rdr.read(3)
+                tm.assert_frame_equal(df, df0.iloc[2:5, :])
+                rdr.close()
+
+    @td.skip_if_no("pathlib")
+    def test_path_pathlib(self):
+        from pathlib import Path
+
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = Path(os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)))
+                df = pd.read_sas(fname, encoding="utf-8")
+                tm.assert_frame_equal(df, df0)
+
+    @td.skip_if_no("py.path")
+    def test_path_localpath(self):
+        from py.path import local as LocalPath
+
+        for j in 0, 1:
+            df0 = self.data[j]
+            for k in self.test_ix[j]:
+                fname = LocalPath(
+                    os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+                )
+                df = pd.read_sas(fname, encoding="utf-8")
+                tm.assert_frame_equal(df, df0)
+
+    def test_iterator_loop(self):
+        # github #13654
+        for j in 0, 1:
+            for k in self.test_ix[j]:
+                for chunksize in 3, 5, 10, 11:
+                    fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+                    rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8")
+                    y = 0
+                    for x in rdr:
+                        y += x.shape[0]
+                    assert y == rdr.row_count
+                    rdr.close()
+
+    def test_iterator_read_too_much(self):
+        # github #14734
+        k = self.test_ix[0][0]
+        fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))
+        rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8")
+        d1 = rdr.read(rdr.row_count + 20)
+        rdr.close()
+
+        rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+        d2 = rdr.read(rdr.row_count + 20)
+        tm.assert_frame_equal(d1, d2)
+        rdr.close()
+
+
+def test_encoding_options(datapath):
+    fname = datapath("io", "sas", "data", "test1.sas7bdat")
+    df1 = pd.read_sas(fname)
+    df2 = pd.read_sas(fname, encoding="utf-8")
+    for col in df1.columns:
+        try:
+            df1[col] = df1[col].str.decode("utf-8")
+        except AttributeError:
+            pass
+    tm.assert_frame_equal(df1, df2)
+
+    from pandas.io.sas.sas7bdat import SAS7BDATReader
+
+    rdr = SAS7BDATReader(fname, convert_header_text=False)
+    df3 = rdr.read()
+    rdr.close()
+    for x, y in zip(df1.columns, df3.columns):
+        assert x == y.decode()
+
+
+def test_productsales(datapath):
+    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
+    df = pd.read_sas(fname, encoding="utf-8")
+    fname = datapath("io", "sas", "data", "productsales.csv")
+    df0 = pd.read_csv(fname, parse_dates=["MONTH"])
+    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
+    df0[vn] = df0[vn].astype(np.float64)
+    tm.assert_frame_equal(df, df0)
+
+
+def test_12659(datapath):
+    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
+    df = pd.read_sas(fname)
+    fname = datapath("io", "sas", "data", "test_12659.csv")
+    df0 = pd.read_csv(fname)
+    df0 = df0.astype(np.float64)
+    tm.assert_frame_equal(df, df0)
+
+
+def test_airline(datapath):
+    fname = datapath("io", "sas", "data", "airline.sas7bdat")
+    df = pd.read_sas(fname)
+    fname = datapath("io", "sas", "data", "airline.csv")
+    df0 = pd.read_csv(fname)
+    df0 = df0.astype(np.float64)
+    tm.assert_frame_equal(df, df0, check_exact=False)
+
+
+def test_date_time(datapath):
+    # Support of different SAS date/datetime formats (PR #15871)
+    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
+    df = pd.read_sas(fname)
+    fname = datapath("io", "sas", "data", "datetime.csv")
+    df0 = pd.read_csv(
+        fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
+    )
+    # GH 19732: Timestamps imported from sas will incur floating point errors
+    df.iloc[:, 3] = df.iloc[:, 3].dt.round("us")
+    tm.assert_frame_equal(df, df0)
+
+
+def test_compact_numerical_values(datapath):
+    # Regression test for #21616
+    fname = datapath("io", "sas", "data", "cars.sas7bdat")
+    df = pd.read_sas(fname, encoding="latin-1")
+    # The two columns CYL and WGT in cars.sas7bdat have column
+    # width < 8 and only contain integral values.
+    # Test that pandas doesn't corrupt the numbers by adding
+    # decimals.
+    result = df["WGT"]
+    expected = df["WGT"].round()
+    tm.assert_series_equal(result, expected, check_exact=True)
+    result = df["CYL"]
+    expected = df["CYL"].round()
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_many_columns(datapath):
+    # Test for looking for column information in more places (PR #22628)
+    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
+    df = pd.read_sas(fname, encoding="latin-1")
+    fname = datapath("io", "sas", "data", "many_columns.csv")
+    df0 = pd.read_csv(fname, encoding="latin-1")
+    tm.assert_frame_equal(df, df0)
+
+
+def test_inconsistent_number_of_rows(datapath):
+    # Regression test for issue #16615. (PR #22628)
+    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
+    df = pd.read_sas(fname, encoding="latin-1")
+    assert len(df) == 2097
+
+
+def test_zero_variables(datapath):
+    # Check if the SAS file has zero variables (PR #18184)
+    fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
+    with pytest.raises(EmptyDataError):
+        pd.read_sas(fname)
--- a/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_xport.py
+++ b/venv/lib/python3.6/site-packages/pandas/tests/io/sas/test_xport.py
@@ -0,0 +1,141 @@
+import os
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas.io.sas.sasreader import read_sas
+
+# CSV versions of test xpt files were obtained using the R foreign library
+
+# Numbers in a SAS xport file are always float64, so need to convert
+# before making comparisons.
+
+
+def numeric_as_float(data):
+    for v in data.columns:
+        if data[v].dtype is np.dtype("int64"):
+            data[v] = data[v].astype(np.float64)
+
+
+class TestXport:
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        self.dirpath = datapath("io", "sas", "data")
+        self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
+        self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
+        self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
+        self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
+
+    def test1_basic(self):
+        # Tests with DEMO_G.xpt (all numeric file)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        numeric_as_float(data_csv)
+
+        # Read full file
+        data = read_sas(self.file01, format="xport")
+        tm.assert_frame_equal(data, data_csv)
+        num_rows = data.shape[0]
+
+        # Test reading beyond end of file
+        reader = read_sas(self.file01, format="xport", iterator=True)
+        data = reader.read(num_rows + 100)
+        assert data.shape[0] == num_rows
+        reader.close()
+
+        # Test incremental read with `read` method.
+        reader = read_sas(self.file01, format="xport", iterator=True)
+        data = reader.read(10)
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+        # Test incremental read with `get_chunk` method.
+        reader = read_sas(self.file01, format="xport", chunksize=10)
+        data = reader.get_chunk()
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
+
+        # Test read in loop
+        m = 0
+        reader = read_sas(self.file01, format="xport", chunksize=100)
+        for x in reader:
+            m += x.shape[0]
+        reader.close()
+        assert m == num_rows
+
+        # Read full file with `read_sas` method
+        data = read_sas(self.file01)
+        tm.assert_frame_equal(data, data_csv)
+
+    def test1_index(self):
+        # Tests with DEMO_G.xpt using index (all numeric file)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        data_csv = data_csv.set_index("SEQN")
+        numeric_as_float(data_csv)
+
+        # Read full file
+        data = read_sas(self.file01, index="SEQN", format="xport")
+        tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+        # Test incremental read with `read` method.
+        reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True)
+        data = reader.read(10)
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
+
+        # Test incremental read with `get_chunk` method.
+        reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10)
+        data = reader.get_chunk()
+        reader.close()
+        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
+
+    def test1_incremental(self):
+        # Test with DEMO_G.xpt, reading full file incrementally
+
+        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
+        data_csv = data_csv.set_index("SEQN")
+        numeric_as_float(data_csv)
+
+        reader = read_sas(self.file01, index="SEQN", chunksize=1000)
+
+        all_data = [x for x in reader]
+        data = pd.concat(all_data, axis=0)
+
+        tm.assert_frame_equal(data, data_csv, check_index_type=False)
+
+    def test2(self):
+        # Test with SSHSV1_A.xpt
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
+        numeric_as_float(data_csv)
+
+        data = read_sas(self.file02)
+        tm.assert_frame_equal(data, data_csv)
+
+    def test_multiple_types(self):
+        # Test with DRXFCD_G.xpt (contains text and numeric variables)
+
+        # Compare to this
+        data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
+
+        data = read_sas(self.file03, encoding="utf-8")
+        tm.assert_frame_equal(data, data_csv)
+
+    def test_truncated_float_support(self):
+        # Test with paxraw_d_short.xpt, a shortened version of:
+        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
+        # This file has truncated floats (5 bytes in this case).
+
+        # GH 11713
+
+        data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
+
+        data = read_sas(self.file04, format="xport")
+        tm.assert_frame_equal(data.astype("int64"), data_csv)