8th day of python challenges 111-117
This commit is contained in:
@@ -0,0 +1,37 @@
|
||||
import pytest
|
||||
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.parsers import read_csv
|
||||
|
||||
|
||||
@pytest.fixture
def frame(float_frame):
    """
    Leading ten-element slice of the ``float_frame`` fixture.
    """
    leading = float_frame[:10]
    return leading
|
||||
|
||||
|
||||
@pytest.fixture
def tsframe():
    """
    Leading five-element slice of the frame from ``tm.makeTimeDataFrame``.
    """
    time_frame = tm.makeTimeDataFrame()
    return time_frame[:5]
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def merge_cells(request):
    """
    Parametrized boolean flag: ``True`` first, then ``False``.
    """
    flag = request.param
    return flag
|
||||
|
||||
|
||||
@pytest.fixture
def df_ref():
    """
    Reference frame parsed from ``test1.csv`` by read_csv's Python engine.

    The first column becomes the index and dates are parsed.
    """
    return read_csv("test1.csv", index_col=0, parse_dates=True, engine="python")
|
||||
|
||||
|
||||
@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"])
def read_ext(request):
    """
    File extension of the workbook to read for the current parametrization.
    """
    extension = request.param
    return extension
|
||||
@@ -0,0 +1,38 @@
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
|
||||
pytest.importorskip("odf")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cd_and_set_engine(monkeypatch, datapath):
    """
    Pin ``pd.read_excel`` to the odf engine and chdir into the io data folder.

    Both changes go through ``monkeypatch``.
    """
    odf_reader = functools.partial(pd.read_excel, engine="odf")
    monkeypatch.setattr(pd, "read_excel", odf_reader)

    data_dir = datapath("io", "data")
    monkeypatch.chdir(data_dir)
|
||||
|
||||
|
||||
def test_read_invalid_types_raises():
    """
    Reading ``invalid_value_type.ods`` must raise ValueError naming the type.
    """
    # invalid_value_type.ods was produced by manually editing the
    # content.xml file it contains
    expected_msg = "Unrecognized type awesome_new_type"
    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel("invalid_value_type.ods")
|
||||
|
||||
|
||||
def test_read_writer_table():
    """
    A table inside a text OpenDocument file (.odt) can be read as well.
    """
    result = pd.read_excel("writertable.odt", "Table1", index_col=0)

    row_labels = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header")
    rows = [
        [1, np.nan, 7],
        [2, np.nan, 8],
        [3, np.nan, 9],
    ]
    expected = pd.DataFrame(
        rows, index=row_labels, columns=["Column 1", "Unnamed: 2", "Column 3"]
    )

    tm.assert_frame_equal(result, expected)
|
||||
@@ -0,0 +1,103 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util.testing import ensure_clean
|
||||
|
||||
from pandas.io.excel import ExcelWriter, _OpenpyxlWriter
|
||||
|
||||
openpyxl = pytest.importorskip("openpyxl")
|
||||
|
||||
pytestmark = pytest.mark.parametrize("ext", [".xlsx"])
|
||||
|
||||
|
||||
def test_to_excel_styleconverter(ext):
    """
    ``_convert_to_style_kwargs`` turns a nested style dict into openpyxl
    style objects, one per keyword.
    """
    from openpyxl import styles

    hstyle = {
        "font": {"color": "00FF0000", "bold": True},
        "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"},
        "alignment": {"horizontal": "center", "vertical": "top"},
        "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}},
        "number_format": {"format_code": "0.00"},
        "protection": {"locked": True, "hidden": False},
    }
    converted = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)

    thin_side = styles.Side(style=styles.borders.BORDER_THIN)
    wanted = {
        "font": styles.Font(bold=True, color=styles.Color("00FF0000")),
        "border": styles.Border(
            top=thin_side, right=thin_side, bottom=thin_side, left=thin_side
        ),
        "alignment": styles.Alignment(horizontal="center", vertical="top"),
        "fill": styles.PatternFill(
            patternType="solid", fgColor=styles.Color(rgb="006666FF", tint=0.3)
        ),
        "number_format": "0.00",
        "protection": styles.Protection(locked=True, hidden=False),
    }

    # note the input key "borders" comes back under the keyword "border"
    for keyword, style_obj in wanted.items():
        assert converted[keyword] == style_obj
|
||||
|
||||
|
||||
def test_write_cells_merge_styled(ext):
    """
    After a styled merged cell is written over individually styled cells,
    B1 and A2 must carry the merged cell's font.
    """
    from pandas.io.formats.excel import ExcelCell

    sheet_name = "merge_styled"

    merged_style = {"font": {"color": "000000FF", "bold": True}}
    merged_font = _OpenpyxlWriter._convert_to_style_kwargs(merged_style)["font"]

    plain_cells = [
        ExcelCell(col=1, row=0, val=42, style={"font": {"color": "00FF0000"}}),
        ExcelCell(col=0, row=1, val=99, style={"font": {"color": "0000FF00"}}),
    ]
    merged_cells = [
        ExcelCell(
            col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=merged_style
        )
    ]

    with ensure_clean(ext) as path:
        writer = _OpenpyxlWriter(path)
        # individually styled cells first, then the merge on top of them
        for batch in (plain_cells, merged_cells):
            writer.write_cells(batch, sheet_name=sheet_name)

        sheet = writer.sheets[sheet_name]
        for address in ("B1", "A2"):
            assert sheet[address].font == merged_font
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])]
)
def test_write_append_mode(ext, mode, expected):
    """
    Write a "baz" sheet into a pre-existing two-sheet workbook using
    ``mode`` and check the resulting sheet titles and A1 values.
    """
    df = DataFrame([1], columns=["baz"])

    with ensure_clean(ext) as f:
        # seed the file with sheets "foo" and "bar", each with its name in A1
        seed = openpyxl.Workbook()
        first_sheet = seed.worksheets[0]
        first_sheet.title = "foo"
        first_sheet["A1"].value = "foo"
        second_sheet = seed.create_sheet("bar")
        second_sheet["A1"].value = "bar"
        seed.save(f)

        writer = ExcelWriter(f, engine="openpyxl", mode=mode)
        df.to_excel(writer, sheet_name="baz", index=False)
        writer.save()

        reloaded = openpyxl.load_workbook(f)
        titles = [sheet.title for sheet in reloaded.worksheets]
        assert titles == expected

        for sheet, cell_value in zip(reloaded.worksheets, expected):
            assert sheet["A1"].value == cell_value
|
||||
@@ -0,0 +1,952 @@
|
||||
from collections import OrderedDict
|
||||
import contextlib
|
||||
from datetime import datetime, time
|
||||
from functools import partial
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, Index, MultiIndex, Series
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.common import URLError
|
||||
|
||||
|
||||
@contextlib.contextmanager
def ignore_xlrd_time_clock_warning():
    """
    Context manager silencing the DeprecationWarning about `time.clock`
    (deprecated in Python 3.7) that the xlrd library raises.
    """
    suppression = {
        "action": "ignore",
        "message": "time.clock has been deprecated",
        "category": DeprecationWarning,
    }
    # catch_warnings restores the previous filter state on exit
    with warnings.catch_warnings():
        warnings.filterwarnings(**suppression)
        yield
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        # Every reader engine under test is listed here.
        # With defusedxml installed, xlrd and openpyxl trigger deprecation
        # warnings; the filterwarnings marks below catch those.
        pytest.param(
            "xlrd",
            marks=[
                td.skip_if_no("xlrd"),
                pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
            ],
        ),
        pytest.param(
            "openpyxl",
            marks=[
                td.skip_if_no("openpyxl"),
                pytest.mark.filterwarnings("ignore:.*html argument"),
            ],
        ),
        pytest.param(
            None,
            marks=[
                td.skip_if_no("xlrd"),
                pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
            ],
        ),
        pytest.param("odf", marks=td.skip_if_no("odf")),
    ]
)
def engine(request):
    """
    Excel reader engine for the current parametrization.

    The values are "xlrd", "openpyxl", ``None`` and "odf".
    """
    selected = request.param
    return selected
|
||||
|
||||
|
||||
class TestReaders:
|
||||
@pytest.fixture(autouse=True)
def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
    """
    Skip unsupported engine/extension pairs, then chdir into the io data
    folder and bind ``engine`` into ``pd.read_excel``.
    """
    unsupported = (
        (engine == "openpyxl" and read_ext == ".xls")
        or (engine == "odf" and read_ext != ".ods")
        or (read_ext == ".ods" and engine != "odf")
    )
    if unsupported:
        pytest.skip()

    bound_reader = partial(pd.read_excel, engine=engine)
    monkeypatch.chdir(datapath("io", "data"))
    monkeypatch.setattr(pd, "read_excel", bound_reader)
|
||||
|
||||
def test_usecols_int(self, read_ext, df_ref):
    """
    An integer ``usecols`` emits a FutureWarning and both sheets still
    match the A/B/C columns of the reference frame.
    """
    expected = df_ref.reindex(columns=["A", "B", "C"])
    path = "test1" + read_ext

    frames = []
    for sheet, extra in [("Sheet1", {}), ("Sheet2", {"skiprows": [1]})]:
        # usecols as int
        with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
        ), ignore_xlrd_time_clock_warning():
            frames.append(
                pd.read_excel(path, sheet, index_col=0, usecols=3, **extra)
            )

    # TODO: add index to xls file
    for frame in frames:
        tm.assert_frame_equal(frame, expected, check_names=False)
|
||||
|
||||
def test_usecols_list(self, read_ext, df_ref):
    """
    ``usecols=[0, 2, 3]`` with the first column as index matches the B/C
    columns of the reference frame on both sheets.
    """
    path = "test1" + read_ext
    expected = df_ref.reindex(columns=["B", "C"])

    from_sheet1 = pd.read_excel(path, "Sheet1", index_col=0, usecols=[0, 2, 3])
    from_sheet2 = pd.read_excel(
        path, "Sheet2", skiprows=[1], index_col=0, usecols=[0, 2, 3]
    )

    # TODO: add index to xls file
    for frame in (from_sheet1, from_sheet2):
        tm.assert_frame_equal(frame, expected, check_names=False)
|
||||
|
||||
def test_usecols_str(self, read_ext, df_ref):
    """
    Excel-style column strings for ``usecols`` ("A:D", "A,C,D", "A,C:D")
    select the expected reference columns on both sheets.
    """
    path = "test1" + read_ext
    cases = [
        (["A", "B", "C"], "A:D"),
        (["B", "C"], "A,C,D"),
        (["B", "C"], "A,C:D"),
    ]

    for kept_columns, usecols in cases:
        expected = df_ref.reindex(columns=kept_columns)
        from_sheet1 = pd.read_excel(path, "Sheet1", index_col=0, usecols=usecols)
        from_sheet2 = pd.read_excel(
            path, "Sheet2", skiprows=[1], index_col=0, usecols=usecols
        )

        # TODO: add index to xls file; read xls ignores index name?
        tm.assert_frame_equal(from_sheet1, expected, check_names=False)
        tm.assert_frame_equal(from_sheet2, expected, check_names=False)
|
||||
|
||||
@pytest.mark.parametrize(
    "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
)
def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref):
    """
    Every ordering of the positions 0, 1 and 3 yields reference columns
    A and C.
    """
    path = "test1" + read_ext
    result = pd.read_excel(path, "Sheet1", index_col=0, usecols=usecols)
    tm.assert_frame_equal(result, df_ref[["A", "C"]], check_names=False)
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
    """
    Column labels given in either order yield reference columns B and D.
    """
    wanted = df_ref[["B", "D"]]
    # no index_col is passed to the read, so renumber the expected index
    wanted.index = range(len(wanted))

    path = "test1" + read_ext
    result = pd.read_excel(path, "Sheet1", usecols=usecols)
    tm.assert_frame_equal(result, wanted, check_names=False)
|
||||
|
||||
def test_read_excel_without_slicing(self, read_ext, df_ref):
    """
    Sheet1 read with the first column as index equals the reference frame.
    """
    path = "test1" + read_ext
    result = pd.read_excel(path, "Sheet1", index_col=0)
    tm.assert_frame_equal(result, df_ref, check_names=False)
|
||||
|
||||
def test_usecols_excel_range_str(self, read_ext, df_ref):
    """
    ``usecols="A,D:E"`` with column A as index yields reference columns
    C and D.
    """
    path = "test1" + read_ext
    result = pd.read_excel(path, "Sheet1", index_col=0, usecols="A,D:E")
    tm.assert_frame_equal(result, df_ref[["C", "D"]], check_names=False)
|
||||
|
||||
def test_usecols_excel_range_str_invalid(self, read_ext):
    """
    A range string containing the bad column name "E1" raises ValueError.
    """
    path = "test1" + read_ext
    with pytest.raises(ValueError, match="Invalid column name: E1"):
        pd.read_excel(path, "Sheet1", usecols="D:E1")
|
||||
|
||||
def test_index_col_label_error(self, read_ext):
    """
    A string label in ``index_col`` together with label ``usecols`` raises
    TypeError.
    """
    path = "test1" + read_ext
    expected_msg = "list indices must be integers.*, not str"

    with pytest.raises(TypeError, match=expected_msg):
        pd.read_excel(path, "Sheet1", index_col=["A"], usecols=["A", "C"])
|
||||
|
||||
def test_index_col_empty(self, read_ext):
    """
    Sheet3 read with three index columns gives an empty frame with a
    three-level MultiIndex.
    """
    # see gh-9208
    index_names = ["A", "B", "C"]
    result = pd.read_excel("test1" + read_ext, "Sheet3", index_col=index_names)

    empty_index = MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"])
    expected = DataFrame(columns=["D", "E", "F"], index=empty_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("index_col", [None, 2])
def test_index_col_with_unnamed(self, read_ext, index_col):
    """
    Sheet4 has an unnamed first column; reading it with or without an
    ``index_col`` must keep the "Unnamed: 0" label.
    """
    # see gh-18792
    result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col)
    expected = DataFrame(
        [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
    )
    # Compare against None explicitly: position 0 is a valid index column
    # but is falsy, so a truthiness test would wrongly skip set_index for it.
    if index_col is not None:
        expected = expected.set_index(expected.columns[index_col])

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_usecols_pass_non_existent_column(self, read_ext):
    """
    Asking for the missing column "E" raises ValueError naming it.
    """
    path = "test1" + read_ext
    expected_msg = (
        "Usecols do not match columns, "
        "columns expected but not found: " + r"\['E'\]"
    )

    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel(path, usecols=["E"])
|
||||
|
||||
def test_usecols_wrong_type(self, read_ext):
    """
    A ``usecols`` list mixing a string and an integer raises ValueError.
    """
    path = "test1" + read_ext
    expected_msg = (
        "'usecols' must either be list-like of "
        "all strings, all unicode, all integers or a callable."
    )

    with pytest.raises(ValueError, match=expected_msg):
        pd.read_excel(path, usecols=["E1", 0])
|
||||
|
||||
def test_excel_stop_iterator(self, read_ext):
    """
    Sheet1 of ``test2`` parses to a single row under Test/Test1.
    """
    expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
    parsed = pd.read_excel("test2" + read_ext, "Sheet1")
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
def test_excel_cell_error_na(self, read_ext):
    """
    Sheet1 of ``test3`` parses to a single NaN in column "Test".
    """
    expected = DataFrame([[np.nan]], columns=["Test"])
    parsed = pd.read_excel("test3" + read_ext, "Sheet1")
    tm.assert_frame_equal(parsed, expected)
|
||||
|
||||
def test_excel_table(self, read_ext, df_ref):
    """
    Both sheets of ``test1`` match the reference frame, and
    ``skipfooter=1`` drops the final row.
    """
    path = "test1" + read_ext

    from_sheet1 = pd.read_excel(path, "Sheet1", index_col=0)
    from_sheet2 = pd.read_excel(path, "Sheet2", skiprows=[1], index_col=0)
    # TODO: add index to file
    for frame in (from_sheet1, from_sheet2):
        tm.assert_frame_equal(frame, df_ref, check_names=False)

    without_footer = pd.read_excel(path, "Sheet1", index_col=0, skipfooter=1)
    tm.assert_frame_equal(without_footer, from_sheet1.iloc[:-1])
|
||||
|
||||
def test_reader_special_dtypes(self, read_ext):
    """
    Check type inference, ``convert_float``, ``index_col`` and
    ``converters`` against Sheet1 of the ``test_types`` workbook.
    """
    path = "test_types" + read_ext

    def read_sheet(**kwargs):
        # every read in this test targets the same file and sheet
        return pd.read_excel(path, "Sheet1", **kwargs)

    date_values = [
        datetime(2013, 10, 30),
        datetime(2013, 10, 31),
        datetime(1905, 1, 1),
        datetime(2013, 12, 14),
        datetime(2015, 3, 14),
    ]
    expected = DataFrame.from_dict(
        OrderedDict(
            [
                ("IntCol", [1, 2, -3, 4, 0]),
                ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
                ("BoolCol", [True, False, True, True, False]),
                ("StrCol", [1, 2, 3, 4, 5]),
                # GH5394 - this is why convert_float isn't vectorized
                ("Str2Col", ["a", 3, "c", "d", "e"]),
                ("DateCol", date_values),
            ]
        )
    )

    # should read in correctly and infer types
    tm.assert_frame_equal(read_sheet(), expected)

    # if not coercing number, then int comes in as float
    float_expected = expected.copy()
    float_expected["IntCol"] = float_expected["IntCol"].astype(float)
    float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
    tm.assert_frame_equal(read_sheet(convert_float=False), float_expected)

    # check setting Index (assuming xls and xlsx are the same here)
    for position, column in enumerate(expected.columns):
        indexed = read_sheet(index_col=position)
        tm.assert_frame_equal(indexed, expected.set_index(column))

    # convert_float and converters should be different but both accepted
    str_converters = {"StrCol": str}
    expected["StrCol"] = expected["StrCol"].apply(str)
    tm.assert_frame_equal(read_sheet(converters=str_converters), expected)

    no_convert_float = float_expected.copy()
    no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
    actual = read_sheet(convert_float=False, converters=str_converters)
    tm.assert_frame_equal(actual, no_convert_float)
|
||||
|
||||
# GH8212 - support for converters and missing values
|
||||
def test_reader_converters(self, read_ext):
    """
    Per-column ``converters``, keyed by label or by position, are applied
    to Sheet1 of ``test_converters``.
    """
    path = "test_converters" + read_ext

    # two converters are keyed by column label, two by column position
    converters = {
        "IntCol": lambda x: int(x) if x != "" else -1000,
        "FloatCol": lambda x: 10 * x if x else np.nan,
        2: lambda x: "Found" if x != "" else "Not found",
        3: lambda x: str(x) if x else "",
    }

    expected_columns = OrderedDict(
        [
            ("IntCol", [1, 2, -3, -1000, 0]),
            ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
            ("BoolCol", ["Found", "Found", "Found", "Not found", "Found"]),
            ("StrCol", ["1", np.nan, "3", "4", "5"]),
        ]
    )
    expected = DataFrame.from_dict(expected_columns)

    # should read in correctly and set types of single cells (not array
    # dtypes)
    actual = pd.read_excel(path, "Sheet1", converters=converters)
    tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_reader_dtype(self, read_ext):
    """
    ``dtype`` overrides are honoured for ``testdtype``, and casting the
    NaN-bearing column "d" to int64 raises ValueError.
    """
    # GH 8212
    path = "testdtype" + read_ext
    actual = pd.read_excel(path)

    expected = DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [2.5, 3.5, 4.5, 5.5],
            "c": [1, 2, 3, 4],
            "d": [1.0, 2.0, np.nan, 4.0],
        }
    ).reindex(columns=["a", "b", "c", "d"])
    tm.assert_frame_equal(actual, expected)

    overrides = {"a": "float64", "b": "float32", "c": str}
    actual = pd.read_excel(path, dtype=overrides)

    for column in ("a", "b"):
        expected[column] = expected[column].astype(overrides[column])
    expected["c"] = ["001", "002", "003", "004"]
    tm.assert_frame_equal(actual, expected)

    with pytest.raises(ValueError):
        pd.read_excel(path, dtype={"d": "int64"})
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            None,
            DataFrame(
                {
                    "a": [1, 2, 3, 4],
                    "b": [2.5, 3.5, 4.5, 5.5],
                    "c": [1, 2, 3, 4],
                    "d": [1.0, 2.0, np.nan, 4.0],
                }
            ),
        ),
        (
            {"a": "float64", "b": "float32", "c": str, "d": str},
            DataFrame(
                {
                    "a": Series([1, 2, 3, 4], dtype="float64"),
                    "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
                    "c": ["001", "002", "003", "004"],
                    "d": ["1", "2", np.nan, "4"],
                }
            ),
        ),
    ],
)
def test_reader_dtype_str(self, read_ext, dtype, expected):
    """
    Read ``testdtype`` with the parametrized ``dtype`` and compare with the
    matching expected frame.
    """
    # see gh-20377
    path = "testdtype" + read_ext
    tm.assert_frame_equal(pd.read_excel(path, dtype=dtype), expected)
|
||||
|
||||
def test_reading_all_sheets(self, read_ext):
    """
    ``sheet_name=None`` returns a mapping of every sheet, in workbook order.
    """
    # See PR #9450
    sheets = pd.read_excel("test_multisheet" + read_ext, sheet_name=None)

    # deliberately not alphabetical, so order preservation is exercised
    expected_keys = ["Charlie", "Alpha", "Beta"]
    tm.assert_contains_all(expected_keys, sheets.keys())

    # Issue 9930
    # Ensure sheet order is preserved
    assert expected_keys == list(sheets.keys())
|
||||
|
||||
def test_reading_multiple_specific_sheets(self, read_ext):
    """
    A mixed list of sheet positions and names, duplicates included,
    returns one entry per distinct reference.
    """
    # See PR #9450
    # Duplicates are requested on purpose; only the set should come back.
    requested = [2, "Charlie", "Charlie"]
    sheets = pd.read_excel("test_multisheet" + read_ext, sheet_name=requested)

    distinct = list(set(requested))
    tm.assert_contains_all(distinct, sheets.keys())
    assert len(distinct) == len(sheets.keys())
|
||||
|
||||
def test_reading_all_sheets_with_blank(self, read_ext):
    """
    ``sheet_name=None`` still returns every sheet when some are blank.
    """
    # Issue #11711
    sheets = pd.read_excel("blank_with_header" + read_ext, sheet_name=None)
    tm.assert_contains_all(["Sheet1", "Sheet2", "Sheet3"], sheets.keys())
|
||||
|
||||
# GH6403
|
||||
def test_read_excel_blank(self, read_ext):
    """
    Sheet1 of the ``blank`` workbook reads as an empty DataFrame.
    """
    result = pd.read_excel("blank" + read_ext, "Sheet1")
    tm.assert_frame_equal(result, DataFrame())
|
||||
|
||||
def test_read_excel_blank_with_header(self, read_ext):
    """
    A header-only sheet reads as a frame with columns and no rows.
    """
    result = pd.read_excel("blank_with_header" + read_ext, "Sheet1")
    tm.assert_frame_equal(result, DataFrame(columns=["col_1", "col_2"]))
|
||||
|
||||
def test_date_conversion_overflow(self, read_ext):
    """
    A date column that also holds 1e20 keeps that value next to the two
    parsed Timestamps (xfailed for the openpyxl engine).
    """
    # the patched pd.read_excel is a partial, so the engine sits in .keywords
    if pd.read_excel.keywords["engine"] == "openpyxl":
        pytest.xfail("Maybe not supported by openpyxl")

    # GH 10001 : pandas.ExcelFile ignore parse_dates=False
    rows = [
        [pd.Timestamp("2016-03-12"), "Marc Johnson"],
        [pd.Timestamp("2016-03-16"), "Jack Black"],
        [1e20, "Timothy Brown"],
    ]
    expected = pd.DataFrame(rows, columns=["DateColWithBigInt", "StringCol"])

    result = pd.read_excel("testdateoverflow" + read_ext)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_sheet_name(self, read_ext, df_ref):
    """
    ``sheet_name`` passed as a keyword selects Sheet1, whatever its
    position among the keyword arguments.
    """
    path = "test1" + read_ext
    sheet_name = "Sheet1"

    first = pd.read_excel(path, sheet_name=sheet_name, index_col=0)  # doc
    with ignore_xlrd_time_clock_warning():
        second = pd.read_excel(path, index_col=0, sheet_name=sheet_name)

    for frame in (first, second):
        tm.assert_frame_equal(frame, df_ref, check_names=False)
|
||||
|
||||
def test_excel_read_buffer(self, read_ext):
    """
    Reading from an open binary file object matches reading by path.
    """
    path = "test1" + read_ext
    from_path = pd.read_excel(path, "Sheet1", index_col=0)

    with open(path, "rb") as handle:
        from_buffer = pd.read_excel(handle, "Sheet1", index_col=0)
        tm.assert_frame_equal(from_path, from_buffer)
|
||||
|
||||
def test_bad_engine_raises(self, read_ext):
    """
    An unknown engine name raises ValueError mentioning that name.
    """
    with pytest.raises(ValueError, match="Unknown engine: foo"):
        pd.read_excel("", engine="foo")
|
||||
|
||||
@tm.network
def test_read_from_http_url(self, read_ext):
    """
    ``test1`` fetched over HTTPS equals the local copy (skipped for .ods).
    """
    if read_ext == ".ods":  # TODO: remove once on master
        pytest.skip()

    filename = "test1" + read_ext
    url = (
        "https://raw.github.com/pandas-dev/pandas/master/"
        "pandas/tests/io/data/" + filename
    )
    remote = pd.read_excel(url)
    local = pd.read_excel(filename)
    tm.assert_frame_equal(remote, local)
|
||||
|
||||
@td.skip_if_not_us_locale
def test_read_from_s3_url(self, read_ext, s3_resource):
    """
    ``test1`` uploaded to the "pandas-test" bucket and read back through an
    s3:// URL equals the local copy.
    """
    filename = "test1" + read_ext

    # Bucket "pandas-test" created in tests/io/conftest.py
    bucket = s3_resource.Bucket("pandas-test")
    with open(filename, "rb") as handle:
        bucket.put_object(Key=filename, Body=handle)

    remote = pd.read_excel("s3://pandas-test/" + filename)
    local = pd.read_excel(filename)
    tm.assert_frame_equal(remote, local)
|
||||
|
||||
@pytest.mark.slow
# ignore warning from old xlrd
@pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning")
def test_read_from_file_url(self, read_ext, datapath):
    """
    ``test1`` read through a file:// URL equals the same file read by path;
    the test is skipped when the URL read raises URLError.
    """
    # FILE
    local_path = os.path.join(datapath("io", "data"), "test1" + read_ext)
    local = pd.read_excel(local_path)

    file_url = "file://localhost/" + local_path
    try:
        remote = pd.read_excel(file_url)
    except URLError:
        # fails on some systems
        import platform

        system = " ".join(platform.uname()).strip()
        pytest.skip("failing on {}".format(system))

    tm.assert_frame_equal(remote, local)
|
||||
|
||||
def test_read_from_pathlib_path(self, read_ext):
    """
    A ``pathlib.Path`` argument reads the same data as the plain string.
    """
    # GH12655
    from pathlib import Path

    filename = "test1" + read_ext
    from_str = pd.read_excel(filename, "Sheet1", index_col=0)
    from_path = pd.read_excel(Path(filename), "Sheet1", index_col=0)

    tm.assert_frame_equal(from_str, from_path)
|
||||
|
||||
@td.skip_if_no("py.path")
def test_read_from_py_localpath(self, read_ext):
    """
    A ``py.path.local`` argument reads the same data as the plain string.
    """
    # GH12655
    from py.path import local as LocalPath

    filename = "test1" + read_ext
    from_str = pd.read_excel(filename, "Sheet1", index_col=0)

    local_path = LocalPath().join(filename)
    from_local = pd.read_excel(local_path, "Sheet1", index_col=0)

    tm.assert_frame_equal(from_str, from_local)
|
||||
|
||||
def test_reader_seconds(self, read_ext):
    """
    Times with and without fractional seconds are read identically from
    the ``times_1900`` and ``times_1904`` workbooks.
    """
    # Test reading times with and without milliseconds. GH5945.
    clock_times = [
        time(1, 2, 3),
        time(2, 45, 56, 100000),
        time(4, 29, 49, 200000),
        time(6, 13, 42, 300000),
        time(7, 57, 35, 400000),
        time(9, 41, 28, 500000),
        time(11, 25, 21, 600000),
        time(13, 9, 14, 700000),
        time(14, 53, 7, 800000),
        time(16, 37, 0, 900000),
        time(18, 20, 54),
    ]
    expected = DataFrame.from_dict({"Time": clock_times})

    for basename in ("times_1900", "times_1904"):
        actual = pd.read_excel(basename + read_ext, "Sheet1")
        tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_read_excel_multiindex(self, read_ext):
    """
    Read each sheet of ``testmultiindex`` with the matching ``header`` /
    ``index_col`` settings and compare with a frame whose index and
    columns are adjusted sheet by sheet.
    """
    # see gh-4679
    mi_file = "testmultiindex" + read_ext
    mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
    flat_columns = ["a", "b", "c", "d"]
    index_names = ["ilvl1", "ilvl2"]
    column_names = ["c1", "c2"]

    def read_sheet(sheet, **kwargs):
        return pd.read_excel(mi_file, sheet, **kwargs)

    # "mi_column" sheet
    rows = [
        [1, 2.5, pd.Timestamp("2015-01-01"), True],
        [2, 3.5, pd.Timestamp("2015-01-02"), False],
        [3, 4.5, pd.Timestamp("2015-01-03"), False],
        [4, 5.5, pd.Timestamp("2015-01-04"), True],
    ]
    expected = DataFrame(rows, columns=mi)
    actual = read_sheet("mi_column", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # "mi_index" sheet
    expected.index = mi
    expected.columns = flat_columns
    actual = read_sheet("mi_index", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "both" sheet
    expected.columns = mi
    actual = read_sheet("both", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)

    # "mi_index_name" sheet
    expected.columns = flat_columns
    expected.index = mi.set_names(index_names)
    actual = read_sheet("mi_index_name", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "mi_column_name" sheet
    expected.index = list(range(4))
    expected.columns = mi.set_names(column_names)
    actual = read_sheet("mi_column_name", header=[0, 1], index_col=0)
    tm.assert_frame_equal(actual, expected)

    # see gh-11317
    # "name_with_int" sheet
    expected.columns = mi.set_levels([1, 2], level=1).set_names(column_names)
    actual = read_sheet("name_with_int", index_col=0, header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_name" sheet
    expected.columns = mi.set_names(column_names)
    expected.index = mi.set_names(index_names)
    actual = read_sheet("both_name", index_col=[0, 1], header=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # "both_name_skiprows" sheet: same expectation, two leading rows skipped
    actual = read_sheet(
        "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2
    )
    tm.assert_frame_equal(actual, expected)
|
||||
|
||||
def test_read_excel_multiindex_header_only(self, read_ext):
    """
    A two-row header without ``index_col`` yields MultiIndex columns.
    """
    # see gh-11733.
    #
    # Don't try to parse a header name if there isn't one.
    path = "testmultiindex" + read_ext
    result = pd.read_excel(path, "index_col_none", header=[0, 1])

    expected_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
    expected = DataFrame([[1, 2, 3, 4]] * 2, columns=expected_columns)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_excel_old_index_format(self, read_ext):
    """
    Read the four sheets of ``test_index_name_pre17`` (single/multi index,
    with and without index names) and compare with hand-built frames.
    """
    # see gh-4679
    filename = "test_index_name_pre17" + read_ext

    def numbered(template, count=5):
        # e.g. "C_l0_g{}" -> ["C_l0_g0", ..., "C_l0_g4"]
        return [template.format(i) for i in range(count)]

    columns = numbered("C_l0_g{}")
    outer_labels = numbered("R_l0_g{}")
    inner_labels = numbered("R_l1_g{}")
    # the 5x5 body of cell values "R<row>C<col>"
    body = [["R{}C{}".format(r, c) for c in range(5)] for r in range(5)]

    # We detect headers to determine if index names exist, so
    # that "index" name in the "names" version of the data will
    # now be interpreted as rows that include null data.
    data = np.array([[None] * 5] + body)
    mi = MultiIndex(
        levels=[["R0"] + outer_labels, ["R1"] + inner_labels],
        codes=[list(range(6)), list(range(6))],
        names=[None, None],
    )
    si = Index(["R0"] + outer_labels, name=None)

    expected = pd.DataFrame(data, index=si, columns=columns)
    actual = pd.read_excel(filename, "single_names", index_col=0)
    tm.assert_frame_equal(actual, expected)

    expected.index = mi
    actual = pd.read_excel(filename, "multi_names", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # The analogous versions of the "names" version data
    # where there are explicitly no names for the indices.
    data = np.array(body)
    mi = MultiIndex(
        levels=[outer_labels, inner_labels],
        codes=[list(range(5)), list(range(5))],
        names=[None, None],
    )
    si = Index(outer_labels, name=None)

    expected = pd.DataFrame(data, index=si, columns=columns)
    actual = pd.read_excel(filename, "single_no_names", index_col=0)
    tm.assert_frame_equal(actual, expected)

    expected.index = mi
    actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)
|
||||
|
||||
def test_read_excel_bool_header_arg(self, read_ext):
    """A boolean ``header`` argument is rejected with TypeError (GH 6114)."""
    path = "test1" + read_ext
    for header_flag in (True, False):
        with pytest.raises(TypeError):
            pd.read_excel(path, header=header_flag)
|
||||
|
||||
def test_read_excel_chunksize(self, read_ext):
    """Passing ``chunksize`` raises NotImplementedError (GH 8011)."""
    path = "test1" + read_ext
    with pytest.raises(NotImplementedError):
        pd.read_excel(path, chunksize=100)
|
||||
|
||||
def test_read_excel_skiprows_list(self, read_ext):
    """``skiprows`` as a list and as an ndarray give the same frame (GH 4903)."""
    path = "testskiprows" + read_ext
    sheet = "skiprows_list"

    rows = [
        [1, 2.5, pd.Timestamp("2015-01-01"), True],
        [2, 3.5, pd.Timestamp("2015-01-02"), False],
        [3, 4.5, pd.Timestamp("2015-01-03"), False],
        [4, 5.5, pd.Timestamp("2015-01-04"), True],
    ]
    expected = DataFrame(rows, columns=["a", "b", "c", "d"])

    # The same rows are skipped, first via a plain list, then via a NumPy array.
    from_list = pd.read_excel(path, sheet, skiprows=[0, 2])
    tm.assert_frame_equal(from_list, expected)

    from_array = pd.read_excel(path, sheet, skiprows=np.array([0, 2]))
    tm.assert_frame_equal(from_array, expected)
|
||||
|
||||
def test_read_excel_nrows(self, read_ext):
    """``nrows`` limits the result to the leading rows of the sheet (GH 16645)."""
    path = "test1" + read_ext
    limit = 5

    truncated = pd.read_excel(path, nrows=limit)
    # Reference: read everything, then slice to the same number of rows.
    reference = pd.read_excel(path)[:limit]

    tm.assert_frame_equal(truncated, reference)
|
||||
|
||||
def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
    """An ``nrows`` larger than the sheet returns every row (GH 16645)."""
    path = "test1" + read_ext
    everything = pd.read_excel(path)

    # Ask for ten rows more than the unrestricted read produced.
    oversized = len(everything) + 10
    result = pd.read_excel(path, nrows=oversized)

    tm.assert_frame_equal(result, everything)
|
||||
|
||||
def test_read_excel_nrows_non_integer_parameter(self, read_ext):
    """A string ``nrows`` is rejected with ValueError (GH 16645)."""
    path = "test1" + read_ext
    with pytest.raises(ValueError, match="'nrows' must be an integer >=0"):
        pd.read_excel(path, nrows="5")
|
||||
|
||||
def test_read_excel_squeeze(self, read_ext):
    """Exercise ``squeeze=True`` on one- and two-column sheets (GH 12157)."""
    path = "test_squeeze" + read_ext

    # Two columns with one used as the index: compared against a Series.
    result = pd.read_excel(path, "two_columns", index_col=0, squeeze=True)
    wanted = pd.Series([2, 3, 4], index=pd.Index([4, 5, 6], name="a"), name="b")
    tm.assert_series_equal(result, wanted)

    # Two data columns and no index column: compared against a DataFrame.
    result = pd.read_excel(path, "two_columns", squeeze=True)
    wanted = pd.DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
    tm.assert_frame_equal(result, wanted)

    # A single column: compared against a Series.
    result = pd.read_excel(path, "one_column", squeeze=True)
    wanted = pd.Series([1, 2, 3], name="a")
    tm.assert_series_equal(result, wanted)
|
||||
|
||||
|
||||
class TestExcelFileRead:
    """Reading tests that go through an explicit ``pd.ExcelFile`` object."""

    @pytest.fixture(autouse=True)
    def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext):
        """
        Change directory and set engine for ExcelFile objects.

        Engine/extension pairs excluded by the checks below are skipped.
        ``pd.ExcelFile`` is then replaced, for the duration of the test, by a
        partial that always passes ``engine``.
        """
        if engine == "odf" and read_ext != ".ods":
            pytest.skip()
        if read_ext == ".ods" and engine != "odf":
            pytest.skip()
        if engine == "openpyxl" and read_ext == ".xls":
            pytest.skip()

        func = partial(pd.ExcelFile, engine=engine)
        # The tests below open data files by bare file name.
        monkeypatch.chdir(datapath("io", "data"))
        monkeypatch.setattr(pd, "ExcelFile", func)

    def test_excel_passes_na(self, read_ext):
        """Check ``keep_default_na`` / ``na_values`` when reading an ExcelFile."""
        with pd.ExcelFile("test4" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, "Sheet1", keep_default_na=False, na_values=["apple"]
            )
        expected = DataFrame(
            [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        with pd.ExcelFile("test4" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, "Sheet1", keep_default_na=True, na_values=["apple"]
            )
        expected = DataFrame(
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        # 13967
        with pd.ExcelFile("test5" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, "Sheet1", keep_default_na=False, na_values=["apple"]
            )
        expected = DataFrame(
            [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

        with pd.ExcelFile("test5" + read_ext) as excel:
            parsed = pd.read_excel(
                excel, "Sheet1", keep_default_na=True, na_values=["apple"]
            )
        expected = DataFrame(
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
        )
        tm.assert_frame_equal(parsed, expected)

    @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"])
    def test_unexpected_kwargs_raises(self, read_ext, arg):
        """Each parametrized keyword is rejected with TypeError (gh-17964)."""
        # gh-17964
        kwarg = {arg: "Sheet1"}
        msg = "unexpected keyword argument `{}`".format(arg)

        with pd.ExcelFile("test1" + read_ext) as excel:
            with pytest.raises(TypeError, match=msg):
                pd.read_excel(excel, **kwarg)

    def test_excel_table_sheet_by_index(self, read_ext, df_ref):
        """Select sheets by integer position via ``read_excel`` and ``parse``."""
        with pd.ExcelFile("test1" + read_ext) as excel:
            df1 = pd.read_excel(excel, 0, index_col=0)
            df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        with pd.ExcelFile("test1" + read_ext) as excel:
            df1 = excel.parse(0, index_col=0)
            df2 = excel.parse(1, skiprows=[1], index_col=0)
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        # skipfooter=1 is compared against the full frame minus its last row.
        with pd.ExcelFile("test1" + read_ext) as excel:
            df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1)
        tm.assert_frame_equal(df3, df1.iloc[:-1])

        # The ``skip_footer`` spelling is expected to emit a FutureWarning.
        with tm.assert_produces_warning(
            FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
        ):
            with pd.ExcelFile("test1" + read_ext) as excel:
                df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1)

        tm.assert_frame_equal(df3, df4)

        with pd.ExcelFile("test1" + read_ext) as excel:
            df3 = excel.parse(0, index_col=0, skipfooter=1)

        tm.assert_frame_equal(df3, df1.iloc[:-1])

    def test_sheet_name(self, read_ext, df_ref):
        """``ExcelFile.parse`` accepts ``sheet_name`` in either keyword order."""
        filename = "test1"
        sheet_name = "Sheet1"

        with pd.ExcelFile(filename + read_ext) as excel:
            df1_parse = excel.parse(sheet_name=sheet_name, index_col=0)  # doc

        with pd.ExcelFile(filename + read_ext) as excel:
            df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)

        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)

    def test_excel_read_buffer(self, engine, read_ext):
        """Reading from an open binary handle matches reading from the path."""
        pth = "test1" + read_ext
        expected = pd.read_excel(pth, "Sheet1", index_col=0, engine=engine)

        with open(pth, "rb") as f:
            with pd.ExcelFile(f) as xls:
                actual = pd.read_excel(xls, "Sheet1", index_col=0)

        tm.assert_frame_equal(expected, actual)

    def test_reader_closes_file(self, engine, read_ext):
        """Leaving the ``ExcelFile`` context closes the handle it was given."""
        # ``with open`` releases the handle even if ExcelFile construction or
        # parsing raises. The assertion sits inside the outer block so that it
        # still verifies ExcelFile (not ``open``'s own exit) closed the file.
        with open("test1" + read_ext, "rb") as f:
            with pd.ExcelFile(f) as xlsx:
                # parses okay
                pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine)

            assert f.closed

    def test_conflicting_excel_engines(self, read_ext):
        """An ``engine`` passed alongside an ExcelFile raises ValueError."""
        # GH 26566
        msg = "Engine should not be specified when passing an ExcelFile"

        with pd.ExcelFile("test1" + read_ext) as xl:
            with pytest.raises(ValueError, match=msg):
                pd.read_excel(xl, engine="foo")
|
||||
@@ -0,0 +1,169 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util.testing import ensure_clean
|
||||
|
||||
from pandas.io.excel import ExcelWriter
|
||||
from pandas.io.formats.excel import ExcelFormatter
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "engine",
    [
        pytest.param(
            "xlwt",
            marks=pytest.mark.xfail(
                reason="xlwt does not support openpyxl-compatible style dicts"
            ),
        ),
        "xlsxwriter",
        "openpyxl",
    ],
)
def test_styler_to_excel(engine):
    """
    Write plain, unstyled-Styler, styled and custom-converted sheets with
    ``engine`` and compare the cell styles read back through openpyxl.

    For engines other than openpyxl/xlsxwriter only the writing step is
    exercised (smoke test).
    """

    def style(df):
        """Return a frame of CSS declarations with the index/columns of ``df``."""
        # XXX: RGB colors not supported in xlwt
        return DataFrame(
            [
                ["font-weight: bold", "", ""],
                ["", "color: blue", ""],
                ["", "", "text-decoration: underline"],
                ["border-style: solid", "", ""],
                ["", "font-style: italic", ""],
                ["", "", "text-align: right"],
                ["background-color: red", "", ""],
                ["number-format: 0%", "", ""],
                ["", "", ""],
                ["", "", ""],
                ["", "", ""],
            ],
            index=df.index,
            columns=df.columns,
        )

    def assert_equal_style(cell1, cell2, engine):
        """Assert that two openpyxl cells carry identical style attributes."""
        if engine in ["xlsxwriter", "openpyxl"]:
            # NOTE: pytest.xfail() ends the test at this point for these engines.
            pytest.xfail(
                reason=(
                    "GH25351: failing on some attribute "
                    "comparisons in {}".format(engine)
                )
            )
        # XXX: should find a better way to check equality
        assert cell1.alignment.__dict__ == cell2.alignment.__dict__
        assert cell1.border.__dict__ == cell2.border.__dict__
        assert cell1.fill.__dict__ == cell2.fill.__dict__
        assert cell1.font.__dict__ == cell2.font.__dict__
        assert cell1.number_format == cell2.number_format
        assert cell1.protection.__dict__ == cell2.protection.__dict__

    def custom_converter(css):
        """Map any non-empty CSS string to a bold font, anything else to no style."""
        # use bold iff there is custom style attached to the cell
        if css.strip(" \n;"):
            return {"font": {"bold": True}}
        return {}

    pytest.importorskip("jinja2")
    pytest.importorskip(engine)

    # Prepare spreadsheets

    df = DataFrame(np.random.randn(11, 3))
    with ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path:
        writer = ExcelWriter(path, engine=engine)
        df.to_excel(writer, sheet_name="frame")
        df.style.to_excel(writer, sheet_name="unstyled")
        styled = df.style.apply(style, axis=None)
        styled.to_excel(writer, sheet_name="styled")
        ExcelFormatter(styled, style_converter=custom_converter).write(
            writer, sheet_name="custom"
        )
        writer.save()

        if engine not in ("openpyxl", "xlsxwriter"):
            # For other engines, we only smoke test
            return
        openpyxl = pytest.importorskip("openpyxl")
        wb = openpyxl.load_workbook(path)

        # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled
        n_cells = 0
        for col1, col2 in zip(wb["frame"].columns, wb["unstyled"].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                assert cell1.value == cell2.value
                assert_equal_style(cell1, cell2, engine)
                n_cells += 1

        # ensure iteration actually happened:
        assert n_cells == (11 + 1) * (3 + 1)

        # (2) check styling with default converter

        # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF
        alpha = "00" if engine == "openpyxl" else "FF"

        n_cells = 0
        for col1, col2 in zip(wb["frame"].columns, wb["styled"].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                # ``coordinate`` is the "B2"-style reference. Building it from
                # ``cell.column`` breaks on openpyxl versions where ``column``
                # is an integer, which silently disabled the branches below.
                ref = cell2.coordinate
                # XXX: this isn't as strong a test as ideal; we should
                #      confirm that differences are exclusive
                if ref == "B2":
                    assert not cell1.font.bold
                    assert cell2.font.bold
                elif ref == "C3":
                    assert cell1.font.color.rgb != cell2.font.color.rgb
                    assert cell2.font.color.rgb == alpha + "0000FF"
                elif ref == "D4":
                    assert cell1.font.underline != cell2.font.underline
                    assert cell2.font.underline == "single"
                elif ref == "B5":
                    assert not cell1.border.left.style
                    assert (
                        cell2.border.top.style
                        == cell2.border.right.style
                        == cell2.border.bottom.style
                        == cell2.border.left.style
                        == "medium"
                    )
                elif ref == "C6":
                    assert not cell1.font.italic
                    assert cell2.font.italic
                elif ref == "D7":
                    assert cell1.alignment.horizontal != cell2.alignment.horizontal
                    assert cell2.alignment.horizontal == "right"
                elif ref == "B8":
                    assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb
                    assert cell1.fill.patternType != cell2.fill.patternType
                    assert cell2.fill.fgColor.rgb == alpha + "FF0000"
                    assert cell2.fill.patternType == "solid"
                elif ref == "B9":
                    assert cell1.number_format == "General"
                    assert cell2.number_format == "0%"
                else:
                    assert_equal_style(cell1, cell2, engine)

                assert cell1.value == cell2.value
                n_cells += 1

        assert n_cells == (11 + 1) * (3 + 1)

        # (3) check styling with custom converter
        n_cells = 0
        for col1, col2 in zip(wb["frame"].columns, wb["custom"].columns):
            assert len(col1) == len(col2)
            for cell1, cell2 in zip(col1, col2):
                # Same version-independent cell reference as in section (2).
                ref = cell2.coordinate
                if ref in ("B2", "C3", "D4", "B5", "C6", "D7", "B8", "B9"):
                    assert not cell1.font.bold
                    assert cell2.font.bold
                else:
                    assert_equal_style(cell1, cell2, engine)

                assert cell1.value == cell2.value
                n_cells += 1

        assert n_cells == (11 + 1) * (3 + 1)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas.util.testing as tm
|
||||
from pandas.util.testing import ensure_clean
|
||||
|
||||
from pandas.io.excel import ExcelFile
|
||||
|
||||
xlrd = pytest.importorskip("xlrd")
|
||||
xlwt = pytest.importorskip("xlwt")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def skip_ods_files(read_ext):
    """Skip the current test when the extension under test is ``.ods``."""
    if read_ext != ".ods":
        return
    pytest.skip("Not valid for xlrd")
|
||||
|
||||
|
||||
def test_read_xlrd_book(read_ext, frame):
    """Write ``frame`` to Excel and read it back from an ``xlrd`` workbook."""
    sheet = "SheetA"
    reader_engine = "xlrd"

    with ensure_clean(read_ext) as tmp_path:
        frame.to_excel(tmp_path, sheet)
        workbook = xlrd.open_workbook(tmp_path)

        # First through an ExcelFile wrapping the already-open workbook ...
        with ExcelFile(workbook, engine=reader_engine) as xl:
            via_excel_file = pd.read_excel(xl, sheet, index_col=0)
            tm.assert_frame_equal(frame, via_excel_file)

        # ... then by handing the workbook straight to read_excel.
        direct = pd.read_excel(
            workbook, sheet_name=sheet, engine=reader_engine, index_col=0
        )
        tm.assert_frame_equal(frame, direct)
|
||||
|
||||
|
||||
# TODO: test for openpyxl as well
|
||||
def test_excel_table_sheet_by_index(datapath, read_ext):
    """Asking for the sheet named "asdf" raises ``xlrd.XLRDError``."""
    workbook_path = datapath("io", "data", "test1" + read_ext)
    with pd.ExcelFile(workbook_path) as workbook:
        with pytest.raises(xlrd.XLRDError):
            pd.read_excel(workbook, "asdf")
|
||||
@@ -0,0 +1,64 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.util.testing import ensure_clean
|
||||
|
||||
from pandas.io.excel import ExcelWriter
|
||||
|
||||
xlsxwriter = pytest.importorskip("xlsxwriter")
|
||||
|
||||
pytestmark = pytest.mark.parametrize("ext", [".xlsx"])
|
||||
|
||||
|
||||
def test_column_format(ext):
    """
    Check that a column-level number format reaches the cells (issue #9167).

    Applicable to xlsxwriter only: the file is written with xlsxwriter and
    read back with openpyxl to inspect the resulting cell format.
    """
    with warnings.catch_warnings():
        # Ignore the openpyxl lxml warning.
        warnings.simplefilter("ignore")
        openpyxl = pytest.importorskip("openpyxl")

    with ensure_clean(ext) as path:
        frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})

        # Pin the engine: the workbook calls below (worksheets(), add_format,
        # set_column) are xlsxwriter APIs, so do not depend on whichever
        # writer pandas would otherwise select for this extension.
        writer = ExcelWriter(path, engine="xlsxwriter")
        frame.to_excel(writer)

        # Add a number format to col B and ensure it is applied to cells.
        num_format = "#,##0"
        write_workbook = writer.book
        write_worksheet = write_workbook.worksheets()[0]
        col_format = write_workbook.add_format({"num_format": num_format})
        write_worksheet.set_column("B:B", None, col_format)
        writer.save()

        read_workbook = openpyxl.load_workbook(path)
        try:
            read_worksheet = read_workbook["Sheet1"]
        except TypeError:
            # compat
            read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1")

        # Get the number format from the cell.
        try:
            cell = read_worksheet["B2"]
        except TypeError:
            # compat
            cell = read_worksheet.cell("B2")

        # Deliberately broad: fall back to the alternative attribute path
        # whenever ``number_format`` cannot be read directly.
        try:
            read_num_format = cell.number_format
        except Exception:
            read_num_format = cell.style.number_format._format_code

        assert read_num_format == num_format
|
||||
|
||||
|
||||
def test_write_append_mode_raises(ext):
    """Opening an xlsxwriter ``ExcelWriter`` with ``mode="a"`` raises ValueError."""
    expected_message = "Append mode is not supported with xlsxwriter!"

    with ensure_clean(ext) as tmp_path:
        with pytest.raises(ValueError, match=expected_message):
            ExcelWriter(tmp_path, engine="xlsxwriter", mode="a")
|
||||
@@ -0,0 +1,67 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame, MultiIndex
|
||||
from pandas.util.testing import ensure_clean
|
||||
|
||||
from pandas.io.excel import ExcelWriter, _XlwtWriter
|
||||
|
||||
xlwt = pytest.importorskip("xlwt")
|
||||
|
||||
# Parametrize the tests in this module with the single ".xls" extension.
pytestmark = pytest.mark.parametrize("ext", [".xls"])
|
||||
|
||||
|
||||
def test_excel_raise_error_on_multiindex_columns_and_no_index(ext):
    """MultiIndex columns with ``index=False`` raise NotImplementedError (9794)."""
    column_tuples = [("site", ""), ("2014", "height"), ("2014", "weight")]
    multi_columns = MultiIndex.from_tuples(column_tuples)
    frame = DataFrame(np.random.randn(10, 3), columns=multi_columns)

    with pytest.raises(NotImplementedError):
        with ensure_clean(ext) as tmp_path:
            frame.to_excel(tmp_path, index=False)
|
||||
|
||||
|
||||
def test_excel_multiindex_columns_and_index_true(ext):
    """A frame with MultiIndex columns is written with ``index=True``."""
    column_tuples = [("site", ""), ("2014", "height"), ("2014", "weight")]
    multi_columns = MultiIndex.from_tuples(column_tuples)
    frame = pd.DataFrame(np.random.randn(10, 3), columns=multi_columns)

    with ensure_clean(ext) as tmp_path:
        frame.to_excel(tmp_path, index=True)
|
||||
|
||||
|
||||
def test_excel_multiindex_index(ext):
    """A frame with a MultiIndex row index is written without error (#9794)."""
    index_tuples = [("site", ""), ("2014", "height"), ("2014", "weight")]
    multi_index = MultiIndex.from_tuples(index_tuples)
    frame = DataFrame(np.random.randn(3, 10), index=multi_index)

    with ensure_clean(ext) as tmp_path:
        frame.to_excel(tmp_path, index=False)
|
||||
|
||||
|
||||
def test_to_excel_styleconverter(ext):
    """``_XlwtWriter._convert_to_style`` maps a style dict onto xlwt attributes."""
    header_style = {
        "font": {"bold": True},
        "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"},
        "alignment": {"horizontal": "center", "vertical": "top"},
    }

    converted = _XlwtWriter._convert_to_style(header_style)

    assert converted.font.bold
    # All four border sides were requested as "thin".
    for side in ("top", "right", "bottom", "left"):
        assert xlwt.Borders.THIN == getattr(converted.borders, side)
    assert xlwt.Alignment.HORZ_CENTER == converted.alignment.horz
    assert xlwt.Alignment.VERT_TOP == converted.alignment.vert
|
||||
|
||||
|
||||
def test_write_append_mode_raises(ext):
    """Opening an xlwt ``ExcelWriter`` with ``mode="a"`` raises ValueError."""
    expected_message = "Append mode is not supported with xlwt!"

    with ensure_clean(ext) as tmp_path:
        with pytest.raises(ValueError, match=expected_message):
            ExcelWriter(tmp_path, engine="xlwt", mode="a")
|
||||
Reference in New Issue
Block a user