8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions


@@ -0,0 +1,81 @@
import os
import pytest
from pandas import read_csv, read_table
class BaseParser:
engine = None
low_memory = True
float_precision_choices = []
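# update_kwargs injects this subclass's ``engine`` and ``low_memory``
# settings into every read_csv/read_table call, so each test exercises the
# intended parser configuration.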
def update_kwargs(self, kwargs):
kwargs = kwargs.copy()
kwargs.update(dict(engine=self.engine, low_memory=self.low_memory))
return kwargs
def read_csv(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_csv(*args, **kwargs)
def read_table(self, *args, **kwargs):
kwargs = self.update_kwargs(kwargs)
return read_table(*args, **kwargs)
class CParser(BaseParser):
engine = "c"
float_precision_choices = [None, "high", "round_trip"]
class CParserHighMemory(CParser):
low_memory = False
class CParserLowMemory(CParser):
low_memory = True
class PythonParser(BaseParser):
engine = "python"
float_precision_choices = [None]
@pytest.fixture
def csv_dir_path(datapath):
return datapath("io", "parser", "data")
@pytest.fixture
def csv1(csv_dir_path):
return os.path.join(csv_dir_path, "test1.csv")
_cParserHighMemory = CParserHighMemory()
_cParserLowMemory = CParserLowMemory()
_pythonParser = PythonParser()
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_all_parsers = _c_parsers_only + _py_parsers_only
_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_all_parser_ids = _c_parser_ids + _py_parser_ids
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
return request.param
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
return request.param
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
return request.param
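# Minimal usage sketch (illustration only, not part of this commit): a test
# that requests the ``all_parsers`` fixture runs once per parser/engine
# combination defined above, e.g.
#
#     from io import StringIO
#     from pandas import DataFrame
#     import pandas.util.testing as tm
#
#     def test_roundtrip(all_parsers):
#         parser = all_parsers
#         result = parser.read_csv(StringIO("a,b\n1,2"))
#         tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))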


@@ -0,0 +1,599 @@
"""
Tests that apply specifically to the CParser. Unless a test is specifically
marked as a CParser-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the Python parser can accept
further arguments when parsing.
"""
from io import BytesIO, StringIO, TextIOWrapper
import mmap
import os
import tarfile
import numpy as np
import pytest
from pandas.errors import ParserError
import pandas.util._test_decorators as td
from pandas import DataFrame, concat
import pandas.util.testing as tm
@pytest.mark.parametrize(
"malformed",
["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
# see gh-9205: test certain malformed input files that cause
# buffer overflows in tokenizer.c
msg = "Buffer overflow caught - possible malformed input file."
parser = c_parser_only
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(malformed))
def test_buffer_rd_bytes(c_parser_only):
# see gh-12098: src->buffer in the C parser can be freed twice leading
# to a segfault if a corrupt gzip file is read with 'read_csv', and the
# buffer is filled more than once before gzip raises an Exception.
data = (
"\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
"\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
"\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
"\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
)
parser = c_parser_only
for _ in range(100):
try:
parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True)
except Exception:
pass
def test_delim_whitespace_custom_terminator(c_parser_only):
# See gh-12912
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)
def test_dtype_and_names_error(c_parser_only):
# see gh-8833: passing both dtype and names
# resulting in an error reporting issue
parser = c_parser_only
data = """
1.0 1
2.0 2
3.0 3
"""
# base cases
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
# fallback casting
result = parser.read_csv(
StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
)
expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
expected["a"] = expected["a"].astype(np.int32)
tm.assert_frame_equal(result, expected)
data = """
1.0 1
nan 2
3.0 3
"""
# fallback casting, but not castable
with pytest.raises(ValueError, match="cannot safely convert"):
parser.read_csv(
StringIO(data),
sep=r"\s+",
header=None,
names=["a", "b"],
dtype={"a": np.int32},
)
@pytest.mark.parametrize(
"match,kwargs",
[
# For each of these cases, all of the dtypes are valid, just unsupported.
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
dict(dtype={"A": "datetime64", "B": "float64"}),
),
(
(
"the dtype datetime64 is not supported for parsing, "
"pass this column using parse_dates instead"
),
dict(dtype={"A": "datetime64", "B": "float64"}, parse_dates=["B"]),
),
(
"the dtype timedelta64 is not supported for parsing",
dict(dtype={"A": "timedelta64", "B": "float64"}),
),
("the dtype <U8 is not supported for parsing", dict(dtype={"A": "U8"})),
],
ids=["dt64-0", "dt64-1", "td64", "<U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
parser = c_parser_only
df = DataFrame(
np.random.rand(5, 2), columns=list("AB"), index=["1A", "1B", "1C", "1D", "1E"]
)
with tm.ensure_clean("__unsupported_dtype__.csv") as path:
df.to_csv(path)
with pytest.raises(TypeError, match=match):
parser.read_csv(path, index_col=0, **kwargs)
@td.skip_if_32bit
def test_precise_conversion(c_parser_only):
from decimal import Decimal
parser = c_parser_only
normal_errors = []
precise_errors = []
# test numbers between 1 and 2
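# For each sampled number, parse it with the default, "high" and
# "round_trip" float_precision modes and measure each result's error
# against the exact Decimal value.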
for num in np.linspace(1.0, 2.0, num=500):
# 25 decimal digits of precision
text = "a\n{0:.25}".format(num)
normal_val = float(parser.read_csv(StringIO(text))["a"][0])
precise_val = float(
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
)
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])
def error(val):
return abs(Decimal("{0:.100}".format(val)) - actual_val)
normal_errors.append(error(normal_val))
precise_errors.append(error(precise_val))
# round-trip should match float()
assert roundtrip_val == float(text[2:])
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
def test_usecols_dtypes(c_parser_only):
parser = c_parser_only
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
result = parser.read_csv(
StringIO(data),
usecols=(0, 1, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
result2 = parser.read_csv(
StringIO(data),
usecols=(0, 2),
names=("a", "b", "c"),
header=None,
converters={"a": str},
dtype={"b": int, "c": float},
)
assert (result.dtypes == [object, np.int, np.float]).all()
assert (result2.dtypes == [object, np.float]).all()
def test_disable_bool_parsing(c_parser_only):
# see gh-2090
parser = c_parser_only
data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""
result = parser.read_csv(StringIO(data), dtype=object)
assert (result.dtypes == object).all()
result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
assert result["B"][2] == ""
def test_custom_lineterminator(c_parser_only):
parser = c_parser_only
data = "a,b,c~1,2,3~4,5,6"
result = parser.read_csv(StringIO(data), lineterminator="~")
expected = parser.read_csv(StringIO(data.replace("~", "\n")))
tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(c_parser_only):
parser = c_parser_only
data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""
nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
result = parser.read_csv(
StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
)
expected = parser.read_csv(
StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
)
tm.assert_frame_equal(result, expected)
# Too many columns would cause a segfault if not handled carefully.
data = "1,2\n3,4,5"
result = parser.read_csv(StringIO(data), header=None, names=range(50))
expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
columns=range(50)
)
tm.assert_frame_equal(result, expected)
def test_tokenize_CR_with_quoting(c_parser_only):
# see gh-3453
parser = c_parser_only
data = ' a,b,c\r"a,b","e,d","f,f"'
result = parser.read_csv(StringIO(data), header=None)
expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
tm.assert_frame_equal(result, expected)
result = parser.read_csv(StringIO(data))
expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
tm.assert_frame_equal(result, expected)
def test_grow_boundary_at_cap(c_parser_only):
# See gh-12494
#
# The cause of the error was that the C parser
# was not increasing the buffer size when
# the desired space would fill the buffer
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream.
parser = c_parser_only
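# Helper: a header row made of ``count`` commas and no data rows should
# parse into ``count + 1`` unnamed, empty columns.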
def test_empty_header_read(count):
s = StringIO("," * count)
expected = DataFrame(
columns=["Unnamed: {i}".format(i=i) for i in range(count + 1)]
)
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)
for cnt in range(1, 101):
test_empty_header_read(cnt)
def test_parse_trim_buffers(c_parser_only):
# This test is part of a bugfix for gh-13703. It attempts to
# stress the system memory allocator, to cause it to move the
# stream buffer and either let the OS reclaim the region, or let
# other memory requests of the parser otherwise modify the contents
# of the memory space where it was formerly located.
# This test is designed to cause a `segfault` with unpatched
# `tokenizer.c`. Sometimes the test fails with a `segfault`, other
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.
parser = c_parser_only
# Generate a large mixed-type CSV file on-the-fly (one record is
# approx 1.5KiB).
record_ = (
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
"""ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
"""ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
"""99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
"""9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
"""99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
"""99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
"""ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
"""ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
"""ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
"""9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
"""999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
""",,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
""",9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
"""999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
""",9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
"""ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
""",999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
""",,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
"""9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
""".99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
""",,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
"""99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
"""ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
"""-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
""",9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
""",99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
""".99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
)
# Set the number of lines so that a call to `parser_trim_buffers`
# is triggered: after a couple of full chunks are consumed, a
# relatively small 'residual' chunk would cause reallocation
# within the parser.
chunksize, n_lines = 128, 2 * 128 + 15
csv_data = "\n".join([record_] * n_lines) + "\n"
# We will use StringIO to load the CSV from this text buffer.
# pd.read_csv() will iterate over the file in chunks and will
# finally read a residual chunk of really small size.
# Generate the expected output: manually create the DataFrame
# by splitting the record by comma and repeating it `n_lines` times.
row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
expected = DataFrame(
[row for _ in range(n_lines)], dtype=object, columns=None, index=None
)
# Iterate over the CSV file in chunks of `chunksize` lines
chunks_ = parser.read_csv(
StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
)
result = concat(chunks_, axis=0, ignore_index=True)
# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)
# This extra test was added to replicate the fault in gh-5291.
# Force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
chunks_ = parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding="utf_8",
)
result = concat(chunks_, axis=0, ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_internal_null_byte(c_parser_only):
# see gh-14012
#
# The null byte ('\x00') should not be used as a
# true line terminator, escape character, or comment
# character, only as a placeholder to indicate that
# none was specified.
#
# This test should be moved to test_common.py ONLY when
# Python's csv class supports parsing '\x00'.
parser = c_parser_only
names = ["a", "b", "c"]
data = "1,2,3\n4,\x00,6\n7,8,9"
expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)
result = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
def test_read_nrows_large(c_parser_only):
# gh-7626: read only nrows of data for large inputs (> 262144 bytes)
parser = c_parser_only
header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2
df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
assert df.size == 1010 * 10
def test_float_precision_round_trip_with_text(c_parser_only):
# see gh-15140
parser = c_parser_only
df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
def test_large_difference_in_columns(c_parser_only):
# see gh-14125
parser = c_parser_only
count = 10000
large_row = ("X," * count)[:-1] + "\n"
normal_row = "XXXXXX XXXXXX,111111111111111\n"
test_input = (large_row + normal_row * 6)[:-1]
result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
rows = test_input.split("\n")
expected = DataFrame([row.split(",")[0] for row in rows])
tm.assert_frame_equal(result, expected)
def test_data_after_quote(c_parser_only):
# see gh-15910
parser = c_parser_only
data = 'a\n1\n"b"a'
result = parser.read_csv(StringIO(data))
expected = DataFrame({"a": ["1", "ba"]})
tm.assert_frame_equal(result, expected)
def test_comment_whitespace_delimited(c_parser_only, capsys):
parser = c_parser_only
test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
error_bad_lines=False,
)
captured = capsys.readouterr()
# skipped lines 2, 3, 4, 9
for line_num in (2, 3, 4, 9):
assert "Skipping line {}".format(line_num) in captured.err
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
tm.assert_frame_equal(df, expected)
def test_file_like_no_next(c_parser_only):
# gh-16530: the file-like object need not have a "next" or "__next__"
# attribute despite having an "__iter__" attribute.
#
# NOTE: This is only true for the C engine, not the Python engine.
class NoNextBuffer(StringIO):
def __next__(self):
raise AttributeError("No next method")
next = __next__
parser = c_parser_only
data = "a\n1"
expected = DataFrame({"a": [1]})
result = parser.read_csv(NoNextBuffer(data))
tm.assert_frame_equal(result, expected)
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
# see gh-22748
t = BytesIO(b"\xB0")
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
msg = "'utf-8' codec can't encode character"
with pytest.raises(UnicodeError, match=msg):
c_parser_only.read_csv(t, encoding="UTF-8")
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
# see gh-16530
#
# Unfortunately, Python's CSV library can't handle
# tarfile objects (expects string, not bytes when
# iterating through a file-like).
parser = c_parser_only
tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
with tarfile.open(tar_path, "r") as tar:
data_file = tar.extractfile("tar_data.csv")
out = parser.read_csv(data_file)
expected = DataFrame({"a": [1]})
tm.assert_frame_equal(out, expected)
@pytest.mark.high_memory
def test_bytes_exceed_2gb(c_parser_only):
# see gh-16798
#
# Read from a "CSV" that has a column larger than 2GB.
parser = c_parser_only
if parser.low_memory:
pytest.skip("not a high_memory test")
csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]))
df = parser.read_csv(csv)
assert not df.empty
def test_chunk_whitespace_on_boundary(c_parser_only):
# see gh-9735: this issue is C parser-specific (bug when
# parsing whitespace and characters at chunk boundary)
#
# This test case has a field too large for the Python parser / CSV library.
parser = c_parser_only
chunk1 = "a" * (1024 * 256 - 2) + "\na"
chunk2 = "\n a"
result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)
expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
tm.assert_frame_equal(result, expected)
def test_file_handles_mmap(c_parser_only, csv1):
# gh-14418
#
# Don't close user provided file handles.
parser = c_parser_only
with open(csv1, "r") as f:
m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
parser.read_csv(m)
assert not m.closed
m.close()
def test_file_binary_mode(c_parser_only):
# see gh-23779
parser = c_parser_only
expected = DataFrame([[1, 2, 3], [4, 5, 6]])
with tm.ensure_clean() as path:
with open(path, "w") as f:
f.write("1,2,3\n4,5,6")
with open(path, "rb") as f:
result = parser.read_csv(f, header=None)
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,136 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame
import pandas.util.testing as tm
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"read_kwargs", [dict(), dict(lineterminator="*"), dict(delim_whitespace=True)]
)
def test_line_comment(all_parsers, read_kwargs):
parser = all_parsers
data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
elif read_kwargs.get("lineterminator"):
if parser.engine != "c":
pytest.skip("Custom terminator not supported with Python engine")
data = data.replace("\n", read_kwargs.get("lineterminator"))
read_kwargs["comment"] = "#"
result = parser.read_csv(StringIO(data), **read_kwargs)
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows(all_parsers):
parser = all_parsers
data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# This should ignore the first four lines (including comments).
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
tm.assert_frame_equal(result, expected)
def test_comment_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# Header should begin at the second non-comment line.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
result = parser.read_csv(StringIO(data), comment="#", header=1)
tm.assert_frame_equal(result, expected)
def test_comment_skiprows_header(all_parsers):
parser = all_parsers
data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
# skiprows=4 skips the first four lines (including comments); header=1
# then takes the second of the remaining lines ("A,B,C") as the header.
expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
)
result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
parser = all_parsers
data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
result = parser.read_csv(
StringIO(data.replace("#", comment_char)), comment=comment_char
)
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
# see gh-4623
parser = all_parsers
data = "# notes\na,b,c\n# more notes\n1,2,3"
if header is None:
expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
else:
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), comment="#", header=header)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large.


@@ -0,0 +1,150 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
import os
import zipfile
import pytest
import pandas as pd
import pandas.util.testing as tm
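# The ``buffer`` fixture toggles between passing a file path and an open
# file object to read_csv in the compression tests below.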
@pytest.fixture(params=[True, False])
def buffer(request):
return request.param
@pytest.fixture
def parser_and_data(all_parsers, csv1):
parser = all_parsers
with open(csv1, "rb") as f:
data = f.read()
expected = parser.read_csv(csv1)
return parser, data, expected
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("test_file.zip") as path:
with zipfile.ZipFile(path, mode="w") as tmp:
tmp.writestr("test_file", data)
if compression == "zip2":
with open(path, "rb") as f:
result = parser.read_csv(f, compression="zip")
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
parser, data, expected = parser_and_data
with tm.ensure_clean("combined_zip.zip") as path:
inner_file_names = ["test_file", "second_file"]
with zipfile.ZipFile(path, mode="w") as tmp:
for file_name in inner_file_names:
tmp.writestr(file_name, data)
with pytest.raises(ValueError, match="Multiple files"):
parser.read_csv(path, compression=compression)
def test_zip_error_no_files(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with zipfile.ZipFile(path, mode="w"):
pass
with pytest.raises(ValueError, match="Zero files"):
parser.read_csv(path, compression="zip")
def test_zip_error_invalid_zip(parser_and_data):
parser, _, _ = parser_and_data
with tm.ensure_clean() as path:
with open(path, "wb") as f:
with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"):
parser.read_csv(f, compression="zip")
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(parser_and_data, compression_only, buffer, filename):
parser, data, expected = parser_and_data
compress_type = compression_only
ext = "gz" if compress_type == "gzip" else compress_type
filename = filename if filename is None else filename.format(ext=ext)
if filename and buffer:
pytest.skip("Cannot deduce compression from buffer of compressed data.")
with tm.ensure_clean(filename=filename) as path:
tm.write_to_compressed(compress_type, path, data)
compression = "infer" if filename else compress_type
if buffer:
with open(path, "rb") as f:
result = parser.read_csv(f, compression=compression)
else:
result = parser.read_csv(path, compression=compression)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
# see gh-9770
parser = all_parsers
kwargs = dict(index_col=0, parse_dates=True)
expected = parser.read_csv(csv1, **kwargs)
kwargs["compression"] = "infer"
if buffer:
with open(csv1) as f:
result = parser.read_csv(f, **kwargs)
else:
ext = "." + ext if ext else ""
result = parser.read_csv(csv1 + ext, **kwargs)
tm.assert_frame_equal(result, expected)
def test_compression_utf16_encoding(all_parsers, csv_dir_path):
# see gh-18071
parser = all_parsers
path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t")
expected = pd.DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
"Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
parser = all_parsers
compress_kwargs = dict(compression=invalid_compression)
msg = "Unrecognized compression type: {compression}".format(**compress_kwargs)
with pytest.raises(ValueError, match=msg):
parser.read_csv("test_file.zip", **compress_kwargs)


@@ -0,0 +1,160 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
from dateutil.parser import parse
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index
import pandas.util.testing as tm
def test_converters_type_must_be_dict(all_parsers):
parser = all_parsers
data = """index,A,B,C,D
foo,2,3,4,5
"""
with pytest.raises(TypeError, match="Type converters.+"):
parser.read_csv(StringIO(data), converters=0)
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
"converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer.
)
def test_converters(all_parsers, column, converter):
parser = all_parsers
data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
result = parser.read_csv(StringIO(data), converters={column: converter})
expected = parser.read_csv(StringIO(data))
expected["D"] = expected["D"].map(converter)
tm.assert_frame_equal(result, expected)
def test_converters_no_implicit_conv(all_parsers):
# see gh-2184
parser = all_parsers
data = """000102,1.2,A\n001245,2,B"""
converters = {0: lambda x: x.strip()}
result = parser.read_csv(StringIO(data), header=None, converters=converters)
# Column 0 should not be cast to numeric and should remain as object.
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
tm.assert_frame_equal(result, expected)
def test_converters_euro_decimal_format(all_parsers):
# see gh-583
converters = dict()
parser = all_parsers
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
converters["Number1"] = converters["Number2"] = converters[
"Number3"
] = lambda x: float(x.replace(",", "."))
result = parser.read_csv(StringIO(data), sep=";", converters=converters)
expected = DataFrame(
[
[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
[2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
[3, 878.158, 108013.434, "GHI", "rez", 2.7356],
],
columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
)
tm.assert_frame_equal(result, expected)
def test_converters_corner_with_nans(all_parsers):
parser = all_parsers
data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""
# Example converters.
def convert_days(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_days_sentinel(x):
x = x.strip()
if not x:
return np.nan
is_plus = x.endswith("+")
if is_plus:
x = int(x[:-1]) + 1
else:
x = int(x)
return x
def convert_score(x):
x = x.strip()
if not x:
return np.nan
if x.find("-") > 0:
val_min, val_max = map(int, x.split("-"))
val = 0.5 * (val_min + val_max)
else:
val = float(x)
return val
results = []
for day_converter in [convert_days, convert_days_sentinel]:
result = parser.read_csv(
StringIO(data),
converters={"score": convert_score, "days": day_converter},
na_values=["", None],
)
assert pd.isna(result["days"][1])
results.append(result)
tm.assert_frame_equal(results[0], results[1])
def test_converter_index_col_bug(all_parsers):
# see gh-1835
parser = all_parsers
data = "A;B\n1;2\n3;4"
rs = parser.read_csv(
StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x}
)
xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
tm.assert_frame_equal(rs, xp)


@@ -0,0 +1,144 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas.util.testing as tm
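# The ``custom_dialect`` fixture below defines a deliberately unusual CSV
# dialect; the conflict tests check when explicitly passed keyword arguments
# override it and when a ParserWarning is emitted.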
@pytest.fixture
def custom_dialect():
dialect_name = "weird"
dialect_kwargs = dict(
doublequote=False,
escapechar="~",
delimiter=":",
skipinitialspace=False,
quotechar="~",
quoting=3,
)
return dialect_name, dialect_kwargs
def test_dialect(all_parsers):
parser = all_parsers
data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
df = parser.read_csv(StringIO(data), dialect=dia)
data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
exp = parser.read_csv(StringIO(data))
exp.replace("a", '"a', inplace=True)
tm.assert_frame_equal(df, exp)
def test_dialect_str(all_parsers):
dialect_name = "mydialect"
parser = all_parsers
data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
with tm.with_csv_dialect(dialect_name, delimiter=":"):
df = parser.read_csv(StringIO(data), dialect=dialect_name)
tm.assert_frame_equal(df, exp)
def test_invalid_dialect(all_parsers):
class InvalidDialect:
pass
data = "a\n1"
parser = all_parsers
msg = "Invalid dialect"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dialect=InvalidDialect)
@pytest.mark.parametrize(
"arg",
[None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
warning_klass = None
kwds = dict()
# arg=None tests when we pass in the dialect without any other arguments.
if arg is not None:
if "value" == "dialect": # No conflict --> no warning.
kwds[arg] = dialect_kwargs[arg]
elif "value" == "default": # Default --> no warning.
from pandas.io.parsers import _parser_defaults
kwds[arg] = _parser_defaults[arg]
else: # Non-default + conflict with dialect --> warning.
warning_klass = ParserWarning
kwds[arg] = "blah"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwds)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,warning_klass",
[
(dict(sep=","), None), # sep is default --> sep_override=True
(dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False
(dict(delimiter=":"), None), # No conflict
(dict(delimiter=None), None), # Default arguments --> sep_override=True
(dict(delimiter=","), ParserWarning), # Conflict
(dict(delimiter="."), ParserWarning), # Conflict
],
ids=[
"sep-override-true",
"sep-override-false",
"delimiter-no-conflict",
"delimiter-default-arg",
"delimiter-conflict",
"delimiter-conflict2",
],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
# see gh-23761.
dialect_name, dialect_kwargs = custom_dialect
parser = all_parsers
expected = DataFrame({"a": [1], "b": [2]})
data = "a:b\n1:2"
with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
with tm.assert_produces_warning(warning_klass):
result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwargs)
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,552 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os
import numpy as np
import pytest
from pandas.errors import ParserWarning
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat
import pandas.util.testing as tm
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
def test_dtype_all_columns(all_parsers, dtype, check_orig):
# see gh-3795, gh-6607
parser = all_parsers
df = DataFrame(
np.random.rand(5, 2).round(4),
columns=list("AB"),
index=["1A", "1B", "1C", "1D", "1E"],
)
with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
df.to_csv(path)
result = parser.read_csv(path, dtype=dtype, index_col=0)
if check_orig:
expected = df.copy()
result = result.astype(float)
else:
expected = df.astype(str)
tm.assert_frame_equal(result, expected)
def test_dtype_all_columns_empty(all_parsers):
# see gh-12048
parser = all_parsers
result = parser.read_csv(StringIO("A,B"), dtype=str)
expected = DataFrame({"A": [], "B": []}, index=[], dtype=str)
tm.assert_frame_equal(result, expected)
def test_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
expected = DataFrame(
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
def test_invalid_dtype_per_column(all_parsers):
parser = all_parsers
data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
with pytest.raises(TypeError, match="data type 'foo' not understood"):
parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
@pytest.mark.parametrize(
"dtype",
[
"category",
CategoricalDtype(),
{"a": "category", "b": "category", "c": CategoricalDtype()},
],
)
def test_categorical_dtype(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["a", "a", "b"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_unsorted(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", "b", "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_missing(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", np.nan, "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
# see gh-18186
parser = all_parsers
data = np.sort([str(i) for i in range(524289)])
expected = DataFrame({"a": Categorical(data, ordered=True)})
actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
actual["a"] = actual["a"].cat.reorder_categories(
np.sort(actual.a.cat.categories), ordered=True
)
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
encoding = "latin-1"
expected = parser.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])
actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
encoding = "utf-16"
sep = ","
expected = parser.read_csv(pth, sep=sep, encoding=encoding)
expected = expected.apply(Categorical)
actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ["a", "b", "c"]
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
DataFrame(
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3]
),
]
dtype = CategoricalDtype(cats)
actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
"categories",
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(
["a", "b", "b", "c"], categories=categories, ordered=ordered
),
}
)
dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_category_dtype_unsorted(all_parsers):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(["c", "b", "a"])
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
}
)
result = parser.read_csv(StringIO(data), dtype={"b": dtype})
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_numeric(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([1, 2, 3])}
data = "b\n1\n1\n2\n3"
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))}
data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timestamp(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([Timestamp("2014")])}
data = "b\n2014-01-01\n2014-01-01T00:00:00"
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timedelta(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}
data = "b\n1H\n2H\n3H"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
"b\nTrue\nFalse\nNA\nFalse",
"b\ntrue\nfalse\nNA\nfalse",
"b\nTRUE\nFALSE\nNA\nFALSE",
"b\nTrue\nFalse\nNA\nFALSE",
],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
# see gh-20498
parser = all_parsers
dtype = {"b": CategoricalDtype([False, True])}
expected = DataFrame({"b": Categorical([True, False, None, False])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
data = "b\nd\na\nc\nd" # Unexpected c
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_empty_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(StringIO(data), dtype={"one": "u1"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=np.object)},
index=Index([], dtype=object),
)
tm.assert_frame_equal(result, expected)
def test_empty_with_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two"
result = parser.read_csv(
StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
)
expected = DataFrame(
{"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
)
tm.assert_frame_equal(result, expected)
def test_empty_with_multi_index_pass_dtype(all_parsers):
parser = all_parsers
data = "one,two,three"
result = parser.read_csv(
StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
)
exp_idx = MultiIndex.from_arrays(
[np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"]
)
expected = DataFrame({"three": np.empty(0, dtype=np.object)}, index=exp_idx)
tm.assert_frame_equal(result, expected)
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
index=Index([], dtype=object),
)
tm.assert_frame_equal(result, expected)
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
parser = all_parsers
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
expected = DataFrame(
{"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
index=Index([], dtype=object),
)
tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
expected.index = expected.index.astype(object)
data = "one,one"
result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat(
[Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
axis=1,
)
expected.index = expected.index.astype(object)
with pytest.raises(ValueError, match="Duplicate names"):
data = ""
parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
# see gh-2631
parser = all_parsers
data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""
msg = (
"Integer column has NA values"
if parser.engine == "c"
else "Unable to convert column DOY"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
def test_dtype_with_converters(all_parsers):
parser = all_parsers
data = """a,b
1.1,2.2
1.2,2.3"""
# Dtype spec is ignored if converters are specified.
with tm.assert_produces_warning(ParserWarning):
result = parser.read_csv(
StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
)
expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype,expected",
[
(np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])),
(
dict(a="category", b="category"),
DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]),
),
("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
(
"timedelta64[ns]",
DataFrame(
{
"a": Series([], dtype="timedelta64[ns]"),
"b": Series([], dtype="timedelta64[ns]"),
},
index=[],
),
),
(
dict(a=np.int64, b=np.int32),
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
index=[],
),
),
(
{0: np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
index=[],
),
),
(
{"a": np.int64, 1: np.int32},
DataFrame(
{"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
index=[],
),
),
],
)
def test_empty_dtype(all_parsers, dtype, expected):
# see gh-14712
parser = all_parsers
data = "a,b"
result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
data = "0\n1"
parser = all_parsers
expected = DataFrame([0, 1], dtype=dtype)
result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
tm.assert_frame_equal(expected, result)


@@ -0,0 +1,513 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""
from collections import namedtuple
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
def test_read_with_bad_header(all_parsers):
parser = all_parsers
msg = r"but only \d+ lines in file"
with pytest.raises(ValueError, match=msg):
s = StringIO(",,")
parser.read_csv(s, header=[10])
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
# see gh-6114
parser = all_parsers
data = """\
MyColumn
a
b
a
b"""
msg = "Passing a bool to header is invalid"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), header=header)
def test_no_header_prefix(all_parsers):
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
result = parser.read_csv(StringIO(data), prefix="Field", header=None)
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
columns=["Field0", "Field1", "Field2", "Field3", "Field4"],
)
tm.assert_frame_equal(result, expected)
def test_header_with_index_col(all_parsers):
parser = all_parsers
data = """foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
names = ["A", "B", "C"]
result = parser.read_csv(StringIO(data), names=names)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
def test_header_not_first_line(all_parsers):
parser = all_parsers
data = """got,to,ignore,this,line
got,to,ignore,this,line
index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
data2 = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
"""
result = parser.read_csv(StringIO(data), header=2, index_col=0)
expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
tm.assert_frame_equal(result, expected)
def test_header_multi_index(all_parsers):
parser = all_parsers
expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,msg",
[
(
dict(index_col=["foo", "bar"]),
(
"index_col must only contain "
"row numbers when specifying "
"a multi-index header"
),
),
(
dict(index_col=[0, 1], names=["foo", "bar"]),
("cannot specify names when specifying a multi-index header"),
),
(
dict(index_col=[0, 1], usecols=["foo", "bar"]),
("cannot specify usecols when specifying a multi-index header"),
),
],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
_TestTuple = namedtuple("names", ["first", "second"])
@pytest.mark.parametrize(
"kwargs",
[
dict(header=[0, 1]),
dict(
skiprows=3,
names=[
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
),
dict(
skiprows=3,
names=[
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
),
],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs",
[
dict(header=[0, 1]),
dict(
skiprows=2,
names=[
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
),
dict(
skiprows=2,
names=[
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
),
],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs",
[
dict(header=[0, 1]),
dict(
skiprows=2,
names=[
("a", "q"),
("a", "r"),
("a", "s"),
("b", "t"),
("c", "u"),
("c", "v"),
],
),
dict(
skiprows=2,
names=[
_TestTuple("a", "q"),
_TestTuple("a", "r"),
_TestTuple("a", "s"),
_TestTuple("b", "t"),
_TestTuple("c", "u"),
_TestTuple("c", "v"),
],
),
],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
parser = all_parsers
expected = DataFrame(
[[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
index=["one", "two"],
columns=MultiIndex.from_tuples(
[("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
),
)
expected = expected.reset_index(drop=True)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
tm.assert_frame_equal(result, expected)
def test_header_multi_index_common_format_malformed1(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=["a", "q"],
),
)
data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
def test_header_multi_index_common_format_malformed2(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
index=Index([1, 7]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
tm.assert_frame_equal(expected, result)
def test_header_multi_index_common_format_malformed3(all_parsers):
parser = all_parsers
expected = DataFrame(
np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
columns=MultiIndex(
levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
names=[None, "q"],
),
)
data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""
result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header):
# see gh-2539
parser = all_parsers
expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)])
def test_read_only_header_no_rows(all_parsers, kwargs):
# See gh-7773
parser = all_parsers
expected = DataFrame(columns=["a", "b", "c"])
result = parser.read_csv(StringIO("a,b,c"), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,names",
[
(dict(), [0, 1, 2, 3, 4]),
(dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]),
(
dict(names=["foo", "bar", "baz", "quux", "panda"]),
["foo", "bar", "baz", "quux", "panda"],
),
],
)
def test_no_header(all_parsers, kwargs, names):
parser = all_parsers
data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
expected = DataFrame(
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
)
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
# see gh-16338
msg = "header must be integer or list of integers"
data = """1,2\n3,4"""
parser = all_parsers
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), header=header)
def test_singleton_header(all_parsers):
# see gh-7757
data = """a,b,c\n0,1,2\n1,2,3"""
parser = all_parsers
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
result = parser.read_csv(StringIO(data), header=[0])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,expected",
[
(
"A,A,A,B\none,one,one,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
),
),
),
(
"A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
DataFrame(
[[0, 40, 34, 0.1]],
columns=MultiIndex.from_tuples(
[("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
),
),
),
(
"A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
DataFrame(
[[0, 40, 34, 0.1, 0.1]],
columns=MultiIndex.from_tuples(
[
("A", "one"),
("A", "one.1"),
("A", "one.1.1"),
("B", "two"),
("B", "two.1"),
]
),
),
),
],
)
def test_mangles_multi_index(all_parsers, data, expected):
# see gh-18062
parser = all_parsers
result = parser.read_csv(StringIO(data), header=[0, 1])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
"columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
# see gh-23687
#
# When specifying a multi-index header, make sure that
# we don't error just because one of the rows in our header
# has ALL column names containing the string "Unnamed". The
# correct condition to check is whether the row contains
# ALL columns that did not have names (and instead were given
# placeholder ones).
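# For example, columns=["", "Unnamed"] yields a first header row in which
# only one field is blank; the blank one gets a placeholder name while
# "Unnamed" is kept as genuine user data, so no error should be raised.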
parser = all_parsers
header = [0, 1]
if index_col is None:
data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
else:
data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
if columns is None:
msg = (
r"Passed header=\[0,1\] are too "
r"many rows for this multi_index of columns"
)
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=header, index_col=index_col)
else:
result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
template = "Unnamed: {i}_level_0"
exp_columns = []
for i, col in enumerate(columns):
if not col: # Unnamed.
col = template.format(i=i if index_col is None else i + 1)
exp_columns.append(col)
columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
expected = DataFrame([[2, 3], [4, 5]], columns=columns)
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,176 @@
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
parser = all_parsers
no_header = """\
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
header = (
"ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
) # noqa
if with_header:
data = header + no_header
result = parser.read_csv(StringIO(data), index_col="ID")
expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
tm.assert_frame_equal(result, expected)
else:
data = no_header
msg = "Index ID invalid"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col="ID")
def test_index_col_named2(all_parsers):
parser = all_parsers
data = """\
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
expected = DataFrame(
{"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
index=Index(["hello", "world", "foo"], name="message"),
)
names = ["a", "b", "c", "d", "message"]
result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
tm.assert_frame_equal(result, expected)
def test_index_col_is_true(all_parsers):
# see gh-9798
data = "a,b\n1,2"
parser = all_parsers
msg = "The value of index_col couldn't be 'True'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), index_col=True)
def test_infer_index_col(all_parsers):
data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
index=["foo", "bar", "baz"],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"index_col,kwargs",
[
(None, dict(columns=["x", "y", "z"])),
(False, dict(columns=["x", "y", "z"])),
(0, dict(columns=["y", "z"], index=Index([], name="x"))),
(1, dict(columns=["x", "z"], index=Index([], name="y"))),
("x", dict(columns=["y", "z"], index=Index([], name="x"))),
("y", dict(columns=["x", "z"], index=Index([], name="y"))),
(
[0, 1],
dict(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
),
),
(
["x", "y"],
dict(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
),
),
(
[1, 0],
dict(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
),
),
(
["y", "x"],
dict(
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
),
),
],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
data = "x,y,z"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=index_col)
expected = DataFrame(**kwargs)
tm.assert_frame_equal(result, expected)
def test_empty_with_index_col_false(all_parsers):
# see gh-10413
data = "x,y"
parser = all_parsers
result = parser.read_csv(StringIO(data), index_col=False)
expected = DataFrame(columns=["x", "y"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"index_names",
[
["", ""],
["foo", ""],
["", "bar"],
["foo", "bar"],
["NotReallyUnnamed", "Unnamed: 0"],
],
)
def test_multi_index_naming(all_parsers, index_names):
parser = all_parsers
# We don't want empty index names being replaced with "Unnamed: 0"
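# The join below builds e.g. "foo,bar,col\na,c,1\n..." so that the first
# two header fields become the names of the two index levels.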
data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
result = parser.read_csv(StringIO(data), index_col=[0, 1])
expected = DataFrame(
{"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
)
expected.index.names = [name if name else None for name in index_names]
tm.assert_frame_equal(result, expected)
def test_multi_index_naming_not_all_at_beginning(all_parsers):
parser = all_parsers
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
result = parser.read_csv(StringIO(data), index_col=[0, 2])
expected = DataFrame(
{"Unnamed: 2": ["c", "d", "c", "d"]},
index=MultiIndex(
levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
),
)
tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,132 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas.util.testing as tm
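# A minimal illustration of the mangling behaviour exercised below
# (hypothetical interactive session, assuming the default
# mangle_dupe_cols=True):
#
# >>> from io import StringIO
# >>> import pandas as pd
# >>> pd.read_csv(StringIO("a,a,b\n1,2,3")).columns.tolist()
# ['a', 'a.1', 'b']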
@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
def test_basic(all_parsers, kwargs):
# TODO: add test for condition "mangle_dupe_cols=False"
# once it is actually supported (gh-12935)
parser = all_parsers
data = "a,a,b,b,b\n1,2,3,4,5"
result = parser.read_csv(StringIO(data), sep=",", **kwargs)
expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
tm.assert_frame_equal(result, expected)
def test_basic_names(all_parsers):
# See gh-7160
parser = all_parsers
data = "a,b,a\n0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
def test_basic_names_raise(all_parsers):
# See gh-7160
parser = all_parsers
data = "0,1,2\n3,4,5"
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=["a", "b", "a"])
@pytest.mark.parametrize(
"data,expected",
[
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
(
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
DataFrame(
[[1, 2, 3, 4, 5, 6]],
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
),
),
(
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
DataFrame(
[[1, 2, 3, 4, 5, 6, 7]],
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
),
),
],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
# see gh-17060
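# Mangling appends ".<n>" to repeated names and, when the candidate name
# already exists in the header, mangles the collision again (hence the
# "a.1.1", "a.2.1" and "a.3.1" results below).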
parser = all_parsers
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,names,expected",
[
(
"a,b,b\n1,2,3",
["a.1", "a.1", "a.1.1"],
DataFrame(
[["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
),
),
(
"a,b,c,d,e,f\n1,2,3,4,5,6",
["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
DataFrame(
[["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
),
),
(
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
["a", "a", "a.3", "a.1", "a.2", "a", "a"],
DataFrame(
[
["a", "b", "c", "d", "e", "f", "g"],
["1", "2", "3", "4", "5", "6", "7"],
],
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
),
),
],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
# see gh-17095
parser = all_parsers
with pytest.raises(ValueError, match="Duplicate names"):
parser.read_csv(StringIO(data), names=names)
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
orig_key = "0"
parser = all_parsers
orig_value = [1, 2, 3]
df = DataFrame({orig_key: orig_value})
# This test recursively updates `df`.
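# Each round trip through to_csv/read_csv writes the unnamed index as an
# extra empty-named column, which the parser labels "Unnamed: 0" and then
# mangles against the previous placeholders ("Unnamed: 0.1", ...).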
for i in range(3):
expected = DataFrame()
for j in range(i + 1):
expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
expected[orig_key] = orig_value
df = parser.read_csv(StringIO(df.to_csv()))
tm.assert_frame_equal(df, expected)

View File

@@ -0,0 +1,146 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from io import BytesIO
from multiprocessing.pool import ThreadPool
import numpy as np
import pandas as pd
from pandas import DataFrame
import pandas.util.testing as tm
def _construct_dataframe(num_rows):
"""
Construct a DataFrame for testing.
Parameters
----------
num_rows : int
The number of rows for our DataFrame.
Returns
-------
df : DataFrame
"""
df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
df["foo"] = "foo"
df["bar"] = "bar"
df["baz"] = "baz"
df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
df["int"] = np.arange(num_rows, dtype="int64")
return df
def test_multi_thread_string_io_read_csv(all_parsers):
# see gh-11786
parser = all_parsers
max_row_range = 10000
num_files = 100
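# Every "file" is the same CSV payload of max_row_range rows with three
# integer columns, encoded to bytes so it can be wrapped in BytesIO.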
bytes_to_df = [
"\n".join(
["{i:d},{i:d},{i:d}".format(i=i) for i in range(max_row_range)]
).encode()
for _ in range(num_files)
]
files = [BytesIO(b) for b in bytes_to_df]
# Read all files in many threads.
pool = ThreadPool(8)
results = pool.map(parser.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
"""
Generate a DataFrame via multi-thread.
Parameters
----------
parser : BaseParser
The parser object to use for reading the data.
path : str
The location of the CSV file to read.
num_rows : int
The number of rows to read per task.
num_tasks : int
The number of tasks to use for reading this DataFrame.
Returns
-------
df : DataFrame
"""
def reader(arg):
"""
Create a reader for part of the CSV.
Parameters
----------
arg : tuple
A tuple of the following:
* start : int
The starting row to start for parsing CSV
* nrows : int
The number of rows to read.
Returns
-------
df : DataFrame
"""
start, nrows = arg
if not start:
return parser.read_csv(
path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
)
return parser.read_csv(
path,
index_col=0,
header=None,
skiprows=int(start) + 1,
nrows=nrows,
parse_dates=[9],
)
tasks = [
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
]
pool = ThreadPool(processes=num_tasks)
results = pool.map(reader, tasks)
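# Chunks after the first are read with header=None and therefore get
# integer column labels, so realign them to the first chunk's header
# before concatenating.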
header = results[0].columns
for r in results[1:]:
r.columns = header
final_dataframe = pd.concat(results)
return final_dataframe
def test_multi_thread_path_multipart_read_csv(all_parsers):
# see gh-11786
num_tasks = 4
num_rows = 100000
parser = all_parsers
file_name = "__thread_pool_reader__.csv"
df = _construct_dataframe(num_rows)
with tm.ensure_clean(file_name) as path:
df.to_csv(path)
final_dataframe = _generate_multi_thread_dataframe(
parser, path, num_rows, num_tasks
)
tm.assert_frame_equal(df, final_dataframe)

View File

@@ -0,0 +1,538 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
import pandas.io.common as com
def test_string_nas(all_parsers):
parser = all_parsers
data = """A,B,C
a,b,c
d,,f
,g,h
"""
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
columns=["A", "B", "C"],
)
tm.assert_frame_equal(result, expected)
def test_detect_string_na(all_parsers):
parser = all_parsers
data = """A,B
foo,bar
NA,baz
NaN,nan
"""
expected = DataFrame(
[["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
)
result = parser.read_csv(StringIO(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values",
[
["-999.0", "-999"],
[-999, -999.0],
[-999.0, -999],
["-999.0"],
["-999"],
[-999.0],
[-999],
],
)
@pytest.mark.parametrize(
"data",
[
"""A,B
-999,1.2
2,-999
3,4.5
""",
"""A,B
-999,1.200
2,-999.000
3,4.500
""",
],
)
def test_non_string_na_values(all_parsers, data, na_values):
# see gh-3611: with an odd float format, we can't match
# the string "999.0" exactly but still need float matching
parser = all_parsers
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])
result = parser.read_csv(StringIO(data), na_values=na_values)
tm.assert_frame_equal(result, expected)
def test_default_na_values(all_parsers):
_NA_VALUES = {
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A",
"N/A",
"n/a",
"NA",
"#NA",
"NULL",
"null",
"NaN",
"nan",
"-NaN",
"-nan",
"#N/A N/A",
"",
}
assert _NA_VALUES == com._NA_VALUES
parser = all_parsers
nv = len(_NA_VALUES)
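# f(i, v) builds row i with nv comma-separated fields, placing the i-th
# default NA string at column i and leaving every other field empty, so
# the parsed frame should be entirely NaN.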
def f(i, v):
if i == 0:
buf = ""
elif i > 0:
buf = "".join([","] * i)
buf = "{0}{1}".format(buf, v)
if i < nv - 1:
buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))
return buf
data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
result = parser.read_csv(data, header=None)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
parser = all_parsers
data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
expected = DataFrame(
[[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
)
result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
tm.assert_frame_equal(result, expected)
def test_bool_na_values(all_parsers):
data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
parser = all_parsers
result = parser.read_csv(StringIO(data))
expected = DataFrame(
{
"A": np.array([True, np.nan, False], dtype=object),
"B": np.array([False, True, np.nan], dtype=object),
"C": [True, False, True],
}
)
tm.assert_frame_equal(result, expected)
def test_na_value_dict(all_parsers):
data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
parser = all_parsers
df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
expected = DataFrame(
{
"A": [np.nan, "bar", np.nan, "bar"],
"B": [np.nan, "foo", np.nan, "foo"],
"C": [np.nan, "foo", np.nan, "foo"],
}
)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"index_col,expected",
[
(
[0],
DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
),
(
[0, 2],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
(
["a", "c"],
DataFrame(
{"b": [np.nan], "d": [5]},
index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
),
),
],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
data = """\
a,b,c,d
0,NA,1,5
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
(
dict(),
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
(
dict(na_values={"A": [], "C": []}, keep_default_na=False),
DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
dict(na_values=["a"], keep_default_na=False),
DataFrame(
{
"A": [np.nan, "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", "nan", "five", "", "seven"],
}
),
),
(
dict(na_values={"A": [], "C": []}),
DataFrame(
{
"A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
}
),
),
],
)
def test_na_values_keep_default(all_parsers, kwargs, expected):
data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_no_na_values_no_keep_default(all_parsers):
# see gh-4318: passing na_values=None and
# keep_default_na=False yields "None" as an NA value
data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), keep_default_na=False)
expected = DataFrame(
{
"A": ["a", "b", "", "d", "e", "nan", "g"],
"B": [1, 2, 3, 4, 5, 6, 7],
"C": ["None", "two", "None", "nan", "five", "", "seven"],
}
)
tm.assert_frame_equal(result, expected)
def test_no_keep_default_na_dict_na_values(all_parsers):
# see gh-19227
data = "a,b\n,2"
parser = all_parsers
result = parser.read_csv(
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
)
expected = DataFrame({"a": [""], "b": [np.nan]})
tm.assert_frame_equal(result, expected)
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
# see gh-19227
#
# Scalar values shouldn't cause the parsing to crash or fail.
data = "a,b\n1,2"
parser = all_parsers
df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
expected = DataFrame({"a": [1], "b": [np.nan]})
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
# see gh-19227
data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
parser = all_parsers
expected = DataFrame(
{
0: [np.nan, 729639.0],
1: [np.nan, "qwer"],
2: ["/blaha", np.nan],
3: ["kjsdkj", "asdfkj"],
4: [412.166, 466.681],
5: ["225.874", ""],
6: [np.nan, 252.373],
}
)
result = parser.read_csv(
StringIO(data),
header=None,
keep_default_na=False,
na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
data = """\
A,B
1,A
nan,B
3,C
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
expected = DataFrame(row_data, columns=["A", "B"])
tm.assert_frame_equal(result, expected)
def test_na_trailing_columns(all_parsers):
parser = all_parsers
data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
# Trailing columns should be all NaN.
result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
],
columns=[
"Date",
"Currency",
"Symbol",
"Type",
"Units",
"UnitPrice",
"Cost",
"Tax",
],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_values,row_data",
[
(1, [[np.nan, 2.0], [2.0, np.nan]]),
({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
# see gh-12224
parser = all_parsers
names = ["a", "b"]
data = "1,2\n2,1"
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
expected = DataFrame(row_data, columns=names)
tm.assert_frame_equal(result, expected)
def test_na_values_dict_aliasing(all_parsers):
parser = all_parsers
na_values = {"a": 2, "b": 1}
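# Keep a copy so we can check that read_csv does not mutate the caller's
# na_values dict in place.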
na_values_copy = na_values.copy()
names = ["a", "b"]
data = "1,2\n2,1"
expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
tm.assert_frame_equal(result, expected)
tm.assert_dict_equal(na_values, na_values_copy)
def test_na_values_dict_col_index(all_parsers):
# see gh-14203
data = "a\nfoo\n1"
parser = all_parsers
na_values = {0: "foo"}
result = parser.read_csv(StringIO(data), na_values=na_values)
expected = DataFrame({"a": [np.nan, 1]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
str(2 ** 63) + "\n" + str(2 ** 63 + 1),
dict(na_values=[2 ** 63]),
DataFrame([str(2 ** 63), str(2 ** 63 + 1)]),
),
(str(2 ** 63) + ",1" + "\n,2", dict(), DataFrame([[str(2 ** 63), 1], ["", 2]])),
(str(2 ** 63) + "\n1", dict(na_values=[2 ** 63]), DataFrame([np.nan, 1])),
],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected):
# see gh-14983
parser = all_parsers
result = parser.read_csv(StringIO(data), header=None, **kwargs)
tm.assert_frame_equal(result, expected)
def test_empty_na_values_no_default_with_index(all_parsers):
# see gh-15835
data = "a,1\nb,2"
parser = all_parsers
expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
# see gh-5239
#
# Don't parse NA-values in index unless na_filter=True
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
tm.assert_frame_equal(result, expected)
def test_inf_na_values_with_int_index(all_parsers):
# see gh-17128
parser = all_parsers
data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
# Don't fail with OverflowError with inf's and integer index column.
out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
expected = DataFrame(
{"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
)
tm.assert_frame_equal(out, expected)
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
# see gh-20377
parser = all_parsers
data = "a,b,c\n1,,3\n4,5,6"
# na_filter=True --> missing value becomes NaN.
# na_filter=False --> missing value remains empty string.
empty = np.nan if na_filter else ""
expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})
result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data, na_values",
[
("false,1\n,1\ntrue", None),
("false,1\nnull,1\ntrue", None),
("false,1\nnan,1\ntrue", None),
("false,1\nfoo,1\ntrue", "foo"),
("false,1\nfoo,1\ntrue", ["foo"]),
("false,1\nfoo,1\ntrue", {"a": "foo"}),
],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
parser = all_parsers
msg = (
"(Bool column has NA values in column [0a])|"
"(cannot safely convert passed user dtype of "
"bool for object dtyped data in column 0)"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
header=None,
names=["a", "b"],
dtype={"a": "bool"},
na_values=na_values,
)

View File

@@ -0,0 +1,205 @@
"""
Tests the parsers' ability to read and parse non-local files
that require a network connection to be read.
"""
from io import BytesIO, StringIO
import logging
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas.util.testing as tm
from pandas.io.parsers import read_csv
@pytest.mark.network
@pytest.mark.parametrize(
"compress_type, extension",
[("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")],
)
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(salaries_table, compress_type, extension, mode, engine):
check_compressed_urls(salaries_table, compress_type, extension, mode, engine)
@tm.network
def check_compressed_urls(salaries_table, compression, extension, mode, engine):
# test reading compressed urls with various engines and
# extension inference
base_url = (
"https://github.com/pandas-dev/pandas/raw/master/"
"pandas/tests/io/parser/data/salaries.csv"
)
url = base_url + extension
if mode != "explicit":
compression = mode
url_table = read_csv(url, sep="\t", compression=compression, engine=engine)
tm.assert_frame_equal(url_table, salaries_table)
@pytest.fixture
def tips_df(datapath):
"""DataFrame with the tips dataset."""
return read_csv(datapath("io", "parser", "data", "tips.csv"))
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
def test_parse_public_s3_bucket(self, tips_df):
pytest.importorskip("s3fs")
# More of an integration test due to the not-public-contents portion
# below; that part can probably be mocked.
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
# Read public file from bucket with not-public contents
df = read_csv("s3://cant_get_it/tips.csv")
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3n_bucket(self, tips_df):
# Read from AWS s3 as "s3n" URL
df = read_csv("s3n://pandas-test/tips.csv", nrows=10)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3a_bucket(self, tips_df):
# Read from AWS s3 as "s3a" URL
df = read_csv("s3a://pandas-test/tips.csv", nrows=10)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_nrows(self, tips_df):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_parse_public_s3_bucket_chunked(self, tips_df):
# Read with a chunksize
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df_reader = read_csv(
"s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp
)
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them
# properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
# Read with a chunksize using the Python parser
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df_reader = read_csv(
"s3://pandas-test/tips.csv" + ext,
chunksize=chunksize,
compression=comp,
engine="python",
)
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
# Read a couple of chunks and make sure we see them properly.
df = df_reader.get_chunk()
assert isinstance(df, DataFrame)
assert not df.empty
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df)
def test_parse_public_s3_bucket_python(self, tips_df):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
"s3://pandas-test/tips.csv" + ext, engine="python", compression=comp
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_infer_s3_compression(self, tips_df):
for ext in ["", ".gz", ".bz2"]:
df = read_csv(
"s3://pandas-test/tips.csv" + ext, engine="python", compression="infer"
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
def test_parse_public_s3_bucket_nrows_python(self, tips_df):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
"s3://pandas-test/tips.csv" + ext,
engine="python",
nrows=10,
compression=comp,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
def test_s3_fails(self):
with pytest.raises(IOError):
read_csv("s3://nyqpug/asdf.csv")
# Receive a permission error when trying to read a private bucket.
# It's irrelevant here that this isn't actually a table.
with pytest.raises(IOError):
read_csv("s3://cant_get_it/")
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
# see gh-16135
s3_object = s3_resource.meta.client.get_object(
Bucket="pandas-test", Key="tips.csv"
)
result = read_csv(BytesIO(s3_object["Body"].read()), encoding="utf8")
assert isinstance(result, DataFrame)
assert not result.empty
expected = read_csv(tips_file)
tm.assert_frame_equal(result, expected)
def test_read_csv_chunked_download(self, s3_resource, caplog):
# 8 MB file; s3fs uses 5 MB chunks
df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
buf = BytesIO()
str_buf = StringIO()
df.to_csv(str_buf)
buf = BytesIO(str_buf.getvalue().encode("utf-8"))
s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)
with caplog.at_level(logging.DEBUG, logger="s3fs.core"):
read_csv("s3://pandas-test/large-file.csv", nrows=5)
# log of fetch_range (start, stop)
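# Reading only five rows should trigger a single ranged fetch of roughly
# one s3fs block rather than downloading the whole file.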
assert (0, 5505024) in {x.args[-2:] for x in caplog.records}
def test_read_s3_with_hash_in_key(self, tips_df):
# GH 25945
result = read_csv("s3://pandas-test/tips#1.csv")
tm.assert_frame_equal(tips_df, result)

File diff suppressed because it is too large

View File

@@ -0,0 +1,298 @@
"""
Tests that apply specifically to the Python parser. Unless a test is
specifically stated as a Python-specific issue, the goal is to eventually
move as many of these tests as possible out of this module once the C
parser can accept further arguments when parsing.
"""
import csv
from io import BytesIO, StringIO
import pytest
from pandas.errors import ParserError
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
def test_default_separator(python_parser_only):
# see gh-17333
#
# csv.Sniffer in Python treats "o" as separator.
data = "aob\n1o2\n3o4"
parser = python_parser_only
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
result = parser.read_csv(StringIO(data), sep=None)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter must be an integer"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
def test_invalid_skipfooter_negative(python_parser_only):
# see gh-15925 (comment)
data = "a\n1\n2"
parser = python_parser_only
msg = "skipfooter cannot be negative"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), skipfooter=-1)
@pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")])
def test_sniff_delimiter(python_parser_only, kwargs):
data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
parser = python_parser_only
data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
if encoding is not None:
from io import TextIOWrapper
data = data.encode(encoding)
data = BytesIO(data)
data = TextIOWrapper(data, encoding=encoding)
else:
data = StringIO(data)
result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
expected = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=["A", "B", "C"],
index=Index(["foo", "bar", "baz"], name="index"),
)
tm.assert_frame_equal(result, expected)
def test_single_line(python_parser_only):
# see gh-6607: sniff separator
parser = python_parser_only
result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
expected = DataFrame({"a": [1], "b": [2]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)])
def test_skipfooter(python_parser_only, kwargs):
# see gh-6607
data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
parser = python_parser_only
result = parser.read_csv(StringIO(data), **kwargs)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
# see gh-6607
parser = python_parser_only
with open(csv1, "rb") as f:
data = f.read()
data = data.replace(b",", b"::")
expected = parser.read_csv(csv1)
module = pytest.importorskip(compression)
klass = getattr(module, klass)
with tm.ensure_clean() as path:
tmp = klass(path, mode="wb")
tmp.write(data)
tmp.close()
result = parser.read_csv(path, sep="::", compression=compression)
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index(python_parser_only):
# see gh-6607
data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
parser = python_parser_only
expected = DataFrame(
[
[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
[0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
[-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
],
columns=["A", "B", "C", "D", "E"],
index=MultiIndex.from_tuples(
[("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
names=["one", "two", "three", "four"],
),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
# see gh-6893
data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
parser = python_parser_only
expected = DataFrame.from_records(
[(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
columns=list("abcABC"),
index=list("abc"),
)
result = parser.read_csv(StringIO(data), sep=r"\s+")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
# see gh-6971
data = "1#2\n3#4"
parser = python_parser_only
expected = DataFrame({"a": [1.2, 3.4]})
if add_footer:
# The stray footer line should not mess with the
# casting of the first two lines if we skip it.
kwargs = dict(skipfooter=1)
data += "\nFooter"
else:
kwargs = dict()
result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
"encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
# see gh-3404
expected = DataFrame({"a": [1], "b": [2]})
parser = python_parser_only
data = "1" + sep + "2"
encoded_data = data.encode(encoding)
result = parser.read_csv(
BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
# see gh-13374
kwargs = dict(sep=",,")
parser = python_parser_only
data = 'a,,b\n1,,a\n2,,"2,,b"'
msg = "ignored when a multi-char delimiter is used"
def fail_read():
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
if quoting == csv.QUOTE_NONE:
# We expect no match, so there should be an assertion
# error out of the inner context manager.
with pytest.raises(AssertionError):
fail_read()
else:
fail_read()
def test_none_delimiter(python_parser_only, capsys):
# see gh-13374 and gh-17465
parser = python_parser_only
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
# We expect the third line in the data to be
# skipped because it is malformed, but we do
# not expect any errors to occur.
result = parser.read_csv(
StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False
)
tm.assert_frame_equal(result, expected)
captured = capsys.readouterr()
assert "Skipping line 3" in captured.err
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
# see gh-13879 and gh-15910
msg = "parsing errors in the skipped footer rows"
parser = python_parser_only
def fail_read():
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), skipfooter=skipfooter)
if skipfooter:
fail_read()
else:
# We expect no match, so there should be an assertion
# error out of the inner context manager.
with pytest.raises(AssertionError):
fail_read()
def test_malformed_skipfooter(python_parser_only):
parser = python_parser_only
data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
msg = "Expected 3 fields in line 4, saw 5"
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)

View File

@@ -0,0 +1,159 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.errors import ParserError
from pandas import DataFrame
import pandas.util.testing as tm
@pytest.mark.parametrize(
"kwargs,msg",
[
(dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'),
(
dict(quotechar=None, quoting=csv.QUOTE_MINIMAL),
"quotechar must be set if quoting enabled",
),
(dict(quotechar=2), '"quotechar" must be string, not int'),
],
)
def test_bad_quote_char(all_parsers, kwargs, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
@pytest.mark.parametrize(
"quoting,msg",
[
("foo", '"quoting" must be an integer'),
(5, 'bad "quoting" value'), # quoting must be in the range [0, 3]
],
)
def test_bad_quoting(all_parsers, quoting, msg):
data = "1,2,3"
parser = all_parsers
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), quoting=quoting)
def test_quote_char_basic(all_parsers):
parser = all_parsers
data = 'a,b,c\n1,2,"cat"'
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), quotechar='"')
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
parser = all_parsers
expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
data = 'a,b,c\n1,2,"cat"'
new_data = data.replace('"', quote_char)
result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
kwargs = dict(quotechar=quote_char, quoting=quoting)
data = "a,b,c\n1,2,3"
parser = all_parsers
if quoting != csv.QUOTE_NONE:
# Sanity checking.
msg = "quotechar must be set if quoting enabled"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,exp_data",
[
(dict(), [[1, 2, "foo"]]), # Test default.
# QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
(dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]),
# QUOTE_ALL likewise only applies to CSV writing, so no effect on reading.
(dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]),
# QUOTE_NONE tells the reader to do no special handling
# of quote characters and leave them alone.
(dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]),
# QUOTE_NONNUMERIC tells the reader to cast
# all non-quoted fields to float
(dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]),
],
)
def test_quoting_various(all_parsers, kwargs, exp_data):
data = '1,2,"foo"'
parser = all_parsers
columns = ["a", "b", "c"]
result = parser.read_csv(StringIO(data), names=columns, **kwargs)
expected = DataFrame(exp_data, columns=columns)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data):
parser = all_parsers
data = 'a,b\n3,"4 "" 5"'
result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
expected = DataFrame(exp_data, columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
# see gh-14477
data = "a\n1"
parser = all_parsers
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), quotechar=quotechar)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
# see gh-22789.
parser = all_parsers
data = 'a,b,c\n1,2,"3'
if balanced:
# Re-balance the quoting and read in without errors.
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
result = parser.read_csv(StringIO(data + '"'))
tm.assert_frame_equal(result, expected)
else:
msg = (
"EOF inside string starting at row 1"
if parser.engine == "c"
else "unexpected end of data"
)
with pytest.raises(ParserError, match=msg):
parser.read_csv(StringIO(data))

View File

@@ -0,0 +1,618 @@
"""
Tests the 'read_fwf' function in parsers.py. This
test suite is independent of the others because the
engine is set to 'python-fwf' internally.
"""
from datetime import datetime
from io import BytesIO, StringIO
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, DatetimeIndex
import pandas.util.testing as tm
from pandas.io.parsers import EmptyDataError, read_csv, read_fwf
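# A minimal illustration of fixed-width parsing (hypothetical session,
# assuming whitespace-padded fields):
#
# >>> from io import StringIO
# >>> read_fwf(StringIO("ab 12\ncd 34"), widths=[3, 2], header=None).values.tolist()
# [['ab', 12], ['cd', 34]]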
def test_basic():
data = """\
A B C D
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
result = read_fwf(StringIO(data))
expected = DataFrame(
[
[201158, 360.242940, 149.910199, 11950.7],
[201159, 444.953632, 166.985655, 11788.4],
[201160, 364.136849, 183.628767, 11806.2],
[201161, 413.836124, 184.375703, 11916.8],
[201162, 502.953953, 173.237159, 12468.3],
],
columns=["A", "B", "C", "D"],
)
tm.assert_frame_equal(result, expected)
def test_colspecs():
data = """\
A B C D E
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
result = read_fwf(StringIO(data), colspecs=colspecs)
expected = DataFrame(
[
[2011, 58, 360.242940, 149.910199, 11950.7],
[2011, 59, 444.953632, 166.985655, 11788.4],
[2011, 60, 364.136849, 183.628767, 11806.2],
[2011, 61, 413.836124, 184.375703, 11916.8],
[2011, 62, 502.953953, 173.237159, 12468.3],
],
columns=["A", "B", "C", "D", "E"],
)
tm.assert_frame_equal(result, expected)
def test_widths():
data = """\
A B C D E
2011 58 360.242940 149.910199 11950.7
2011 59 444.953632 166.985655 11788.4
2011 60 364.136849 183.628767 11806.2
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])
expected = DataFrame(
[
[2011, 58, 360.242940, 149.910199, 11950.7],
[2011, 59, 444.953632, 166.985655, 11788.4],
[2011, 60, 364.136849, 183.628767, 11806.2],
[2011, 61, 413.836124, 184.375703, 11916.8],
[2011, 62, 502.953953, 173.237159, 12468.3],
],
columns=["A", "B", "C", "D", "E"],
)
tm.assert_frame_equal(result, expected)
def test_non_space_filler():
# From Thomas Kluyver:
#
# Apparently, some non-space filler characters can be seen, this is
# supported by specifying the 'delimiter' character:
#
# http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
data = """\
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")
expected = DataFrame(
[
[2011, 58, 360.242940, 149.910199, 11950.7],
[2011, 59, 444.953632, 166.985655, 11788.4],
[2011, 60, 364.136849, 183.628767, 11806.2],
[2011, 61, 413.836124, 184.375703, 11916.8],
[2011, 62, 502.953953, 173.237159, 12468.3],
],
columns=["A", "B", "C", "D", "E"],
)
tm.assert_frame_equal(result, expected)
def test_over_specified():
data = """\
A B C D E
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
with pytest.raises(ValueError, match="must specify only one of"):
read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])
def test_under_specified():
data = """\
A B C D E
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
with pytest.raises(ValueError, match="Must specify either"):
read_fwf(StringIO(data), colspecs=None, widths=None)
def test_read_csv_compat():
csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
expected = read_csv(StringIO(csv_data), engine="python")
fwf_data = """\
A B C D E
201158 360.242940 149.910199 11950.7
201159 444.953632 166.985655 11788.4
201160 364.136849 183.628767 11806.2
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
tm.assert_frame_equal(result, expected)
def test_bytes_io_input():
result = read_fwf(
BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8"
)
expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
tm.assert_frame_equal(result, expected)
def test_fwf_colspecs_is_list_or_tuple():
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
msg = "column specifications must be a list or tuple.+"
with pytest.raises(TypeError, match=msg):
read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
msg = "Each column specification must be.+"
with pytest.raises(TypeError, match=msg):
read_fwf(StringIO(data), [("a", 1)])
@pytest.mark.parametrize(
"colspecs,exp_data",
[
([(0, 3), (3, None)], [[123, 456], [456, 789]]),
([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
],
)
def test_fwf_colspecs_none(colspecs, exp_data):
# see gh-7079
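# A boundary of None in a colspec means "from the start of the line" for
# the first element and "to the end of the line" for the second.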
data = """\
123456
456789
"""
expected = DataFrame(exp_data)
result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"infer_nrows,exp_data",
[
# infer_nrows --> colspec == [(2, 3), (5, 6)]
(1, [[1, 2], [3, 8]]),
# infer_nrows > number of rows
(10, [[1, 2], [123, 98]]),
],
)
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
# see gh-15138
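# With infer_nrows=1 only the first data row is used to infer the column
# breaks, so fields in the wider second row are truncated; a larger
# infer_nrows inspects both rows and parses them in full.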
data = """\
1 2
123 98
"""
expected = DataFrame(exp_data)
result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
tm.assert_frame_equal(result, expected)
def test_fwf_regression():
# see gh-3594
#
# Turns out "T060" is parsable as a datetime slice!
tz_list = [1, 10, 20, 30, 60, 80, 100]
widths = [16] + [8] * len(tz_list)
names = ["SST"] + ["T{z:03d}".format(z=z) for z in tz_list[1:]]
data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869
2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657
2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379
2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039
"""
result = read_fwf(
StringIO(data),
index_col=0,
header=None,
names=names,
widths=widths,
parse_dates=True,
date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"),
)
expected = DataFrame(
[
[9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
[9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
[9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
[9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
[9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
],
index=DatetimeIndex(
[
"2009-06-13 20:20:00",
"2009-06-13 20:30:00",
"2009-06-13 20:40:00",
"2009-06-13 20:50:00",
"2009-06-13 21:00:00",
]
),
columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"],
)
tm.assert_frame_equal(result, expected)
def test_fwf_for_uint8():
data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127
1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
df = read_fwf(
StringIO(data),
colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)],
names=["time", "pri", "pgn", "dst", "src", "data"],
converters={
"pgn": lambda x: int(x, 16),
"src": lambda x: int(x, 16),
"dst": lambda x: int(x, 16),
"data": lambda x: len(x.split(" ")),
},
)
expected = DataFrame(
[
[1421302965.213420, 3, 61184, 23, 40, 8],
[1421302964.226776, 6, 61442, None, 71, 8],
],
columns=["time", "pri", "pgn", "dst", "src", "data"],
)
expected["dst"] = expected["dst"].astype(object)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("comment", ["#", "~", "!"])
def test_fwf_comment(comment):
data = """\
1 2. 4 #hello world
5 NaN 10.0
"""
data = data.replace("#", comment)
colspecs = [(0, 3), (4, 9), (9, 25)]
expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]])
result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("thousands", [",", "#", "~"])
def test_fwf_thousands(thousands):
data = """\
1 2,334.0 5
10 13 10.
"""
data = data.replace(",", thousands)
colspecs = [(0, 3), (3, 11), (12, 16)]
expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]])
result = read_fwf(
StringIO(data), header=None, colspecs=colspecs, thousands=thousands
)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(header):
# see gh-6114
data = """\
MyColumn
a
b
a
b"""
msg = "Passing a bool to header is invalid"
with pytest.raises(TypeError, match=msg):
read_fwf(StringIO(data), header=header)
def test_full_file():
# File with all values.
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
2000-01-05T00:00:00 0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
def test_full_file_with_missing():
# File with missing values.
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
def test_full_file_with_spaces():
# File with spaces in columns.
test = """
Account Name Balance CreditLimit AccountCreated
101 Keanu Reeves 9315.45 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65 5000.00 2/5/2007
""".strip(
"\r\n"
)
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
def test_full_file_with_spaces_and_missing():
# File with spaces and missing values in columns.
test = """
Account Name Balance CreditLimit AccountCreated
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
""".strip(
"\r\n"
)
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
def test_messed_up_data():
# Completely messed up file.
test = """
Account Name Balance Credit Limit Account Created
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
""".strip(
"\r\n"
)
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
expected = read_fwf(StringIO(test), colspecs=colspecs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
def test_multiple_delimiters():
test = r"""
col1~~~~~col2 col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01 baz~~Jennifer Love Hewitt
~~55 11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar Bill Murray
""".strip(
"\r\n"
)
delimiter = " +~.\\"
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)
result = read_fwf(StringIO(test), delimiter=delimiter)
tm.assert_frame_equal(result, expected)
def test_variable_width_unicode():
data = """
שלום שלום
ום שלל
של ום
""".strip(
"\r\n"
)
encoding = "utf8"
kwargs = dict(header=None, encoding=encoding)
expected = read_fwf(
BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs
)
result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [dict(), {"a": "float64", "b": str, "c": "int32"}])
def test_dtype(dtype):
data = """ a b c
1 2 3.2
3 4 5.2
"""
colspecs = [(0, 5), (5, 10), (10, None)]
result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)
expected = pd.DataFrame(
{"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"]
)
for col, dt in dtype.items():
expected[col] = expected[col].astype(dt)
tm.assert_frame_equal(result, expected)
def test_skiprows_inference():
# see gh-11256
data = """
Text contained in the file header
DataCol1 DataCol2
0.0 1.0
101.6 956.1
""".strip()
skiprows = 2
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)
result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
def test_skiprows_by_index_inference():
data = """
To be skipped
Not To Be Skipped
Once more to be skipped
123 34 8 123
456 78 9 456
""".strip()
skiprows = [0, 2]
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)
result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
def test_skiprows_inference_empty():
data = """
AA BBB C
12 345 6
78 901 2
""".strip()
msg = "No rows from which to infer column width"
with pytest.raises(EmptyDataError, match=msg):
read_fwf(StringIO(data), skiprows=3)
def test_whitespace_preservation():
# see gh-16772
header = None
csv_data = """
a ,bbb
cc,dd """
fwf_data = """
a bbb
ccdd """
result = read_fwf(
StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t"
)
expected = read_csv(StringIO(csv_data), header=header)
tm.assert_frame_equal(result, expected)
def test_default_delimiter():
header = None
csv_data = """
a,bbb
cc,dd"""
fwf_data = """
a \tbbb
cc\tdd """
result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0])
expected = read_csv(StringIO(csv_data), header=header)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("infer", [True, False, None])
def test_fwf_compression(compression_only, infer):
data = """1111111111
2222222222
3333333333""".strip()
compression = compression_only
extension = "gz" if compression == "gzip" else compression
kwargs = dict(widths=[5, 5], names=["one", "two"])
expected = read_fwf(StringIO(data), **kwargs)
data = bytes(data, encoding="utf-8")
with tm.ensure_clean(filename="tmp." + extension) as path:
tm.write_to_compressed(compression, path, data)
if infer is not None:
kwargs["compression"] = "infer" if infer else compression
result = read_fwf(path, **kwargs)
tm.assert_frame_equal(result, expected)
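# Illustrative sketch (not part of the original test module): with the default
# ``compression="infer"`` the codec is guessed from the file extension, e.g.
# ".gz" maps to gzip. The temporary path and contents are assumptions made up
# for this example.
def _example_fwf_compression_infer():
    import gzip
    import os
    import tempfile

    from pandas import read_fwf

    path = os.path.join(tempfile.mkdtemp(), "example.gz")
    with gzip.open(path, "wt") as fh:
        fh.write("1111122222\n3333344444\n")
    # The ".gz" suffix is enough for compression to be inferred.
    return read_fwf(path, widths=[5, 5], names=["one", "two"])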

View File

@@ -0,0 +1,252 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
from pandas.errors import EmptyDataError
from pandas import DataFrame, Index
import pandas.util.testing as tm
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
# see gh-505
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
result = parser.read_csv(
StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(result, expected)
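# Illustrative sketch (not part of the original test module): an integer
# ``skiprows`` drops that many lines from the top of the file, while a list
# drops exactly the 0-based line numbers given. The toy data is an assumption
# for this example; both calls skip the two comment lines.
def _example_skiprows_int_vs_list():
    from io import StringIO

    from pandas import read_csv

    data = "# comment\n# comment\na,b\n1,2\n3,4"
    top_two = read_csv(StringIO(data), skiprows=2)
    by_index = read_csv(StringIO(data), skiprows=[0, 1])
    return top_two, by_index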
def test_deep_skip_rows(all_parsers):
# see gh-4382
parser = all_parsers
data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
)
condensed_data = "a,b,c\n" + "\n".join(
[",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
)
result = parser.read_csv(StringIO(data), skiprows=[6, 8])
condensed_result = parser.read_csv(StringIO(condensed_data))
tm.assert_frame_equal(result, condensed_result)
def test_skip_rows_blank(all_parsers):
# see gh-9832
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
data = parser.read_csv(
StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(data, expected)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
(
"""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
dict(skiprows=[1]),
DataFrame(
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
columns=["id", "text", "num_lines"],
),
),
(
"a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
dict(quotechar="~", skiprows=[2]),
DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
),
(
(
"Text,url\n~example\n "
"sentence\n one~,url1\n~"
"example\n sentence\n two~,url2\n~"
"example\n sentence\n three~,url3"
),
dict(quotechar="~", skiprows=[1, 3]),
DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
),
],
)
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
def test_skip_row_with_quote(all_parsers):
# see gh-12775 and gh-10911
parser = all_parsers
data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""
exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
result = parser.read_csv(StringIO(data), skiprows=[1])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,exp_data",
[
(
"""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
),
],
)
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
# see gh-12775 and gh-10911
parser = all_parsers
result = parser.read_csv(StringIO(data), skiprows=[1])
expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
)
def test_skiprows_lineterminator(all_parsers, line_terminator):
# see gh-9079
parser = all_parsers
data = "\n".join(
[
"SMOSMANIA ThetaProbe-ML2X ",
"2007/01/01 01:00 0.2140 U M ",
"2007/01/01 02:00 0.2141 M O ",
"2007/01/01 04:00 0.2142 D M ",
]
)
expected = DataFrame(
[
["2007/01/01", "01:00", 0.2140, "U", "M"],
["2007/01/01", "02:00", 0.2141, "M", "O"],
["2007/01/01", "04:00", 0.2142, "D", "M"],
],
columns=["date", "time", "var", "flag", "oflag"],
)
if parser.engine == "python" and line_terminator == "\r":
pytest.skip("'CR' not respect with the Python parser yet")
data = data.replace("\n", line_terminator)
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)
def test_skiprows_infield_quote(all_parsers):
# see gh-14459
parser = all_parsers
data = 'a"\nb"\na\n1'
expected = DataFrame({"a": [1]})
result = parser.read_csv(StringIO(data), skiprows=2)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs,expected",
[
(dict(), DataFrame({"1": [3, 5]})),
(dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})),
],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
tm.assert_frame_equal(result, expected)
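# Illustrative sketch (not part of the original test module): a callable
# ``skiprows`` receives each 0-based row number and skips the row when it
# returns True; row 0 is the header unless ``header``/``names`` say otherwise.
# The toy data is an assumption for this example.
def _example_skiprows_callable():
    from io import StringIO

    from pandas import read_csv

    data = "a\n1\n2\n3\n4\n5"
    # Keep only odd-numbered lines: line 1 ("1") becomes the header and
    # lines 3 and 5 become the data, mirroring the test above.
    return read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0)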
def test_skip_rows_skip_all(all_parsers):
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
msg = "No columns to parse from file"
with pytest.raises(EmptyDataError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: True)
def test_skip_rows_bad_callable(all_parsers):
msg = "by zero"
parser = all_parsers
data = "a\n1\n2\n3\n4\n5"
with pytest.raises(ZeroDivisionError, match=msg):
parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)

View File

@@ -0,0 +1,344 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import BytesIO, StringIO
import os
import numpy as np
from numpy import nan
import pytest
import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas import DataFrame
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal
from pandas.io.parsers import TextFileReader, read_csv
class TestTextReader:
@pytest.fixture(autouse=True)
def setup_method(self, datapath):
self.dirpath = datapath("io", "parser", "data")
self.csv1 = os.path.join(self.dirpath, "test1.csv")
self.csv2 = os.path.join(self.dirpath, "test2.csv")
self.xls1 = os.path.join(self.dirpath, "test.xls")
def test_file_handle(self):
with open(self.csv1, "rb") as f:
reader = TextReader(f)
reader.read()
def test_string_filename(self):
reader = TextReader(self.csv1, header=None)
reader.read()
def test_file_handle_mmap(self):
with open(self.csv1, "rb") as f:
reader = TextReader(f, memory_map=True, header=None)
reader.read()
def test_StringIO(self):
with open(self.csv1, "rb") as f:
text = f.read()
src = BytesIO(text)
reader = TextReader(src, header=None)
reader.read()
def test_string_factorize(self):
# should this be optional?
data = "a\nb\na\nb\na"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert len(set(map(id, result[0]))) == 2
def test_skipinitialspace(self):
data = "a, b\na, b\na, b\na, b"
reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
)
def test_parse_booleans(self):
data = "True\nFalse\nTrue\nTrue"
reader = TextReader(StringIO(data), header=None)
result = reader.read()
assert result[0].dtype == np.bool_
def test_delimit_whitespace(self):
data = 'a b\na\t\t "b"\n"a"\t \t b'
reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
result = reader.read()
tm.assert_numpy_array_equal(
result[0], np.array(["a", "a", "a"], dtype=np.object_)
)
tm.assert_numpy_array_equal(
result[1], np.array(["b", "b", "b"], dtype=np.object_)
)
def test_embedded_newline(self):
data = 'a\n"hello\nthere"\nthis'
reader = TextReader(StringIO(data), header=None)
result = reader.read()
expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
tm.assert_numpy_array_equal(result[0], expected)
def test_euro_decimal(self):
data = "12345,67\n345,678"
reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
result = reader.read()
expected = np.array([12345.67, 345.678])
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands(self):
data = "123,456\n12,500"
reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
result = reader.read()
expected = np.array([123456, 12500], dtype=np.int64)
tm.assert_almost_equal(result[0], expected)
def test_integer_thousands_alt(self):
data = "123.456\n12.500"
reader = TextFileReader(
StringIO(data), delimiter=":", thousands=".", header=None
)
result = reader.read()
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)
def test_skip_bad_lines(self, capsys):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
reader = TextReader(StringIO(data), delimiter=":", header=None)
msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
with pytest.raises(parser.ParserError, match=msg):
reader.read()
reader = TextReader(
StringIO(data),
delimiter=":",
header=None,
error_bad_lines=False,
warn_bad_lines=False,
)
result = reader.read()
expected = {
0: np.array(["a", "d", "g", "l"], dtype=object),
1: np.array(["b", "e", "h", "m"], dtype=object),
2: np.array(["c", "f", "i", "n"], dtype=object),
}
assert_array_dicts_equal(result, expected)
reader = TextReader(
StringIO(data),
delimiter=":",
header=None,
error_bad_lines=False,
warn_bad_lines=True,
)
reader.read()
captured = capsys.readouterr()
assert "Skipping line 4" in captured.err
assert "Skipping line 6" in captured.err
def test_header_not_enough_lines(self):
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
reader = TextReader(StringIO(data), delimiter=",", header=2)
header = reader.header
expected = [["a", "b", "c"]]
assert header == expected
recs = reader.read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array([2, 5], dtype=np.int64),
2: np.array([3, 6], dtype=np.int64),
}
assert_array_dicts_equal(recs, expected)
def test_escapechar(self):
data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"'
reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
result = reader.read()
expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
assert_array_dicts_equal(result, expected)
def test_eof_has_eol(self):
# handling of new line at EOF
pass
def test_na_substitution(self):
pass
def test_numpy_string_dtype(self):
data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""
def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=",", header=None, **kwds)
reader = _make_reader(dtype="S5,i4")
result = reader.read()
assert result[0].dtype == "S5"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
assert (result[0] == ex_values).all()
assert result[1].dtype == "i4"
reader = _make_reader(dtype="S4")
result = reader.read()
assert result[0].dtype == "S4"
ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
assert (result[0] == ex_values).all()
assert result[1].dtype == "S4"
def test_pass_dtype(self):
data = """\
one,two
1,a
2,b
3,c
4,d"""
def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(dtype={"one": "u1", 1: "S1"})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "S1"
reader = _make_reader(dtype={"one": np.uint8, 1: object})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
result = reader.read()
assert result[0].dtype == "u1"
assert result[1].dtype == "O"
def test_usecols(self):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
def _make_reader(**kwds):
return TextReader(StringIO(data), delimiter=",", **kwds)
reader = _make_reader(usecols=(1, 2))
result = reader.read()
exp = _make_reader().read()
assert len(result) == 2
assert (result[1] == exp[1]).all()
assert (result[2] == exp[2]).all()
def test_cr_delimited(self):
def _test(text, **kwargs):
nice_text = text.replace("\r", "\r\n")
result = TextReader(StringIO(text), **kwargs).read()
expected = TextReader(StringIO(nice_text), **kwargs).read()
assert_array_dicts_equal(result, expected)
data = "a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12"
_test(data, delimiter=",")
data = "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12"
_test(data, delim_whitespace=True)
data = "a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12"
_test(data, delimiter=",")
sample = (
"A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
"AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
)
_test(sample, delimiter=",")
data = "A B C\r 2 3\r4 5 6"
_test(data, delim_whitespace=True)
data = "A B C\r2 3\r4 5 6"
_test(data, delim_whitespace=True)
def test_empty_field_eof(self):
data = "a,b,c\n1,2,3\n4,,"
result = TextReader(StringIO(data), delimiter=",").read()
expected = {
0: np.array([1, 4], dtype=np.int64),
1: np.array(["2", ""], dtype=object),
2: np.array(["3", ""], dtype=object),
}
assert_array_dicts_equal(result, expected)
# GH5664
a = DataFrame([["b"], [nan]], columns=["a"], index=["a", "c"])
b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
c = DataFrame(
[[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]],
columns=list("abcd"),
index=[0, 5, 7, 12],
)
for _ in range(100):
df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
assert_frame_equal(df, a)
df = read_csv(
StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
)
assert_frame_equal(df, b)
df = read_csv(
StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
names=list("abcd"),
engine="c",
)
assert_frame_equal(df, c)
def test_empty_csv_input(self):
# GH14867
df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
assert isinstance(df, TextFileReader)
def assert_array_dicts_equal(left, right):
for k, v in left.items():
tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))
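# Illustrative sketch (not part of the original test module): the bad-line
# behaviour exercised above through TextReader is reachable from the public
# read_csv API of this pandas version via ``error_bad_lines`` and
# ``warn_bad_lines``. The toy data is an assumption for this example.
def _example_skip_bad_lines():
    from io import StringIO

    from pandas import read_csv

    data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10"
    # The four-field line is dropped instead of raising ParserError.
    return read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=False)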

View File

@@ -0,0 +1,123 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.
Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import pytest
from pandas.errors import ParserError
import pandas.util.testing as tm
import pandas.io.parsers as parsers
from pandas.io.parsers import read_csv
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
return request.param
class TestUnsupportedFeatures:
def test_mangle_dupe_cols_false(self):
# see gh-12935
data = "a b c\n1 2 3"
msg = "is not supported"
for engine in ("c", "python"):
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False)
def test_c_engine(self):
# see gh-6607
data = "a b c\n1 2 3"
msg = "does not support"
# specify C engine with unsupported options (raise)
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep=r"\s")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine="c", skipfooter=1)
# specify C-unsupported options without python-unsupported options
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep=None, delim_whitespace=False)
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep=r"\s")
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), sep="\t", quotechar=chr(128))
with tm.assert_produces_warning(parsers.ParserWarning):
read_csv(StringIO(data), skipfooter=1)
text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
msg = "Error tokenizing data"
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), sep="\\s+")
with pytest.raises(ParserError, match=msg):
read_csv(StringIO(text), engine="c", sep="\\s+")
msg = "Only length-1 thousands markers supported"
data = """A|B|C
1|2,334|5
10|13|10.
"""
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands=",,")
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), thousands="")
msg = "Only length-1 line terminators supported"
data = "a,b,c~~1,2,3~~4,5,6"
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), lineterminator="~~")
def test_python_engine(self, python_engine):
from pandas.io.parsers import _python_unsupported as py_unsupported
data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
for default in py_unsupported:
msg = (
"The {default!r} option is not supported with the {python_engine!r}"
" engine"
).format(default=default, python_engine=python_engine)
kwargs = {default: object()}
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=python_engine, **kwargs)
def test_python_engine_file_no_next(self, python_engine):
# see gh-16530
class NoNextBuffer:
def __init__(self, csv_data):
self.data = csv_data
def __iter__(self):
return self
def read(self):
return self.data
data = "a\n1"
msg = "The 'python' engine cannot iterate"
with pytest.raises(ValueError, match=msg):
read_csv(NoNextBuffer(data), engine=python_engine)
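# Illustrative sketch (not part of the original test module): combining a
# C-unsupported option with engine="c" raises, as asserted above, while
# leaving the engine unspecified lets pandas fall back to the Python engine
# and emit a ParserWarning. The toy data is an assumption for this example.
def _example_engine_fallback():
    import warnings
    from io import StringIO

    from pandas import read_csv

    data = "a b c\n1 2 3\n4 5 6"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # silence the ParserWarning here
        # skipfooter is not supported by the C engine, so pandas falls back
        # to the Python engine for this call.
        return read_csv(StringIO(data), sep=r"\s+", skipfooter=1)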

View File

@@ -0,0 +1,572 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs.tslib import Timestamp
from pandas import DataFrame, Index
import pandas.util.testing as tm
_msg_validate_usecols_arg = (
"'usecols' must either be list-like "
"of all strings, all unicode, all "
"integers or a callable."
)
_msg_validate_usecols_names = (
"Usecols do not match columns, columns expected but not found: {0}"
)
def test_raise_on_mixed_dtype_usecols(all_parsers):
# See gh-12678
data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
usecols = [0, "b", 2]
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
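# Illustrative sketch (not part of the original test module): ``usecols`` may
# be given either as 0-based positions or as column labels; both calls below
# select the same two columns. The toy data is an assumption for this example.
def _example_usecols_names_vs_positions():
    from io import StringIO

    from pandas import read_csv

    data = "a,b,c\n1,2,3\n4,5,6"
    by_position = read_csv(StringIO(data), usecols=[1, 2])
    by_name = read_csv(StringIO(data), usecols=["b", "c"])
    return by_position, by_name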
def test_usecols_with_names(all_parsers):
data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
names = ["foo", "bar"]
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)
def test_usecols_relative_to_names2(all_parsers):
# see gh-5766
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
result = parser.read_csv(
StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
)
expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_usecols_name_length_conflict(all_parsers):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
parser = all_parsers
msg = (
"Number of passed names did not match number of header fields in the file"
if parser.engine == "python"
else "Passed header names mismatches usecols"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
def test_usecols_single_string(all_parsers):
# see gh-20558
parser = all_parsers
data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols="foo")
@pytest.mark.parametrize(
"data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
# see gh-9082
parser = all_parsers
usecols = ["a", "c", "d"]
expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
tm.assert_frame_equal(result, expected)
def test_usecols_index_col_conflict2(all_parsers):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
expected = expected.set_index(["b", "c"])
result = parser.read_csv(
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
)
tm.assert_frame_equal(result, expected)
def test_usecols_implicit_index_col(all_parsers):
# see gh-2654
parser = all_parsers
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
result = parser.read_csv(StringIO(data), usecols=["a", "b"])
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_regex_sep(all_parsers):
# see gh-2733
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
def test_usecols_with_whitespace(all_parsers):
parser = all_parsers
data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b"))
expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
# Column selection by index.
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
# Column selection by name.
(["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])),
],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
parser = all_parsers
data = """2,0,1
1000,2000,3000
4000,5000,6000"""
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
# see gh-9755
data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
parser = all_parsers
parse_dates = [[1, 2]]
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates2(all_parsers):
# see gh-13604
parser = all_parsers
data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""
names = ["date", "values"]
usecols = names[:]
parse_dates = [0]
index = Index(
[
Timestamp("2008-02-07 09:40"),
Timestamp("2008-02-07 09:50"),
Timestamp("2008-02-07 10:00"),
],
name="date",
)
cols = {"values": [1032.43, 1042.54, 1051.65]}
expected = DataFrame(cols, index=index)
result = parser.read_csv(
StringIO(data),
parse_dates=parse_dates,
index_col=0,
usecols=usecols,
header=None,
names=names,
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
# see gh-14792
parser = all_parsers
data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
usecols = list("abcdefghij")
parse_dates = [0]
cols = {
"a": Timestamp("2016-09-21"),
"b": [1],
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
usecols = list("abcdefghij")
parse_dates = [[0, 1]]
parser = all_parsers
cols = {
"a_b": "2016/09/21 1",
"c": [1],
"d": [2],
"e": [3],
"f": [4],
"g": [5],
"h": [6],
"i": [7],
"j": [8],
}
expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
"names",
[
list("abcde"), # Names span all columns in original data.
list("acd"), # Names span only the selected columns.
],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
# see gh-9755
s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
parse_dates = [[1, 2]]
parser = all_parsers
cols = {
"a": [0, 0],
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
result = parser.read_csv(
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
)
tm.assert_frame_equal(result, expected)
def test_usecols_with_unicode_strings(all_parsers):
# see gh-13219
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"BBB": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
tm.assert_frame_equal(result, expected)
def test_usecols_with_single_byte_unicode_strings(all_parsers):
# see gh-13219
data = """A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"B": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
data = """AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
parser.read_csv(StringIO(data), usecols=usecols)
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
exp_data = {
"あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
"いい": {0: 8, 1: 2, 2: 7},
}
expected = DataFrame(exp_data)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
def test_empty_usecols(all_parsers):
data = "a,b,c\n1,2,3\n4,5,6"
expected = DataFrame()
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=set())
tm.assert_frame_equal(result, expected)
def test_np_array_usecols(all_parsers):
# see gh-12546
parser = all_parsers
data = "a,b,c\n1,2,3"
usecols = np.array(["a", "b"])
expected = DataFrame([[1, 2]], columns=usecols)
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,expected",
[
(
lambda x: x.upper() in ["AAA", "BBB", "DDD"],
DataFrame(
{
"AaA": {
0: 0.056674972999999997,
1: 2.6132309819999997,
2: 3.5689350380000002,
},
"bBb": {0: 8, 1: 2, 2: 7},
"ddd": {0: "a", 1: "b", 2: "a"},
}
),
),
(lambda x: False, DataFrame()),
],
)
def test_callable_usecols(all_parsers, usecols, expected):
# see gh-14154
data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
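# Illustrative sketch (not part of the original test module): a callable
# ``usecols`` is evaluated once per header label and keeps the columns for
# which it returns True, which makes case-insensitive selection easy. The toy
# data is an assumption for this example.
def _example_callable_usecols():
    from io import StringIO

    from pandas import read_csv

    data = "AaA,bBb,CCC\n1,2,3\n4,5,6"
    # Keep every column whose upper-cased label is in the wanted set.
    return read_csv(StringIO(data), usecols=lambda col: col.upper() in {"AAA", "CCC"})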
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
# see gh-6710
data = "1,2\n1,2,3"
parser = all_parsers
names = ["a", "b", "c"]
expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})
result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data,usecols,kwargs,expected",
[
# see gh-8985
(
"19,29,39\n" * 2 + "10,20,30,40",
[0, 1, 2],
dict(header=None),
DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
),
# see gh-9549
(
("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
["A", "B", "C"],
dict(),
DataFrame(
{
"A": [1, 3, 1, 1, 1, 5],
"B": [2, 4, 2, 2, 2, 6],
"C": [3, 5, 4, 3, 3, 7],
}
),
),
],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
# see gh-8985
parser = all_parsers
result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"usecols,kwargs,expected,msg",
[
(
["a", "b", "c", "d"],
dict(),
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
None,
),
(
["a", "b", "c", "f"],
dict(),
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(["a", "b", "f"], dict(), None, _msg_validate_usecols_names.format(r"\['f'\]")),
(
["a", "b", "f", "g"],
dict(),
None,
_msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
),
# see gh-14671
(
None,
dict(header=0, names=["A", "B", "C", "D"]),
DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
None,
),
(
["A", "B", "C", "f"],
dict(header=0, names=["A", "B", "C", "D"]),
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
(
["A", "B", "f"],
dict(names=["A", "B", "C", "D"]),
None,
_msg_validate_usecols_names.format(r"\['f'\]"),
),
],
)
def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
kwargs.update(usecols=usecols)
parser = all_parsers
if expected is None:
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
else:
result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(
reason="see gh-16469: works on the C engine but not the Python engine", strict=False
)
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
names = ["A", "B", "C", "D"]
parser = all_parsers
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
tm.assert_frame_equal(result, expected)