8th day of python challenges 111-117
@@ -0,0 +1,81 @@
import os

import pytest

from pandas import read_csv, read_table


class BaseParser:
    engine = None
    low_memory = True
    float_precision_choices = []

    def update_kwargs(self, kwargs):
        kwargs = kwargs.copy()
        kwargs.update(dict(engine=self.engine, low_memory=self.low_memory))

        return kwargs

    def read_csv(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_csv(*args, **kwargs)

    def read_table(self, *args, **kwargs):
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)


class CParser(BaseParser):
    engine = "c"
    float_precision_choices = [None, "high", "round_trip"]


class CParserHighMemory(CParser):
    low_memory = False


class CParserLowMemory(CParser):
    low_memory = True


class PythonParser(BaseParser):
    engine = "python"
    float_precision_choices = [None]


@pytest.fixture
def csv_dir_path(datapath):
    return datapath("io", "parser", "data")


@pytest.fixture
def csv1(csv_dir_path):
    return os.path.join(csv_dir_path, "test1.csv")


_cParserHighMemory = CParserHighMemory()
_cParserLowMemory = CParserLowMemory()
_pythonParser = PythonParser()

_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_all_parsers = _c_parsers_only + _py_parsers_only

_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_all_parser_ids = _c_parser_ids + _py_parser_ids


@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
    return request.param


@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
    return request.param


@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
    return request.param
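# ---------------------------------------------------------------------------
# Editor's note (a minimal sketch, not part of the original commit): how the
# fixtures above are consumed. A test that takes `all_parsers` runs once per
# engine configuration ("c_high", "c_low", "python"); `c_parser_only` and
# `python_parser_only` restrict it to a single engine. The test function
# below is a hypothetical illustration:
#
#     from io import StringIO
#     from pandas import DataFrame
#     import pandas.util.testing as tm
#
#     def test_example(all_parsers):
#         parser = all_parsers  # a BaseParser subclass instance
#         result = parser.read_csv(StringIO("a,b\n1,2"))
#         tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))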
@@ -0,0 +1,599 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to move as many of these tests
as possible out of this module as soon as the Python parser can accept
further arguments when parsing.
"""

from io import BytesIO, StringIO, TextIOWrapper
import mmap
import os
import tarfile

import numpy as np
import pytest

from pandas.errors import ParserError
import pandas.util._test_decorators as td

from pandas import DataFrame, concat
import pandas.util.testing as tm


@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    msg = "Buffer overflow caught - possible malformed input file."
    parser = c_parser_only

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(malformed))


def test_buffer_rd_bytes(c_parser_only):
    # see gh-12098: src->buffer in the C parser can be freed twice leading
    # to a segfault if a corrupt gzip file is read with 'read_csv', and the
    # buffer is filled more than once before gzip raises an Exception.

    data = (
        "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
        "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
        "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
        "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
    )
    parser = c_parser_only

    for _ in range(100):
        try:
            parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True)
        except Exception:
            pass


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
    parser = c_parser_only

    df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    tm.assert_frame_equal(df, expected)


def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(
        StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    with pytest.raises(ValueError, match="cannot safely convert"):
        parser.read_csv(
            StringIO(data),
            sep=r"\s+",
            header=None,
            names=["a", "b"],
            dtype={"a": np.int32},
        )


@pytest.mark.parametrize(
    "match,kwargs",
    [
        # For each of these cases, all of the dtypes are valid, just unsupported.
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            dict(dtype={"A": "datetime64", "B": "float64"}),
        ),
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            dict(dtype={"A": "datetime64", "B": "float64"}, parse_dates=["B"]),
        ),
        (
            "the dtype timedelta64 is not supported for parsing",
            dict(dtype={"A": "timedelta64", "B": "float64"}),
        ),
        ("the dtype <U8 is not supported for parsing", dict(dtype={"A": "U8"})),
    ],
    ids=["dt64-0", "dt64-1", "td64", "<U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
    parser = c_parser_only
    df = DataFrame(
        np.random.rand(5, 2), columns=list("AB"), index=["1A", "1B", "1C", "1D", "1E"]
    )

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        df.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)


@td.skip_if_32bit
def test_precise_conversion(c_parser_only):
    from decimal import Decimal

    parser = c_parser_only

    normal_errors = []
    precise_errors = []

    # test numbers between 1 and 2
    for num in np.linspace(1.0, 2.0, num=500):
        # 25 decimal digits of precision
        text = "a\n{0:.25}".format(num)

        normal_val = float(parser.read_csv(StringIO(text))["a"][0])
        precise_val = float(
            parser.read_csv(StringIO(text), float_precision="high")["a"][0]
        )
        roundtrip_val = float(
            parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
        )
        actual_val = Decimal(text[2:])

        def error(val):
            return abs(Decimal("{0:.100}".format(val)) - actual_val)

        normal_errors.append(error(normal_val))
        precise_errors.append(error(precise_val))

        # round-trip should match float()
        assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)


def test_usecols_dtypes(c_parser_only):
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(
        StringIO(data),
        usecols=(0, 1, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )
    result2 = parser.read_csv(
        StringIO(data),
        usecols=(0, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )

    assert (result.dtypes == [object, np.int, np.float]).all()
    assert (result2.dtypes == [object, np.float]).all()


def test_disable_bool_parsing(c_parser_only):
    # see gh-2090

    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""


def test_custom_lineterminator(c_parser_only):
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    result = parser.read_csv(StringIO(data), lineterminator="~")
    expected = parser.read_csv(StringIO(data.replace("~", "\n")))

    tm.assert_frame_equal(result, expected)


def test_parse_ragged_csv(c_parser_only):
    parser = c_parser_only
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(
        StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
    )

    expected = parser.read_csv(
        StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
    )

    tm.assert_frame_equal(result, expected)

    # too many columns: this would cause a segfault if not handled carefully
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None, names=range(50))
    expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
        columns=range(50)
    )

    tm.assert_frame_equal(result, expected)


def test_tokenize_CR_with_quoting(c_parser_only):
    # see gh-3453
    parser = c_parser_only
    data = ' a,b,c\r"a,b","e,d","f,f"'

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
    tm.assert_frame_equal(result, expected)


def test_grow_boundary_at_cap(c_parser_only):
    # See gh-12494
    #
    # Cause of error was that the C parser
    # was not increasing the buffer size when
    # the desired space would fill the buffer
    # to capacity, which would later cause a
    # buffer overflow error when checking the
    # EOF terminator of the CSV stream.
    parser = c_parser_only

    def test_empty_header_read(count):
        s = StringIO("," * count)
        expected = DataFrame(
            columns=["Unnamed: {i}".format(i=i) for i in range(count + 1)]
        )
        df = parser.read_csv(s)
        tm.assert_frame_equal(df, expected)

    for cnt in range(1, 101):
        test_empty_header_read(cnt)
def test_parse_trim_buffers(c_parser_only):
    # This test is part of a bugfix for gh-13703. It attempts to
    # stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of the parser otherwise modify the
    # contents of the memory space where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
    )

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating it `n_lines` times.
    row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
    expected = DataFrame(
        [row for _ in range(n_lines)], dtype=object, columns=None, index=None
    )

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = parser.read_csv(
        StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
    )
    result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = parser.read_csv(
        StringIO(csv_data),
        header=None,
        dtype=object,
        chunksize=chunksize,
        encoding="utf_8",
    )
    result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)


def test_internal_null_byte(c_parser_only):
    # see gh-14012
    #
    # The null byte ('\x00') should not be used as a
    # true line terminator, escape character, or comment
    # character, only as a placeholder to indicate that
    # none was specified.
    #
    # This test should be moved to test_common.py ONLY when
    # Python's csv class supports parsing '\x00'.
    parser = c_parser_only

    names = ["a", "b", "c"]
    data = "1,2,3\n4,\x00,6\n7,8,9"
    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)

    result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)


def test_read_nrows_large(c_parser_only):
    # gh-7626 - Read only nrows of data for large inputs (>262144 bytes)
    parser = c_parser_only
    header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
    data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
    header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
    data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
    test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2

    df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    assert df.size == 1010 * 10


def test_float_precision_round_trip_with_text(c_parser_only):
    # see gh-15140
    parser = c_parser_only
    df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))


def test_large_difference_in_columns(c_parser_only):
    # see gh-14125
    parser = c_parser_only

    count = 10000
    large_row = ("X," * count)[:-1] + "\n"
    normal_row = "XXXXXX XXXXXX,111111111111111\n"
    test_input = (large_row + normal_row * 6)[:-1]

    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
    rows = test_input.split("\n")

    expected = DataFrame([row.split(",")[0] for row in rows])
    tm.assert_frame_equal(result, expected)


def test_data_after_quote(c_parser_only):
    # see gh-15910
    parser = c_parser_only

    data = 'a\n1\n"b"a'
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"a": ["1", "ba"]})
    tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only, capsys):
    parser = c_parser_only
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    df = parser.read_csv(
        StringIO(test_input),
        comment="#",
        header=None,
        delimiter="\\s+",
        skiprows=0,
        error_bad_lines=False,
    )
    captured = capsys.readouterr()
    # skipped lines 2, 3, 4, 9
    for line_num in (2, 3, 4, 9):
        assert "Skipping line {}".format(line_num) in captured.err
    expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
    tm.assert_frame_equal(df, expected)


def test_file_like_no_next(c_parser_only):
    # gh-16530: the file-like need not have a "next" or "__next__"
    # attribute despite having an "__iter__" attribute.
    #
    # NOTE: This is only true for the C engine, not the Python engine.
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    parser = c_parser_only
    data = "a\n1"

    expected = DataFrame({"a": [1]})
    result = parser.read_csv(NoNextBuffer(data))

    tm.assert_frame_equal(result, expected)


def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    t = BytesIO(b"\xB0")
    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
    msg = "'utf-8' codec can't encode character"
    with pytest.raises(UnicodeError, match=msg):
        c_parser_only.read_csv(t, encoding="UTF-8")


@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
    # see gh-16530
    #
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    parser = c_parser_only
    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)

    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")

        out = parser.read_csv(data_file)
        expected = DataFrame({"a": [1]})
        tm.assert_frame_equal(out, expected)


@pytest.mark.high_memory
def test_bytes_exceed_2gb(c_parser_only):
    # see gh-16798
    #
    # Read from a "CSV" that has a column larger than 2GB.
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a high_memory test")

    csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]))
    df = parser.read_csv(csv)
    assert not df.empty


def test_chunk_whitespace_on_boundary(c_parser_only):
    # see gh-9735: this issue is C parser-specific (bug when
    # parsing whitespace and characters at chunk boundary)
    #
    # This test case has a field too large for the Python parser / CSV library.
    parser = c_parser_only

    chunk1 = "a" * (1024 * 256 - 2) + "\na"
    chunk2 = "\n a"
    result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)

    expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
    tm.assert_frame_equal(result, expected)


def test_file_handles_mmap(c_parser_only, csv1):
    # gh-14418
    #
    # Don't close user-provided file handles.
    parser = c_parser_only

    with open(csv1, "r") as f:
        m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        parser.read_csv(m)

        assert not m.closed
        m.close()


def test_file_binary_mode(c_parser_only):
    # see gh-23779
    parser = c_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            f.write("1,2,3\n4,5,6")

        with open(path, "rb") as f:
            result = parser.read_csv(f, header=None)
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,136 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), comment="#", na_values=na_values)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "read_kwargs", [dict(), dict(lineterminator="*"), dict(delim_whitespace=True)]
)
def test_line_comment(all_parsers, read_kwargs):
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
    elif read_kwargs.get("lineterminator"):
        if parser.engine != "c":
            pytest.skip("Custom terminator not supported with Python engine")

        data = data.replace("\n", read_kwargs.get("lineterminator"))

    read_kwargs["comment"] = "#"
    result = parser.read_csv(StringIO(data), **read_kwargs)

    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows(all_parsers):
    parser = all_parsers
    data = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # This should ignore the first four lines (including comments).
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4)
    tm.assert_frame_equal(result, expected)


def test_comment_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), comment="#", header=1)
    tm.assert_frame_equal(result, expected)


def test_comment_skiprows_header(all_parsers):
    parser = all_parsers
    data = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Skiprows should skip the first 4 lines (including comments),
    # while header should start from the second non-commented line,
    # starting with line 5.
    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    parser = all_parsers
    data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
    result = parser.read_csv(
        StringIO(data.replace("#", comment_char)), comment=comment_char
    )

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    # see gh-4623
    parser = all_parsers
    data = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is None:
        expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), comment="#", header=header)
    tm.assert_frame_equal(result, expected)
File diff suppressed because it is too large
@@ -0,0 +1,150 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""

import os
import zipfile

import pytest

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture(params=[True, False])
def buffer(request):
    return request.param


@pytest.fixture
def parser_and_data(all_parsers, csv1):
    parser = all_parsers

    with open(csv1, "rb") as f:
        data = f.read()
    expected = parser.read_csv(csv1)

    return parser, data, expected


@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
    parser, data, expected = parser_and_data

    with tm.ensure_clean("test_file.zip") as path:
        with zipfile.ZipFile(path, mode="w") as tmp:
            tmp.writestr("test_file", data)

        if compression == "zip2":
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression="zip")
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
    parser, data, expected = parser_and_data

    with tm.ensure_clean("combined_zip.zip") as path:
        inner_file_names = ["test_file", "second_file"]

        with zipfile.ZipFile(path, mode="w") as tmp:
            for file_name in inner_file_names:
                tmp.writestr(file_name, data)

        with pytest.raises(ValueError, match="Multiple files"):
            parser.read_csv(path, compression=compression)


def test_zip_error_no_files(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with zipfile.ZipFile(path, mode="w"):
            pass

        with pytest.raises(ValueError, match="Zero files"):
            parser.read_csv(path, compression="zip")


def test_zip_error_invalid_zip(parser_and_data):
    parser, _, _ = parser_and_data

    with tm.ensure_clean() as path:
        with open(path, "rb") as f:
            with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"):
                parser.read_csv(f, compression="zip")


@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(parser_and_data, compression_only, buffer, filename):
    parser, data, expected = parser_and_data
    compress_type = compression_only

    ext = "gz" if compress_type == "gzip" else compress_type
    filename = filename if filename is None else filename.format(ext=ext)

    if filename and buffer:
        pytest.skip("Cannot deduce compression from a buffer of compressed data.")

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, data)
        compression = "infer" if filename else compress_type

        if buffer:
            with open(path, "rb") as f:
                result = parser.read_csv(f, compression=compression)
        else:
            result = parser.read_csv(path, compression=compression)

        tm.assert_frame_equal(result, expected)
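# Editor's note (a sketch, not part of the original commit): with
# compression="infer", read_csv picks the codec from the file extension, so
# for a gzip-compressed file these two calls are equivalent (the path below
# is hypothetical):
#
#     pd.read_csv("data.csv.gz", compression="infer")
#     pd.read_csv("data.csv.gz", compression="gzip")
#
# The buffer/filename skip above exists because a raw buffer carries no
# extension to infer from.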


@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    # see gh-9770
    parser = all_parsers
    kwargs = dict(index_col=0, parse_dates=True)

    expected = parser.read_csv(csv1, **kwargs)
    kwargs["compression"] = "infer"

    if buffer:
        with open(csv1) as f:
            result = parser.read_csv(f, **kwargs)
    else:
        ext = "." + ext if ext else ""
        result = parser.read_csv(csv1 + ext, **kwargs)

    tm.assert_frame_equal(result, expected)


def test_compression_utf16_encoding(all_parsers, csv_dir_path):
    # see gh-18071
    parser = all_parsers
    path = os.path.join(csv_dir_path, "utf16_ex_small.zip")

    result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t")
    expected = pd.DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    parser = all_parsers
    compress_kwargs = dict(compression=invalid_compression)

    msg = "Unrecognized compression type: {compression}".format(**compress_kwargs)

    with pytest.raises(ValueError, match=msg):
        parser.read_csv("test_file.zip", **compress_kwargs)
@@ -0,0 +1,160 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

from dateutil.parser import parse
import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Index
import pandas.util.testing as tm


def test_converters_type_must_be_dict(all_parsers):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
"""

    with pytest.raises(TypeError, match="Type converters.+"):
        parser.read_csv(StringIO(data), converters=0)


@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
)
def test_converters(all_parsers, column, converter):
    parser = all_parsers
    data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    result = parser.read_csv(StringIO(data), converters={column: converter})

    expected = parser.read_csv(StringIO(data))
    expected["D"] = expected["D"].map(converter)

    tm.assert_frame_equal(result, expected)


def test_converters_no_implicit_conv(all_parsers):
    # see gh-2184
    parser = all_parsers
    data = """000102,1.2,A\n001245,2,B"""

    converters = {0: lambda x: x.strip()}
    result = parser.read_csv(StringIO(data), header=None, converters=converters)

    # Column 0 should not be cast to numeric and should remain as object.
    expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])
    tm.assert_frame_equal(result, expected)


def test_converters_euro_decimal_format(all_parsers):
    # see gh-583
    converters = dict()
    parser = all_parsers

    data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""
    converters["Number1"] = converters["Number2"] = converters[
        "Number3"
    ] = lambda x: float(x.replace(",", "."))

    result = parser.read_csv(StringIO(data), sep=";", converters=converters)
    expected = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
            [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )
    tm.assert_frame_equal(result, expected)


def test_converters_corner_with_nans(all_parsers):
    parser = all_parsers
    data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    # Intentionally identical to convert_days: both converters should
    # produce the same parsed result below.
    def convert_days_sentinel(x):
        x = x.strip()

        if not x:
            return np.nan

        is_plus = x.endswith("+")

        if is_plus:
            x = int(x[:-1]) + 1
        else:
            x = int(x)

        return x

    def convert_score(x):
        x = x.strip()

        if not x:
            return np.nan

        if x.find("-") > 0:
            val_min, val_max = map(int, x.split("-"))
            val = 0.5 * (val_min + val_max)
        else:
            val = float(x)

        return val

    results = []

    for day_converter in [convert_days, convert_days_sentinel]:
        result = parser.read_csv(
            StringIO(data),
            converters={"score": convert_score, "days": day_converter},
            na_values=["", None],
        )
        assert pd.isna(result["days"][1])
        results.append(result)

    tm.assert_frame_equal(results[0], results[1])


def test_converter_index_col_bug(all_parsers):
    # see gh-1835
    parser = all_parsers
    data = "A;B\n1;2\n3;4"

    rs = parser.read_csv(
        StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x}
    )

    xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A"))
    tm.assert_frame_equal(rs, xp)
@@ -0,0 +1,144 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.fixture
def custom_dialect():
    dialect_name = "weird"
    dialect_kwargs = dict(
        doublequote=False,
        escapechar="~",
        delimiter=":",
        skipinitialspace=False,
        quotechar="~",
        quoting=3,
    )
    return dialect_name, dialect_kwargs


def test_dialect(all_parsers):
    parser = all_parsers
    data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

    dia = csv.excel()
    dia.quoting = csv.QUOTE_NONE
    df = parser.read_csv(StringIO(data), dialect=dia)

    data = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    exp = parser.read_csv(StringIO(data))
    exp.replace("a", '"a', inplace=True)
    tm.assert_frame_equal(df, exp)


def test_dialect_str(all_parsers):
    dialect_name = "mydialect"
    parser = all_parsers
    data = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
    exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})

    with tm.with_csv_dialect(dialect_name, delimiter=":"):
        df = parser.read_csv(StringIO(data), dialect=dialect_name)
        tm.assert_frame_equal(df, exp)
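# Editor's note (a hedged sketch of the mechanism behind tm.with_csv_dialect;
# the assumption is that the helper wraps the stdlib registry, and the names
# below are standard csv-module APIs, not from the original commit).
# read_csv accepts any dialect registered by name with Python's csv module:
#
#     import csv
#     csv.register_dialect("mydialect", delimiter=":")
#     try:
#         df = parser.read_csv(StringIO(data), dialect="mydialect")
#     finally:
#         csv.unregister_dialect("mydialect")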


def test_invalid_dialect(all_parsers):
    class InvalidDialect:
        pass

    data = "a\n1"
    parser = all_parsers
    msg = "Invalid dialect"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dialect=InvalidDialect)


@pytest.mark.parametrize(
    "arg",
    [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    warning_klass = None
    kwds = dict()

    # arg=None tests when we pass in the dialect without any other arguments.
    if arg is not None:
        if value == "dialect":  # No conflict --> no warning.
            kwds[arg] = dialect_kwargs[arg]
        elif value == "default":  # Default --> no warning.
            from pandas.io.parsers import _parser_defaults

            kwds[arg] = _parser_defaults[arg]
        else:  # Non-default + conflict with dialect --> warning.
            warning_klass = ParserWarning
            kwds[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwds)
            tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,warning_klass",
    [
        (dict(sep=","), None),  # sep is default --> sep_override=True
        (dict(sep="."), ParserWarning),  # sep isn't default --> sep_override=False
        (dict(delimiter=":"), None),  # No conflict
        (dict(delimiter=None), None),  # Default arguments --> sep_override=True
        (dict(delimiter=","), ParserWarning),  # Conflict
        (dict(delimiter="."), ParserWarning),  # Conflict
    ],
    ids=[
        "sep-override-true",
        "sep-override-false",
        "delimiter-no-conflict",
        "delimiter-default-arg",
        "delimiter-conflict",
        "delimiter-conflict2",
    ],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwargs)
            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,552 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os

import numpy as np
import pytest

from pandas.errors import ParserWarning

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat
import pandas.util.testing as tm


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
def test_dtype_all_columns(all_parsers, dtype, check_orig):
    # see gh-3795, gh-6607
    parser = all_parsers

    df = DataFrame(
        np.random.rand(5, 2).round(4),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as path:
        df.to_csv(path)

        result = parser.read_csv(path, dtype=dtype, index_col=0)

        if check_orig:
            expected = df.copy()
            result = result.astype(float)
        else:
            expected = df.astype(str)

        tm.assert_frame_equal(result, expected)


def test_dtype_all_columns_empty(all_parsers):
    # see gh-12048
    parser = all_parsers
    result = parser.read_csv(StringIO("A,B"), dtype=str)

    expected = DataFrame({"A": [], "B": []}, index=[], dtype=str)
    tm.assert_frame_equal(result, expected)


def test_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    expected = DataFrame(
        [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
    )
    expected["one"] = expected["one"].astype(np.float64)
    expected["two"] = expected["two"].astype(object)

    result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
    tm.assert_frame_equal(result, expected)


def test_invalid_dtype_per_column(all_parsers):
    parser = all_parsers
    data = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    with pytest.raises(TypeError, match="data type 'foo' not understood"):
        parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})


@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["a", "a", "b"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
    )
    actual = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_unsorted(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", "b", "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_missing(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    expected = DataFrame(
        {
            "a": Categorical(["1", "1", "2"]),
            "b": Categorical(["b", np.nan, "a"]),
            "c": Categorical(["3.4", "3.4", "4.5"]),
        }
    )
    actual = parser.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(actual, expected)


@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    # see gh-18186
    parser = all_parsers
    data = np.sort([str(i) for i in range(524289)])
    expected = DataFrame({"a": Categorical(data, ordered=True)})

    actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
    actual["a"] = actual["a"].cat.reorder_categories(
        np.sort(actual.a.cat.categories), ordered=True
    )
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "unicode_series.csv")
    parser = all_parsers
    encoding = "latin-1"

    expected = parser.read_csv(pth, header=None, encoding=encoding)
    expected[1] = Categorical(expected[1])

    actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    # see gh-10153
    pth = os.path.join(csv_dir_path, "utf16_ex.txt")
    parser = all_parsers
    encoding = "utf-16"
    sep = ","

    expected = parser.read_csv(pth, sep=sep, encoding=encoding)
    expected = expected.apply(Categorical)

    actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
    tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]
    actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2)

    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    # see gh-10153
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3]
        ),
    ]
    dtype = CategoricalDtype(cats)
    actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2)

    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(
                ["a", "b", "b", "c"], categories=categories, ordered=ordered
            ),
        }
    )

    dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    dtype = CategoricalDtype(["c", "b", "a"])
    expected = DataFrame(
        {
            "a": [1, 1, 1, 2],
            "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
        }
    )

    result = parser.read_csv(StringIO(data), dtype={"b": dtype})
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([1, 2, 3])}

    data = "b\n1\n1\n2\n3"
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))}

    data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype([Timestamp("2014")])}

    data = "b\n2014-01-01\n2014-01-01T00:00:00"
    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}

    data = "b\n1H\n2H\n3H"
    expected = DataFrame({"b": Categorical(dtype["b"].categories)})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    # see gh-20498
    parser = all_parsers
    dtype = {"b": CategoricalDtype([False, True])}
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
    parser = all_parsers
    dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

    data = "b\nd\na\nc\nd"  # Unexpected c
    expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})

    result = parser.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)


def test_empty_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=np.object)},
        index=Index([], dtype=object),
    )
    tm.assert_frame_equal(result, expected)


def test_empty_with_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two"
    result = parser.read_csv(
        StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"}
    )

    expected = DataFrame(
        {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one")
    )
    tm.assert_frame_equal(result, expected)


def test_empty_with_multi_index_pass_dtype(all_parsers):
    parser = all_parsers

    data = "one,two,three"
    result = parser.read_csv(
        StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"}
    )

    exp_idx = MultiIndex.from_arrays(
        [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"]
    )
    expected = DataFrame({"three": np.empty(0, dtype=np.object)}, index=exp_idx)
    tm.assert_frame_equal(result, expected)


def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
        index=Index([], dtype=object),
    )
    tm.assert_frame_equal(result, expected)


def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    parser = all_parsers

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")},
        index=Index([], dtype=object),
    )
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )
    expected.index = expected.index.astype(object)

    data = "one,one"
    result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"})
    tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
    # see gh-9424
    parser = all_parsers
    expected = concat(
        [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")],
        axis=1,
    )
    expected.index = expected.index.astype(object)

    with pytest.raises(ValueError, match="Duplicate names"):
        data = ""
        parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})


def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    # see gh-2631
    parser = all_parsers
    data = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    msg = (
        "Integer column has NA values"
        if parser.engine == "c"
        else "Unable to convert column DOY"
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)


def test_dtype_with_converters(all_parsers):
    parser = all_parsers
    data = """a,b
1.1,2.2
1.2,2.3"""

    # Dtype spec is ignored if converters are specified.
    with tm.assert_produces_warning(ParserWarning):
        result = parser.read_csv(
            StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)}
        )
    expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
        ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])),
        (
            dict(a="category", b="category"),
            DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]),
        ),
        ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
        (
            "timedelta64[ns]",
            DataFrame(
                {
                    "a": Series([], dtype="timedelta64[ns]"),
                    "b": Series([], dtype="timedelta64[ns]"),
                },
                index=[],
            ),
        ),
        (
            dict(a=np.int64, b=np.int32),
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
        (
            {0: np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
        (
            {"a": np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
                index=[],
            ),
        ),
    ],
)
def test_empty_dtype(all_parsers, dtype, expected):
    # see gh-14712
    parser = all_parsers
    data = "a,b"

    result = parser.read_csv(StringIO(data), header=0, dtype=dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
    data = "0\n1"
    parser = all_parsers
    expected = DataFrame([0, 1], dtype=dtype)

    result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
    tm.assert_frame_equal(expected, result)
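

def _sketch_empty_dtype():
    # Hedged sketch, not part of the original commit: with a header-only CSV,
    # dtype= still fixes the dtypes of the resulting empty columns, which is
    # what the empty-dtype tests above assert. Public pandas API only.
    from io import StringIO

    import pandas as pd

    frame = pd.read_csv(StringIO("one,two"), dtype={"one": "u1"})
    assert len(frame) == 0 and frame["one"].dtype == "uint8"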
@@ -0,0 +1,513 @@
"""
|
||||
Tests that the file header is properly handled or inferred
|
||||
during parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_read_with_bad_header(all_parsers):
|
||||
parser = all_parsers
|
||||
msg = r"but only \d+ lines in file"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
s = StringIO(",,")
|
||||
parser.read_csv(s, header=[10])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("header", [True, False])
|
||||
def test_bool_header_arg(all_parsers, header):
|
||||
# see gh-6114
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
MyColumn
|
||||
a
|
||||
b
|
||||
a
|
||||
b"""
|
||||
msg = "Passing a bool to header is invalid"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
parser.read_csv(StringIO(data), header=header)
|
||||
|
||||
|
||||
def test_no_header_prefix(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """1,2,3,4,5
|
||||
6,7,8,9,10
|
||||
11,12,13,14,15
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data), prefix="Field", header=None)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
|
||||
columns=["Field0", "Field1", "Field2", "Field3", "Field4"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_with_index_col(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
names = ["A", "B", "C"]
|
||||
result = parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_header_not_first_line(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """got,to,ignore,this,line
|
||||
got,to,ignore,this,line
|
||||
index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
"""
|
||||
data2 = """index,A,B,C,D
|
||||
foo,2,3,4,5
|
||||
bar,7,8,9,10
|
||||
baz,12,13,14,15
|
||||
"""
|
||||
|
||||
result = parser.read_csv(StringIO(data), header=2, index_col=0)
|
||||
expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
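def _sketch_header_not_first_line():
    # Hedged sketch, not part of the original commit: with header=2, read_csv
    # discards the first two physical rows and uses the third as the column
    # header, which is what test_header_not_first_line above verifies.
    from io import StringIO as _StringIO

    import pandas as pd

    raw = "junk,junk\njunk,junk\ncol_a,col_b\n1,2\n"
    frame = pd.read_csv(_StringIO(raw), header=2)
    assert list(frame.columns) == ["col_a", "col_b"]

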
def test_header_multi_index(all_parsers):
    parser = all_parsers
    expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (
            dict(index_col=["foo", "bar"]),
            (
                "index_col must only contain "
                "row numbers when specifying "
                "a multi-index header"
            ),
        ),
        (
            dict(index_col=[0, 1], names=["foo", "bar"]),
            ("cannot specify names when specifying a multi-index header"),
        ),
        (
            dict(index_col=[0, 1], usecols=["foo", "bar"]),
            ("cannot specify usecols when specifying a multi-index header"),
        ),
    ],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)


_TestTuple = namedtuple("names", ["first", "second"])


@pytest.mark.parametrize(
    "kwargs",
    [
        dict(header=[0, 1]),
        dict(
            skiprows=3,
            names=[
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        ),
        dict(
            skiprows=3,
            names=[
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        ),
    ],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
,,,,,,
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs",
    [
        dict(header=[0, 1]),
        dict(
            skiprows=2,
            names=[
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        ),
        dict(
            skiprows=2,
            names=[
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        ),
    ],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    data = """,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs",
    [
        dict(header=[0, 1]),
        dict(
            skiprows=2,
            names=[
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        ),
        dict(
            skiprows=2,
            names=[
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        ),
    ],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
    parser = all_parsers
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(
            [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
        ),
    )
    expected = expected.reset_index(drop=True)
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_header_multi_index_common_format_malformed1(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
            codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
            names=["a", "q"],
        ),
    )
    data = """a,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


def test_header_multi_index_common_format_malformed2(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
        index=Index([1, 7]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
            codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
            names=[None, "q"],
        ),
    )

    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
    tm.assert_frame_equal(expected, result)


def test_header_multi_index_common_format_malformed3(all_parsers):
    parser = all_parsers
    expected = DataFrame(
        np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
        index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
        columns=MultiIndex(
            levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
            codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
            names=[None, "q"],
        ),
    )
    data = """,a,a,b,c,c
q,r,s,t,u,v
1,2,3,4,5,6
7,8,9,10,11,12"""

    result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
    tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header):
    # see gh-2539
    parser = all_parsers
    expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)])
def test_read_only_header_no_rows(all_parsers, kwargs):
    # See gh-7773
    parser = all_parsers
    expected = DataFrame(columns=["a", "b", "c"])

    result = parser.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,names",
    [
        (dict(), [0, 1, 2, 3, 4]),
        (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]),
        (
            dict(names=["foo", "bar", "baz", "quux", "panda"]),
            ["foo", "bar", "baz", "quux", "panda"],
        ),
    ],
)
def test_no_header(all_parsers, kwargs, names):
    parser = all_parsers
    data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    expected = DataFrame(
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
    )
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
    # see gh-16338
    msg = "header must be integer or list of integers"
    data = """1,2\n3,4"""
    parser = all_parsers

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=header)


def test_singleton_header(all_parsers):
    # see gh-7757
    data = """a,b,c\n0,1,2\n1,2,3"""
    parser = all_parsers

    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    result = parser.read_csv(StringIO(data), header=[0])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,expected",
    [
        (
            "A,A,A,B\none,one,one,two\n0,40,34,0.1",
            DataFrame(
                [[0, 40, 34, 0.1]],
                columns=MultiIndex.from_tuples(
                    [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
                ),
            ),
        ),
        (
            "A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
            DataFrame(
                [[0, 40, 34, 0.1]],
                columns=MultiIndex.from_tuples(
                    [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
                ),
            ),
        ),
        (
            "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
            DataFrame(
                [[0, 40, 34, 0.1, 0.1]],
                columns=MultiIndex.from_tuples(
                    [
                        ("A", "one"),
                        ("A", "one.1"),
                        ("A", "one.1.1"),
                        ("B", "two"),
                        ("B", "two.1"),
                    ]
                ),
            ),
        ),
    ],
)
def test_mangles_multi_index(all_parsers, data, expected):
    # see gh-18062
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=[0, 1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
    "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
    # see gh-23687
    #
    # When specifying a multi-index header, make sure that
    # we don't error just because one of the rows in our header
    # has ALL column names containing the string "Unnamed". The
    # correct condition to check is whether the row contains
    # ALL columns that did not have names (and instead were given
    # placeholder ones).
    parser = all_parsers
    header = [0, 1]

    if index_col is None:
        data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
    else:
        data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"

    if columns is None:
        msg = (
            r"Passed header=\[0,1\] are too "
            r"many rows for this multi_index of columns"
        )
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), header=header, index_col=index_col)
    else:
        result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
        template = "Unnamed: {i}_level_0"
        exp_columns = []

        for i, col in enumerate(columns):
            if not col:  # Unnamed.
                col = template.format(i=i if index_col is None else i + 1)

            exp_columns.append(col)

        columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
        expected = DataFrame([[2, 3], [4, 5]], columns=columns)
        tm.assert_frame_equal(result, expected)
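

def _sketch_multi_index_header():
    # Hedged sketch, not part of the original commit: header=[0, 1] stacks two
    # header rows into MultiIndex columns, as the tests above exercise.
    from io import StringIO

    import pandas as pd

    frame = pd.read_csv(StringIO("A,A,B\none,two,one\n1,2,3"), header=[0, 1])
    assert list(frame.columns) == [("A", "one"), ("A", "two"), ("B", "one")]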
@@ -0,0 +1,176 @@
"""
|
||||
Tests that the specified index column (a.k.a "index_col")
|
||||
is properly handled or inferred during parsing for all of
|
||||
the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_header", [True, False])
|
||||
def test_index_col_named(all_parsers, with_header):
|
||||
parser = all_parsers
|
||||
no_header = """\
|
||||
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
|
||||
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
|
||||
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
|
||||
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
|
||||
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
|
||||
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa
|
||||
header = (
|
||||
"ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
|
||||
) # noqa
|
||||
|
||||
if with_header:
|
||||
data = header + no_header
|
||||
|
||||
result = parser.read_csv(StringIO(data), index_col="ID")
|
||||
expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
data = no_header
|
||||
msg = "Index ID invalid"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), index_col="ID")
|
||||
|
||||
|
||||
def test_index_col_named2(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """\
|
||||
1,2,3,4,hello
|
||||
5,6,7,8,world
|
||||
9,10,11,12,foo
|
||||
"""
|
||||
|
||||
expected = DataFrame(
|
||||
{"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
|
||||
index=Index(["hello", "world", "foo"], name="message"),
|
||||
)
|
||||
names = ["a", "b", "c", "d", "message"]
|
||||
|
||||
result = parser.read_csv(StringIO(data), names=names, index_col=["message"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_col_is_true(all_parsers):
|
||||
# see gh-9798
|
||||
data = "a,b\n1,2"
|
||||
parser = all_parsers
|
||||
|
||||
msg = "The value of index_col couldn't be 'True'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), index_col=True)
|
||||
|
||||
|
||||
def test_infer_index_col(all_parsers):
|
||||
data = """A,B,C
|
||||
foo,1,2,3
|
||||
bar,4,5,6
|
||||
baz,7,8,9
|
||||
"""
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data))
|
||||
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
index=["foo", "bar", "baz"],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_col,kwargs",
|
||||
[
|
||||
(None, dict(columns=["x", "y", "z"])),
|
||||
(False, dict(columns=["x", "y", "z"])),
|
||||
(0, dict(columns=["y", "z"], index=Index([], name="x"))),
|
||||
(1, dict(columns=["x", "z"], index=Index([], name="y"))),
|
||||
("x", dict(columns=["y", "z"], index=Index([], name="x"))),
|
||||
("y", dict(columns=["x", "z"], index=Index([], name="y"))),
|
||||
(
|
||||
[0, 1],
|
||||
dict(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
|
||||
),
|
||||
),
|
||||
(
|
||||
["x", "y"],
|
||||
dict(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
|
||||
),
|
||||
),
|
||||
(
|
||||
[1, 0],
|
||||
dict(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
|
||||
),
|
||||
),
|
||||
(
|
||||
["y", "x"],
|
||||
dict(
|
||||
columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_index_col_empty_data(all_parsers, index_col, kwargs):
|
||||
data = "x,y,z"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=index_col)
|
||||
|
||||
expected = DataFrame(**kwargs)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_with_index_col_false(all_parsers):
|
||||
# see gh-10413
|
||||
data = "x,y"
|
||||
parser = all_parsers
|
||||
result = parser.read_csv(StringIO(data), index_col=False)
|
||||
|
||||
expected = DataFrame(columns=["x", "y"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_names",
|
||||
[
|
||||
["", ""],
|
||||
["foo", ""],
|
||||
["", "bar"],
|
||||
["foo", "bar"],
|
||||
["NotReallyUnnamed", "Unnamed: 0"],
|
||||
],
|
||||
)
|
||||
def test_multi_index_naming(all_parsers, index_names):
|
||||
parser = all_parsers
|
||||
|
||||
# We don't want empty index names being replaced with "Unnamed: 0"
|
||||
data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
|
||||
result = parser.read_csv(StringIO(data), index_col=[0, 1])
|
||||
|
||||
expected = DataFrame(
|
||||
{"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]])
|
||||
)
|
||||
expected.index.names = [name if name else None for name in index_names]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multi_index_naming_not_all_at_beginning(all_parsers):
|
||||
parser = all_parsers
|
||||
data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
|
||||
result = parser.read_csv(StringIO(data), index_col=[0, 2])
|
||||
|
||||
expected = DataFrame(
|
||||
{"Unnamed: 2": ["c", "d", "c", "d"]},
|
||||
index=MultiIndex(
|
||||
levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
|
||||
),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
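

def _sketch_infer_index_col():
    # Hedged sketch, not part of the original commit: when every data row has
    # one more field than the header row, read_csv treats the extra leading
    # field as the index, as test_infer_index_col above checks.
    from io import StringIO

    import pandas as pd

    frame = pd.read_csv(StringIO("A,B\nrow1,1,2\nrow2,3,4"))
    assert list(frame.index) == ["row1", "row2"]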
@@ -0,0 +1,132 @@
"""
|
||||
Tests that duplicate columns are handled appropriately when parsed by the
|
||||
CSV engine. In general, the expected result is that they are either thoroughly
|
||||
de-duplicated (if mangling requested) or ignored otherwise.
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)])
|
||||
def test_basic(all_parsers, kwargs):
|
||||
# TODO: add test for condition "mangle_dupe_cols=False"
|
||||
# once it is actually supported (gh-12935)
|
||||
parser = all_parsers
|
||||
|
||||
data = "a,a,b,b,b\n1,2,3,4,5"
|
||||
result = parser.read_csv(StringIO(data), sep=",", **kwargs)
|
||||
|
||||
expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_basic_names(all_parsers):
|
||||
# See gh-7160
|
||||
parser = all_parsers
|
||||
|
||||
data = "a,b,a\n0,1,2\n3,4,5"
|
||||
expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_basic_names_raise(all_parsers):
|
||||
# See gh-7160
|
||||
parser = all_parsers
|
||||
|
||||
data = "0,1,2\n3,4,5"
|
||||
with pytest.raises(ValueError, match="Duplicate names"):
|
||||
parser.read_csv(StringIO(data), names=["a", "b", "a"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,expected",
|
||||
[
|
||||
("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])),
|
||||
(
|
||||
"a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
|
||||
DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6]],
|
||||
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
|
||||
DataFrame(
|
||||
[[1, 2, 3, 4, 5, 6, 7]],
|
||||
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_thorough_mangle_columns(all_parsers, data, expected):
|
||||
# see gh-17060
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,names,expected",
|
||||
[
|
||||
(
|
||||
"a,b,b\n1,2,3",
|
||||
["a.1", "a.1", "a.1.1"],
|
||||
DataFrame(
|
||||
[["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,b,c,d,e,f\n1,2,3,4,5,6",
|
||||
["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
|
||||
DataFrame(
|
||||
[["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
|
||||
columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
|
||||
),
|
||||
),
|
||||
(
|
||||
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
|
||||
["a", "a", "a.3", "a.1", "a.2", "a", "a"],
|
||||
DataFrame(
|
||||
[
|
||||
["a", "b", "c", "d", "e", "f", "g"],
|
||||
["1", "2", "3", "4", "5", "6", "7"],
|
||||
],
|
||||
columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_thorough_mangle_names(all_parsers, data, names, expected):
|
||||
# see gh-17095
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match="Duplicate names"):
|
||||
parser.read_csv(StringIO(data), names=names)
|
||||
|
||||
|
||||
def test_mangled_unnamed_placeholders(all_parsers):
|
||||
# xref gh-13017
|
||||
orig_key = "0"
|
||||
parser = all_parsers
|
||||
|
||||
orig_value = [1, 2, 3]
|
||||
df = DataFrame({orig_key: orig_value})
|
||||
|
||||
# This test recursively updates `df`.
|
||||
for i in range(3):
|
||||
expected = DataFrame()
|
||||
|
||||
for j in range(i + 1):
|
||||
expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
|
||||
|
||||
expected[orig_key] = orig_value
|
||||
df = parser.read_csv(StringIO(df.to_csv()))
|
||||
|
||||
tm.assert_frame_equal(df, expected)
|
||||
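

def _sketch_mangle_dupe_cols():
    # Hedged sketch, not part of the original commit: duplicate header names
    # are de-duplicated by appending ".1", ".2", ..., matching the
    # expectations in the tests above.
    from io import StringIO

    import pandas as pd

    frame = pd.read_csv(StringIO("a,a,b\n1,2,3"))
    assert list(frame.columns) == ["a", "a.1", "b"]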
@@ -0,0 +1,146 @@
"""
|
||||
Tests multithreading behaviour for reading and
|
||||
parsing files for each parser defined in parsers.py
|
||||
"""
|
||||
from io import BytesIO
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def _construct_dataframe(num_rows):
|
||||
"""
|
||||
Construct a DataFrame for testing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
num_rows : int
|
||||
The number of rows for our DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde"))
|
||||
df["foo"] = "foo"
|
||||
df["bar"] = "bar"
|
||||
df["baz"] = "baz"
|
||||
df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
|
||||
df["int"] = np.arange(num_rows, dtype="int64")
|
||||
return df
|
||||
|
||||
|
||||
def test_multi_thread_string_io_read_csv(all_parsers):
|
||||
# see gh-11786
|
||||
parser = all_parsers
|
||||
max_row_range = 10000
|
||||
num_files = 100
|
||||
|
||||
bytes_to_df = [
|
||||
"\n".join(
|
||||
["{i:d},{i:d},{i:d}".format(i=i) for i in range(max_row_range)]
|
||||
).encode()
|
||||
for _ in range(num_files)
|
||||
]
|
||||
files = [BytesIO(b) for b in bytes_to_df]
|
||||
|
||||
# Read all files in many threads.
|
||||
pool = ThreadPool(8)
|
||||
|
||||
results = pool.map(parser.read_csv, files)
|
||||
first_result = results[0]
|
||||
|
||||
for result in results:
|
||||
tm.assert_frame_equal(first_result, result)
|
||||
|
||||
|
||||
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
|
||||
"""
|
||||
Generate a DataFrame via multi-thread.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
parser : BaseParser
|
||||
The parser object to use for reading the data.
|
||||
path : str
|
||||
The location of the CSV file to read.
|
||||
num_rows : int
|
||||
The number of rows to read per task.
|
||||
num_tasks : int
|
||||
The number of tasks to use for reading this DataFrame.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
|
||||
def reader(arg):
|
||||
"""
|
||||
Create a reader for part of the CSV.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arg : tuple
|
||||
A tuple of the following:
|
||||
|
||||
* start : int
|
||||
The starting row to start for parsing CSV
|
||||
* nrows : int
|
||||
The number of rows to read.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
"""
|
||||
start, nrows = arg
|
||||
|
||||
if not start:
|
||||
return parser.read_csv(
|
||||
path, index_col=0, header=0, nrows=nrows, parse_dates=["date"]
|
||||
)
|
||||
|
||||
return parser.read_csv(
|
||||
path,
|
||||
index_col=0,
|
||||
header=None,
|
||||
skiprows=int(start) + 1,
|
||||
nrows=nrows,
|
||||
parse_dates=[9],
|
||||
)
|
||||
|
||||
tasks = [
|
||||
(num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks)
|
||||
]
|
||||
|
||||
pool = ThreadPool(processes=num_tasks)
|
||||
results = pool.map(reader, tasks)
|
||||
|
||||
header = results[0].columns
|
||||
|
||||
for r in results[1:]:
|
||||
r.columns = header
|
||||
|
||||
final_dataframe = pd.concat(results)
|
||||
return final_dataframe
|
||||
|
||||
|
||||
def test_multi_thread_path_multipart_read_csv(all_parsers):
|
||||
# see gh-11786
|
||||
num_tasks = 4
|
||||
num_rows = 100000
|
||||
|
||||
parser = all_parsers
|
||||
file_name = "__thread_pool_reader__.csv"
|
||||
df = _construct_dataframe(num_rows)
|
||||
|
||||
with tm.ensure_clean(file_name) as path:
|
||||
df.to_csv(path)
|
||||
|
||||
final_dataframe = _generate_multi_thread_dataframe(
|
||||
parser, path, num_rows, num_tasks
|
||||
)
|
||||
tm.assert_frame_equal(df, final_dataframe)
|
||||
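

def _sketch_thread_pool_read():
    # Hedged sketch, not part of the original commit: the pattern above maps
    # read_csv over many independent buffers with a thread pool and then
    # concatenates the results. Standard library and pandas only.
    from io import BytesIO
    from multiprocessing.pool import ThreadPool

    import pandas as pd

    buffers = [BytesIO(b"a,b\n1,2\n3,4\n") for _ in range(4)]
    with ThreadPool(2) as pool:
        frames = pool.map(pd.read_csv, buffers)
    assert len(pd.concat(frames, ignore_index=True)) == 8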
@@ -0,0 +1,538 @@
"""
|
||||
Tests that NA values are properly handled during
|
||||
parsing for all of the parsers defined in parsers.py
|
||||
"""
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
import pandas.io.common as com
|
||||
|
||||
|
||||
def test_string_nas(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B,C
|
||||
a,b,c
|
||||
d,,f
|
||||
,g,h
|
||||
"""
|
||||
result = parser.read_csv(StringIO(data))
|
||||
expected = DataFrame(
|
||||
[["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
|
||||
columns=["A", "B", "C"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_detect_string_na(all_parsers):
|
||||
parser = all_parsers
|
||||
data = """A,B
|
||||
foo,bar
|
||||
NA,baz
|
||||
NaN,nan
|
||||
"""
|
||||
expected = DataFrame(
|
||||
[["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"]
|
||||
)
|
||||
result = parser.read_csv(StringIO(data))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"na_values",
|
||||
[
|
||||
["-999.0", "-999"],
|
||||
[-999, -999.0],
|
||||
[-999.0, -999],
|
||||
["-999.0"],
|
||||
["-999"],
|
||||
[-999.0],
|
||||
[-999],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
"""A,B
|
||||
-999,1.2
|
||||
2,-999
|
||||
3,4.5
|
||||
""",
|
||||
"""A,B
|
||||
-999,1.200
|
||||
2,-999.000
|
||||
3,4.500
|
||||
""",
|
||||
],
|
||||
)
|
||||
def test_non_string_na_values(all_parsers, data, na_values):
|
||||
# see gh-3611: with an odd float format, we can't match
|
||||
# the string "999.0" exactly but still need float matching
|
||||
parser = all_parsers
|
||||
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"])
|
||||
|
||||
result = parser.read_csv(StringIO(data), na_values=na_values)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_default_na_values(all_parsers):
|
||||
_NA_VALUES = {
|
||||
"-1.#IND",
|
||||
"1.#QNAN",
|
||||
"1.#IND",
|
||||
"-1.#QNAN",
|
||||
"#N/A",
|
||||
"N/A",
|
||||
"n/a",
|
||||
"NA",
|
||||
"#NA",
|
||||
"NULL",
|
||||
"null",
|
||||
"NaN",
|
||||
"nan",
|
||||
"-NaN",
|
||||
"-nan",
|
||||
"#N/A N/A",
|
||||
"",
|
||||
}
|
||||
assert _NA_VALUES == com._NA_VALUES
|
||||
|
||||
parser = all_parsers
|
||||
nv = len(_NA_VALUES)
|
||||
|
||||
def f(i, v):
|
||||
if i == 0:
|
||||
buf = ""
|
||||
elif i > 0:
|
||||
buf = "".join([","] * i)
|
||||
|
||||
buf = "{0}{1}".format(buf, v)
|
||||
|
||||
if i < nv - 1:
|
||||
buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1)))
|
||||
|
||||
return buf
|
||||
|
||||
data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
|
||||
expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
|
||||
|
||||
result = parser.read_csv(data, header=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
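def _sketch_custom_na_values():
    # Hedged sketch, not part of the original commit: na_values extends the
    # default sentinel set above, while keep_default_na=False replaces it
    # entirely; the tests below exercise both behaviours.
    from io import StringIO

    import pandas as pd

    raw = "a,b\nmissing,NA"
    extended = pd.read_csv(StringIO(raw), na_values=["missing"])
    assert extended.isna().all(axis=None)

    replaced = pd.read_csv(StringIO(raw), na_values=["missing"], keep_default_na=False)
    assert replaced["b"].iloc[0] == "NA"

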
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    parser = all_parsers
    data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    expected = DataFrame(
        [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
    )
    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
    tm.assert_frame_equal(result, expected)


def test_bool_na_values(all_parsers):
    data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "A": np.array([True, np.nan, False], dtype=object),
            "B": np.array([False, True, np.nan], dtype=object),
            "C": [True, False, True],
        }
    )
    tm.assert_frame_equal(result, expected)


def test_na_value_dict(all_parsers):
    data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    parser = all_parsers
    df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
    expected = DataFrame(
        {
            "A": [np.nan, "bar", np.nan, "bar"],
            "B": [np.nan, "foo", np.nan, "foo"],
            "C": [np.nan, "foo", np.nan, "foo"],
        }
    )
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize(
    "index_col,expected",
    [
        (
            [0],
            DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
        ),
        (
            [0, 2],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
        (
            ["a", "c"],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
    ],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    data = """\
a,b,c,d
0,NA,1,5
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        (
            dict(),
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
        (
            dict(na_values={"A": [], "C": []}, keep_default_na=False),
            DataFrame(
                {
                    "A": ["a", "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            dict(na_values=["a"], keep_default_na=False),
            DataFrame(
                {
                    "A": [np.nan, "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            dict(na_values={"A": [], "C": []}),
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
    ],
)
def test_na_values_keep_default(all_parsers, kwargs, expected):
    data = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields "None" as a na_value
    data = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), keep_default_na=False)

    expected = DataFrame(
        {
            "A": ["a", "b", "", "d", "e", "nan", "g"],
            "B": [1, 2, 3, 4, 5, 6, 7],
            "C": ["None", "two", "None", "nan", "five", "", "seven"],
        }
    )
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_values(all_parsers):
    # see gh-19227
    data = "a,b\n,2"
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), na_values={"b": ["2"]}, keep_default_na=False
    )
    expected = DataFrame({"a": [""], "b": [np.nan]})
    tm.assert_frame_equal(result, expected)


def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227
    #
    # Scalar values shouldn't cause the parsing to crash or fail.
    data = "a,b\n1,2"
    parser = all_parsers
    df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False)
    expected = DataFrame({"a": [1], "b": [np.nan]})
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
    # see gh-19227
    data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    parser = all_parsers
    expected = DataFrame(
        {
            0: [np.nan, 729639.0],
            1: [np.nan, "qwer"],
            2: ["/blaha", np.nan],
            3: ["kjsdkj", "asdfkj"],
            4: [412.166, 466.681],
            5: ["225.874", ""],
            6: [np.nan, 252.373],
        }
    )

    result = parser.read_csv(
        StringIO(data),
        header=None,
        keep_default_na=False,
        na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values},
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "na_filter,row_data",
    [
        (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
        (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
    ],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    data = """\
A,B
1,A
nan,B
3,C
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

    expected = DataFrame(row_data, columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


def test_na_trailing_columns(all_parsers):
    parser = all_parsers
    data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""

    # Trailing columns should be all NaN.
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        [
            ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan],
            ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan],
        ],
        columns=[
            "Date",
            "Currency",
            "Symbol",
            "Type",
            "Units",
            "UnitPrice",
            "Cost",
            "Tax",
        ],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "na_values,row_data",
    [
        (1, [[np.nan, 2.0], [2.0, np.nan]]),
        ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
    ],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
    # see gh-12224
    parser = all_parsers
    names = ["a", "b"]
    data = "1,2\n2,1"

    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)
    expected = DataFrame(row_data, columns=names)
    tm.assert_frame_equal(result, expected)


def test_na_values_dict_aliasing(all_parsers):
    parser = all_parsers
    na_values = {"a": 2, "b": 1}
    na_values_copy = na_values.copy()

    names = ["a", "b"]
    data = "1,2\n2,1"

    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
    result = parser.read_csv(StringIO(data), names=names, na_values=na_values)

    tm.assert_frame_equal(result, expected)
    tm.assert_dict_equal(na_values, na_values_copy)


def test_na_values_dict_col_index(all_parsers):
    # see gh-14203
    data = "a\nfoo\n1"
    parser = all_parsers
    na_values = {0: "foo"}

    result = parser.read_csv(StringIO(data), na_values=na_values)
    expected = DataFrame({"a": [np.nan, 1]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            str(2 ** 63) + "\n" + str(2 ** 63 + 1),
            dict(na_values=[2 ** 63]),
            DataFrame([str(2 ** 63), str(2 ** 63 + 1)]),
        ),
        (str(2 ** 63) + ",1" + "\n,2", dict(), DataFrame([[str(2 ** 63), 1], ["", 2]])),
        (str(2 ** 63) + "\n1", dict(na_values=[2 ** 63]), DataFrame([np.nan, 1])),
    ],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected):
    # see gh-14983
    parser = all_parsers
    result = parser.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, expected)


def test_empty_na_values_no_default_with_index(all_parsers):
    # see gh-15835
    data = "a,1\nb,2"
    parser = all_parsers
    expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))

    result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
    # see gh-5239
    #
    # Don't parse NA-values in index unless na_filter=True
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b"))
    result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter)
    tm.assert_frame_equal(result, expected)


def test_inf_na_values_with_int_index(all_parsers):
    # see gh-17128
    parser = all_parsers
    data = "idx,col1,col2\n1,3,4\n2,inf,-inf"

    # Don't fail with OverflowError with inf's and integer index column.
    out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"])
    expected = DataFrame(
        {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
    )
    tm.assert_frame_equal(out, expected)


@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    # see gh-20377
    parser = all_parsers
    data = "a,b,c\n1,,3\n4,5,6"

    # na_filter=True --> missing value becomes NaN.
    # na_filter=False --> missing value remains empty string.
    empty = np.nan if na_filter else ""
    expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]})

    result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data, na_values",
    [
        ("false,1\n,1\ntrue", None),
        ("false,1\nnull,1\ntrue", None),
        ("false,1\nnan,1\ntrue", None),
        ("false,1\nfoo,1\ntrue", "foo"),
        ("false,1\nfoo,1\ntrue", ["foo"]),
        ("false,1\nfoo,1\ntrue", {"a": "foo"}),
    ],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    parser = all_parsers
    msg = (
        "(Bool column has NA values in column [0a])|"
        "(cannot safely convert passed user dtype of "
        "bool for object dtyped data in column 0)"
    )
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            StringIO(data),
            header=None,
            names=["a", "b"],
            dtype={"a": "bool"},
            na_values=na_values,
        )
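

def _sketch_na_filter():
    # Hedged sketch, not part of the original commit: na_filter=False turns
    # off NA detection entirely, so empty fields survive as empty strings,
    # mirroring test_na_values_with_dtype_str_and_na_filter above.
    from io import StringIO

    import pandas as pd

    frame = pd.read_csv(StringIO("a,b\n1,\n"), na_filter=False, dtype=str)
    assert frame["b"].iloc[0] == ""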
@@ -0,0 +1,205 @@
"""
|
||||
Tests parsers ability to read and parse non-local files
|
||||
and hence require a network connection to be read.
|
||||
"""
|
||||
from io import BytesIO, StringIO
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas.util.testing as tm
|
||||
|
||||
from pandas.io.parsers import read_csv
|
||||
|
||||
|
||||
@pytest.mark.network
|
||||
@pytest.mark.parametrize(
|
||||
"compress_type, extension",
|
||||
[("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")],
|
||||
)
|
||||
@pytest.mark.parametrize("mode", ["explicit", "infer"])
|
||||
@pytest.mark.parametrize("engine", ["python", "c"])
|
||||
def test_compressed_urls(salaries_table, compress_type, extension, mode, engine):
|
||||
check_compressed_urls(salaries_table, compress_type, extension, mode, engine)
|
||||
|
||||
|
||||
@tm.network
|
||||
def check_compressed_urls(salaries_table, compression, extension, mode, engine):
|
||||
# test reading compressed urls with various engines and
|
||||
# extension inference
|
||||
base_url = (
|
||||
"https://github.com/pandas-dev/pandas/raw/master/"
|
||||
"pandas/tests/io/parser/data/salaries.csv"
|
||||
)
|
||||
|
||||
url = base_url + extension
|
||||
|
||||
if mode != "explicit":
|
||||
compression = mode
|
||||
|
||||
url_table = read_csv(url, sep="\t", compression=compression, engine=engine)
|
||||
tm.assert_frame_equal(url_table, salaries_table)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tips_df(datapath):
|
||||
"""DataFrame with the tips dataset."""
|
||||
return read_csv(datapath("io", "parser", "data", "tips.csv"))
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("s3_resource")
|
||||
@td.skip_if_not_us_locale()
|
||||
class TestS3:
|
||||
def test_parse_public_s3_bucket(self, tips_df):
|
||||
pytest.importorskip("s3fs")
|
||||
|
||||
# more of an integration test due to the not-public contents portion
|
||||
# can probably mock this though.
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
# Read public file from bucket with not-public contents
|
||||
df = read_csv("s3://cant_get_it/tips.csv")
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3n_bucket(self, tips_df):
|
||||
|
||||
# Read from AWS s3 as "s3n" URL
|
||||
df = read_csv("s3n://pandas-test/tips.csv", nrows=10)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3a_bucket(self, tips_df):
|
||||
# Read from AWS s3 as "s3a" URL
|
||||
df = read_csv("s3a://pandas-test/tips.csv", nrows=10)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows(self, tips_df):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked(self, tips_df):
|
||||
# Read with a chunksize
|
||||
chunksize = 5
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df_reader = read_csv(
|
||||
"s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp
|
||||
)
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them
|
||||
# properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
|
||||
# Read with a chunksize using the Python parser
|
||||
chunksize = 5
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df_reader = read_csv(
|
||||
"s3://pandas-test/tips.csv" + ext,
|
||||
chunksize=chunksize,
|
||||
compression=comp,
|
||||
engine="python",
|
||||
)
|
||||
assert df_reader.chunksize == chunksize
|
||||
for i_chunk in [0, 1, 2]:
|
||||
# Read a couple of chunks and make sure we see them properly.
|
||||
df = df_reader.get_chunk()
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
|
||||
tm.assert_frame_equal(true_df, df)
|
||||
|
||||
def test_parse_public_s3_bucket_python(self, tips_df):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
"s3://pandas-test/tips.csv" + ext, engine="python", compression=comp
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_infer_s3_compression(self, tips_df):
|
||||
for ext in ["", ".gz", ".bz2"]:
|
||||
df = read_csv(
|
||||
"s3://pandas-test/tips.csv" + ext, engine="python", compression="infer"
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(df, tips_df)
|
||||
|
||||
def test_parse_public_s3_bucket_nrows_python(self, tips_df):
|
||||
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
|
||||
df = read_csv(
|
||||
"s3://pandas-test/tips.csv" + ext,
|
||||
engine="python",
|
||||
nrows=10,
|
||||
compression=comp,
|
||||
)
|
||||
assert isinstance(df, DataFrame)
|
||||
assert not df.empty
|
||||
tm.assert_frame_equal(tips_df.iloc[:10], df)
|
||||
|
||||
def test_s3_fails(self):
|
||||
with pytest.raises(IOError):
|
||||
read_csv("s3://nyqpug/asdf.csv")
|
||||
|
||||
# Receive a permission error when trying to read a private bucket.
|
||||
# It's irrelevant here that this isn't actually a table.
|
||||
with pytest.raises(IOError):
|
||||
read_csv("s3://cant_get_it/")
|
||||
|
||||
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
|
||||
# see gh-16135
|
||||
|
||||
s3_object = s3_resource.meta.client.get_object(
|
||||
Bucket="pandas-test", Key="tips.csv"
|
||||
)
|
||||
|
||||
result = read_csv(BytesIO(s3_object["Body"].read()), encoding="utf8")
|
||||
assert isinstance(result, DataFrame)
|
||||
assert not result.empty
|
||||
|
||||
expected = read_csv(tips_file)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_read_csv_chunked_download(self, s3_resource, caplog):
|
||||
# 8 MB, S3FS usees 5MB chunks
|
||||
df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
|
||||
buf = BytesIO()
|
||||
str_buf = StringIO()
|
||||
|
||||
df.to_csv(str_buf)
|
||||
|
||||
buf = BytesIO(str_buf.getvalue().encode("utf-8"))
|
||||
|
||||
s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)
|
||||
|
||||
with caplog.at_level(logging.DEBUG, logger="s3fs.core"):
|
||||
read_csv("s3://pandas-test/large-file.csv", nrows=5)
|
||||
# log of fetch_range (start, stop)
|
||||
assert (0, 5505024) in {x.args[-2:] for x in caplog.records}
|
||||
|
||||
def test_read_s3_with_hash_in_key(self, tips_df):
|
||||
# GH 25945
|
||||
result = read_csv("s3://pandas-test/tips#1.csv")
|
||||
tm.assert_frame_equal(tips_df, result)
|
||||
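

def _sketch_chunked_read():
    # Hedged sketch, not part of the original commit: chunksize= turns
    # read_csv into an iterator of DataFrame chunks, which is what the
    # chunked S3 tests above rely on; it works on any file-like source.
    from io import StringIO

    import pandas as pd

    chunks = list(pd.read_csv(StringIO("a\n1\n2\n3\n4\n"), chunksize=2))
    assert [len(c) for c in chunks] == [2, 2]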
File diff suppressed because it is too large
@@ -0,0 +1,298 @@
"""
|
||||
Tests that apply specifically to the Python parser. Unless specifically
|
||||
stated as a Python-specific issue, the goal is to eventually move as many of
|
||||
these tests out of this module as soon as the C parser can accept further
|
||||
arguments when parsing.
|
||||
"""
|
||||
|
||||
import csv
|
||||
from io import BytesIO, StringIO
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.errors import ParserError
|
||||
|
||||
from pandas import DataFrame, Index, MultiIndex
|
||||
import pandas.util.testing as tm
|
||||
|
||||
|
||||
def test_default_separator(python_parser_only):
|
||||
# see gh-17333
|
||||
#
|
||||
# csv.Sniffer in Python treats "o" as separator.
|
||||
data = "aob\n1o2\n3o4"
|
||||
parser = python_parser_only
|
||||
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
|
||||
|
||||
result = parser.read_csv(StringIO(data), sep=None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
|
||||
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
|
||||
# see gh-15925 (comment)
|
||||
data = "a\n1\n2"
|
||||
parser = python_parser_only
|
||||
msg = "skipfooter must be an integer"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=skipfooter)
|
||||
|
||||
|
||||
def test_invalid_skipfooter_negative(python_parser_only):
|
||||
# see gh-15925 (comment)
|
||||
data = "a\n1\n2"
|
||||
parser = python_parser_only
|
||||
msg = "skipfooter cannot be negative"
|
||||
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
parser.read_csv(StringIO(data), skipfooter=-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")])
|
||||
def test_sniff_delimiter(python_parser_only, kwargs):
|
||||
data = """index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", [None, "utf-8"])
|
||||
def test_sniff_delimiter_encoding(python_parser_only, encoding):
|
||||
parser = python_parser_only
|
||||
data = """ignore this
|
||||
ignore this too
|
||||
index|A|B|C
|
||||
foo|1|2|3
|
||||
bar|4|5|6
|
||||
baz|7|8|9
|
||||
"""
|
||||
|
||||
if encoding is not None:
|
||||
from io import TextIOWrapper
|
||||
|
||||
data = data.encode(encoding)
|
||||
data = BytesIO(data)
|
||||
data = TextIOWrapper(data, encoding=encoding)
|
||||
else:
|
||||
data = StringIO(data)
|
||||
|
||||
result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
|
||||
expected = DataFrame(
|
||||
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
||||
columns=["A", "B", "C"],
|
||||
index=Index(["foo", "bar", "baz"], name="index"),
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_single_line(python_parser_only):
|
||||
# see gh-6607: sniff separator
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
|
||||
|
||||
expected = DataFrame({"a": [1], "b": [2]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)])
|
||||
def test_skipfooter(python_parser_only, kwargs):
|
||||
# see gh-6607
|
||||
data = """A,B,C
|
||||
1,2,3
|
||||
4,5,6
|
||||
7,8,9
|
||||
want to skip this
|
||||
also also skip this
|
||||
"""
|
||||
parser = python_parser_only
|
||||
result = parser.read_csv(StringIO(data), **kwargs)
|
||||
|
||||
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
|
||||
)
|
||||
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
|
||||
# see gh-6607
|
||||
parser = python_parser_only
|
||||
|
||||
with open(csv1, "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
data = data.replace(b",", b"::")
|
||||
expected = parser.read_csv(csv1)
|
||||
|
||||
module = pytest.importorskip(compression)
|
||||
klass = getattr(module, klass)
|
||||
|
||||
with tm.ensure_clean() as path:
|
||||
tmp = klass(path, mode="wb")
|
||||
tmp.write(data)
|
||||
tmp.close()
|
||||
|
||||
result = parser.read_csv(path, sep="::", compression=compression)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
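

def _regex_sep_sketch():
    # A minimal illustration (not part of the original suite): separators
    # longer than one character, such as "::", are treated as regular
    # expressions and are only supported by the Python engine.
    from pandas import read_csv

    df = read_csv(StringIO("a::b\n1::2"), sep="::", engine="python")
    assert list(df.columns) == ["a", "b"]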


def test_read_csv_buglet_4x_multi_index(python_parser_only):
    # see gh-6607
    data = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
    parser = python_parser_only

    expected = DataFrame(
        [
            [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
            [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
            [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=MultiIndex.from_tuples(
            [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
            names=["one", "two", "three", "four"],
        ),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    # see gh-6893
    data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
    parser = python_parser_only

    expected = DataFrame.from_records(
        [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
        columns=list("abcABC"),
        index=list("abc"),
    )
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    # see gh-6971
    data = "1#2\n3#4"
    parser = python_parser_only
    expected = DataFrame({"a": [1.2, 3.4]})

    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        kwargs = dict(skipfooter=1)
        data += "\nFooter"
    else:
        kwargs = dict()

    result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
    tm.assert_frame_equal(result, expected)
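

def _decimal_sketch():
    # A minimal illustration (not part of the original suite): "decimal"
    # swaps the radix character, so "1#2" parses as the float 1.2.
    from pandas import read_csv

    df = read_csv(StringIO("1#2\n3#4"), names=["a"], decimal="#", engine="python")
    assert df["a"].tolist() == [1.2, 3.4]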


@pytest.mark.parametrize(
    "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
    "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(
        BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    # see gh-13374
    kwargs = dict(sep=",,")
    parser = python_parser_only

    data = 'a,,b\n1,,a\n2,,"2,,b"'
    msg = "ignored when a multi-char delimiter is used"

    def fail_read():
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), quoting=quoting, **kwargs)

    if quoting == csv.QUOTE_NONE:
        # We expect no match, so there should be an assertion
        # error out of the inner context manager.
        with pytest.raises(AssertionError):
            fail_read()
    else:
        fail_read()


def test_none_delimiter(python_parser_only, capsys):
    # see gh-13374 and gh-17465
    parser = python_parser_only
    data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
    expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})

    # We expect the third line in the data to be
    # skipped because it is malformed, but we do
    # not expect any errors to occur.
    result = parser.read_csv(
        StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False
    )
    tm.assert_frame_equal(result, expected)

    captured = capsys.readouterr()
    assert "Skipping line 3" in captured.err


@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    # see gh-13879 and gh-15910
    msg = "parsing errors in the skipped footer rows"
    parser = python_parser_only

    def fail_read():
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)

    if skipfooter:
        fail_read()
    else:
        # We expect no match, so there should be an assertion
        # error out of the inner context manager.
        with pytest.raises(AssertionError):
            fail_read()


def test_malformed_skipfooter(python_parser_only):
    parser = python_parser_only
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    msg = "Expected 3 fields in line 4, saw 5"
    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
@@ -0,0 +1,159 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""

import csv
from io import StringIO

import pytest

from pandas.errors import ParserError

from pandas import DataFrame
import pandas.util.testing as tm


@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'),
        (
            dict(quotechar=None, quoting=csv.QUOTE_MINIMAL),
            "quotechar must be set if quoting enabled",
        ),
        (dict(quotechar=2), '"quotechar" must be string, not int'),
    ],
)
def test_bad_quote_char(all_parsers, kwargs, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), **kwargs)


@pytest.mark.parametrize(
    "quoting,msg",
    [
        ("foo", '"quoting" must be an integer'),
        (5, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
    ],
)
def test_bad_quoting(all_parsers, quoting, msg):
    data = "1,2,3"
    parser = all_parsers

    with pytest.raises(TypeError, match=msg):
        parser.read_csv(StringIO(data), quoting=quoting)
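

def _quoting_constants_sketch():
    # A minimal illustration (not part of the original suite): the csv
    # module's quoting constants are the integers 0 through 3, which is
    # the range the 'bad "quoting" value' check above enforces.
    constants = (
        csv.QUOTE_MINIMAL,
        csv.QUOTE_ALL,
        csv.QUOTE_NONNUMERIC,
        csv.QUOTE_NONE,
    )
    assert constants == (0, 1, 2, 3)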


def test_quote_char_basic(all_parsers):
    parser = all_parsers
    data = 'a,b,c\n1,2,"cat"'
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    result = parser.read_csv(StringIO(data), quotechar='"')
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    parser = all_parsers
    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])

    data = 'a,b,c\n1,2,"cat"'
    new_data = data.replace('"', quote_char)

    result = parser.read_csv(StringIO(new_data), quotechar=quote_char)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    kwargs = dict(quotechar=quote_char, quoting=quoting)
    data = "a,b,c\n1,2,3"
    parser = all_parsers

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        msg = "quotechar must be set if quoting enabled"

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,exp_data",
    [
        (dict(), [[1, 2, "foo"]]),  # Test default.
        # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
        (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]),
        # QUOTE_ALL only applies to CSV writing, so no effect on reading.
        (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]),
        # QUOTE_NONE tells the reader to do no special handling
        # of quote characters and leave them alone.
        (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]),
        # QUOTE_NONNUMERIC tells the reader to cast
        # all non-quoted fields to float
        (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]),
    ],
)
def test_quoting_various(all_parsers, kwargs, exp_data):
    data = '1,2,"foo"'
    parser = all_parsers
    columns = ["a", "b", "c"]

    result = parser.read_csv(StringIO(data), names=columns, **kwargs)
    expected = DataFrame(exp_data, columns=columns)
    tm.assert_frame_equal(result, expected)
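

def _quote_nonnumeric_sketch():
    # A minimal illustration (not part of the original suite): the stdlib
    # csv reader exhibits the same QUOTE_NONNUMERIC behavior mirrored in
    # the test above, returning unquoted fields as floats.
    rows = list(csv.reader(StringIO('1,2,"foo"'), quoting=csv.QUOTE_NONNUMERIC))
    assert rows == [[1.0, 2.0, "foo"]]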


@pytest.mark.parametrize(
    "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data):
    parser = all_parsers
    data = 'a,b\n3,"4 "" 5"'

    result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote)
    expected = DataFrame(exp_data, columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
    # see gh-14477
    data = "a\n1"
    parser = all_parsers
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), quotechar=quotechar)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
    # see gh-22789.
    parser = all_parsers
    data = 'a,b,c\n1,2,"3'

    if balanced:
        # Re-balance the quoting and read in without errors.
        expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
        result = parser.read_csv(StringIO(data + '"'))
        tm.assert_frame_equal(result, expected)
    else:
        msg = (
            "EOF inside string starting at row 1"
            if parser.engine == "c"
            else "unexpected end of data"
        )

        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data))
@@ -0,0 +1,618 @@
"""
Tests the 'read_fwf' function in parsers.py. This
test suite is independent of the others because the
engine is set to 'python-fwf' internally.
"""

from datetime import datetime
from io import BytesIO, StringIO

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, DatetimeIndex
import pandas.util.testing as tm

from pandas.io.parsers import EmptyDataError, read_csv, read_fwf


def test_basic():
    data = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data))
    expected = DataFrame(
        [
            [201158, 360.242940, 149.910199, 11950.7],
            [201159, 444.953632, 166.985655, 11788.4],
            [201160, 364.136849, 183.628767, 11806.2],
            [201161, 413.836124, 184.375703, 11916.8],
            [201162, 502.953953, 173.237159, 12468.3],
        ],
        columns=["A", "B", "C", "D"],
    )
    tm.assert_frame_equal(result, expected)


def test_colspecs():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs)

    expected = DataFrame(
        [
            [2011, 58, 360.242940, 149.910199, 11950.7],
            [2011, 59, 444.953632, 166.985655, 11788.4],
            [2011, 60, 364.136849, 183.628767, 11806.2],
            [2011, 61, 413.836124, 184.375703, 11916.8],
            [2011, 62, 502.953953, 173.237159, 12468.3],
        ],
        columns=["A", "B", "C", "D", "E"],
    )
    tm.assert_frame_equal(result, expected)
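

def _colspecs_sketch():
    # A minimal illustration (not part of the original suite): each
    # colspec is a half-open [start, stop) interval of character
    # positions, exactly like Python slicing.
    line = "201158    360.242940"
    colspecs = [(0, 4), (4, 8), (8, 20)]
    assert [line[start:stop].strip() for start, stop in colspecs] == [
        "2011",
        "58",
        "360.242940",
    ]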


def test_widths():
    data = """\
A    B    C            D            E
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])

    expected = DataFrame(
        [
            [2011, 58, 360.242940, 149.910199, 11950.7],
            [2011, 59, 444.953632, 166.985655, 11788.4],
            [2011, 60, 364.136849, 183.628767, 11806.2],
            [2011, 61, 413.836124, 184.375703, 11916.8],
            [2011, 62, 502.953953, 173.237159, 12468.3],
        ],
        columns=["A", "B", "C", "D", "E"],
    )
    tm.assert_frame_equal(result, expected)


def test_non_space_filler():
    # From Thomas Kluyver:
    #
    # Apparently, some non-space filler characters can be seen; this is
    # supported by specifying the 'delimiter' character:
    #
    # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
    data = """\
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")

    expected = DataFrame(
        [
            [2011, 58, 360.242940, 149.910199, 11950.7],
            [2011, 59, 444.953632, 166.985655, 11788.4],
            [2011, 60, 364.136849, 183.628767, 11806.2],
            [2011, 61, 413.836124, 184.375703, 11916.8],
            [2011, 62, 502.953953, 173.237159, 12468.3],
        ],
        columns=["A", "B", "C", "D", "E"],
    )
    tm.assert_frame_equal(result, expected)


def test_over_specified():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]

    with pytest.raises(ValueError, match="must specify only one of"):
        read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])


def test_under_specified():
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    with pytest.raises(ValueError, match="Must specify either"):
        read_fwf(StringIO(data), colspecs=None, widths=None)


def test_read_csv_compat():
    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
    tm.assert_frame_equal(result, expected)


def test_bytes_io_input():
    result = read_fwf(
        BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8"
    )
    expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(result, expected)


def test_fwf_colspecs_is_list_or_tuple():
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    msg = "column specifications must be a list or tuple.+"

    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")


def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    msg = "Each column specification must be.+"

    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), [("a", 1)])


@pytest.mark.parametrize(
    "colspecs,exp_data",
    [
        ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
        ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
        ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
        ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
    ],
)
def test_fwf_colspecs_none(colspecs, exp_data):
    # see gh-7079
    data = """\
123456
456789
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "infer_nrows,exp_data",
    [
        # infer_nrows --> colspec == [(2, 3), (5, 6)]
        (1, [[1, 2], [3, 8]]),
        # infer_nrows > number of rows
        (10, [[1, 2], [123, 98]]),
    ],
)
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
    # see gh-15138
    data = """\
  1  2
123 98
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
    tm.assert_frame_equal(result, expected)
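

def _infer_nrows_sketch():
    # A minimal illustration (not part of the original suite): with
    # infer_nrows=1 only the first data row is inspected, so the inferred
    # colspecs are the single-character slices noted above and the wider
    # second row "123 98" is truncated to [3, 8].
    row_two = "123 98"
    assert [row_two[2:3], row_two[5:6]] == ["3", "8"]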


def test_fwf_regression():
    # see gh-3594
    #
    # Turns out "T060" is parsable as a datetime slice!
    tz_list = [1, 10, 20, 30, 60, 80, 100]
    widths = [16] + [8] * len(tz_list)
    names = ["SST"] + ["T{z:03d}".format(z=z) for z in tz_list[1:]]

    data = """\
2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

    result = read_fwf(
        StringIO(data),
        index_col=0,
        header=None,
        names=names,
        widths=widths,
        parse_dates=True,
        date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"),
    )
    expected = DataFrame(
        [
            [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
            [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
            [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
            [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
            [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
        ],
        index=DatetimeIndex(
            [
                "2009-06-13 20:20:00",
                "2009-06-13 20:30:00",
                "2009-06-13 20:40:00",
                "2009-06-13 20:50:00",
                "2009-06-13 21:00:00",
            ]
        ),
        columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"],
    )
    tm.assert_frame_equal(result, expected)
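

def _julian_format_sketch():
    # A minimal illustration (not part of the original suite): under the
    # "%Y%j%H%M%S" format used above, "2009164202000" is year 2009,
    # day-of-year 164 (June 13) at 20:20:00.
    assert datetime.strptime("2009164202000", "%Y%j%H%M%S") == datetime(
        2009, 6, 13, 20, 20
    )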


def test_fwf_for_uint8():
    data = """1421302965.213420    PRI=3 PGN=0xef00      DST=0x17   SRC=0x28 04 154 00 00 00 00 00 127
1421302964.226776    PRI=6 PGN=0xf002                 SRC=0x47 243 00 00 255 247 00 00 71"""  # noqa
    df = read_fwf(
        StringIO(data),
        colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)],
        names=["time", "pri", "pgn", "dst", "src", "data"],
        converters={
            "pgn": lambda x: int(x, 16),
            "src": lambda x: int(x, 16),
            "dst": lambda x: int(x, 16),
            "data": lambda x: len(x.split(" ")),
        },
    )

    expected = DataFrame(
        [
            [1421302965.213420, 3, 61184, 23, 40, 8],
            [1421302964.226776, 6, 61442, None, 71, 8],
        ],
        columns=["time", "pri", "pgn", "dst", "src", "data"],
    )
    expected["dst"] = expected["dst"].astype(object)
    tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("comment", ["#", "~", "!"])
def test_fwf_comment(comment):
    data = """\
  1   2.   4  #hello world
  5  NaN  10.0
"""
    data = data.replace("#", comment)

    colspecs = [(0, 3), (4, 9), (9, 25)]
    expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]])

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment)
    tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("thousands", [",", "#", "~"])
def test_fwf_thousands(thousands):
    data = """\
 1  2,334.0  5
10   13     10.
"""
    data = data.replace(",", thousands)

    colspecs = [(0, 3), (3, 11), (12, 16)]
    expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]])

    result = read_fwf(
        StringIO(data), header=None, colspecs=colspecs, thousands=thousands
    )
    tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(header):
    # see gh-6114
    data = """\
MyColumn
a
b
a
b"""

    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), header=header)


def test_full_file():
    # File with all values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
2000-01-05T00:00:00  0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00  0.487094399463    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
2000-01-11T00:00:00  0.157160753327   34  foo"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_missing():
    # File with missing values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
                     0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00                    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
                                      34"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_spaces():
    # File with spaces in columns.
    test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101     Keanu Reeves          9315.45     10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868     Jennifer Love Hewitt  0           17000.00           5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65      5000.00             2/5/2007
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_full_file_with_spaces_and_missing():
    # File with spaces and missing values in columns.
    test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101                                       10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868                                                          5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_messed_up_data():
    # Completely messed up file.
    test = """
   Account          Name             Balance     Credit Limit   Account Created
       101                           10000.00                       1/17/1998
       312     Gerard Butler         90.00       1000.00

       761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
  317          Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)


def test_multiple_delimiters():
    test = r"""
col1~~~~~col2  col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
~~55      11+++foo+++++Jada Pinkett-Smith
..66++++++.03~~~bar           Bill Murray
""".strip("\r\n")
    delimiter = " +~.\\"
    colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
    expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)

    result = read_fwf(StringIO(test), delimiter=delimiter)
    tm.assert_frame_equal(result, expected)


def test_variable_width_unicode():
    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(
        BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs
    )
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [dict(), {"a": "float64", "b": str, "c": "int32"}])
def test_dtype(dtype):
    data = """ a    b    c
1    2    3.2
3    4    5.2
"""
    colspecs = [(0, 5), (5, 10), (10, None)]
    result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)

    expected = pd.DataFrame(
        {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"]
    )

    for col, dt in dtype.items():
        expected[col] = expected[col].astype(dt)

    tm.assert_frame_equal(result, expected)


def test_skiprows_inference():
    # see gh-11256
    data = """
Text contained in the file header

DataCol1 DataCol2
     0.0      1.0
   101.6    956.1
""".strip()
    skiprows = 2
    expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)


def test_skiprows_by_index_inference():
    data = """
To be skipped
Not  To Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()
    skiprows = [0, 2]
    expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)


def test_skiprows_inference_empty():
    data = """
AA   BBB  C
12   345  6
78   901  2
""".strip()

    msg = "No rows from which to infer column width"
    with pytest.raises(EmptyDataError, match=msg):
        read_fwf(StringIO(data), skiprows=3)


def test_whitespace_preservation():
    # see gh-16772
    header = None
    csv_data = """
 a ,bbb
 cc,dd """

    fwf_data = """
 a bbb
 ccdd """
    result = read_fwf(
        StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t"
    )
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)


def test_default_delimiter():
    header = None
    csv_data = """
a,bbb
cc,dd"""

    fwf_data = """
a \tbbb
cc\tdd """
    result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0])
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("infer", [True, False, None])
def test_fwf_compression(compression_only, infer):
    data = """1111111111
2222222222
3333333333""".strip()

    compression = compression_only
    extension = "gz" if compression == "gzip" else compression

    kwargs = dict(widths=[5, 5], names=["one", "two"])
    expected = read_fwf(StringIO(data), **kwargs)

    data = bytes(data, encoding="utf-8")

    with tm.ensure_clean(filename="tmp." + extension) as path:
        tm.write_to_compressed(compression, path, data)

        if infer is not None:
            kwargs["compression"] = "infer" if infer else compression

        result = read_fwf(path, **kwargs)
        tm.assert_frame_equal(result, expected)
@@ -0,0 +1,252 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""

from datetime import datetime
from io import StringIO

import numpy as np
import pytest

from pandas.errors import EmptyDataError

from pandas import DataFrame, Index
import pandas.util.testing as tm


@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
    # see gh-505
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    result = parser.read_csv(
        StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(result, expected)


def test_deep_skip_rows(all_parsers):
    # see gh-4382
    parser = all_parsers
    data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
    )
    condensed_data = "a,b,c\n" + "\n".join(
        [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
    )

    result = parser.read_csv(StringIO(data), skiprows=[6, 8])
    condensed_result = parser.read_csv(StringIO(condensed_data))
    tm.assert_frame_equal(result, condensed_result)


def test_skip_rows_blank(all_parsers):
    # see gh-9832
    parser = all_parsers
    text = """#foo,a,b,c
#foo,a,b,c

#foo,a,b,c
#foo,a,b,c

1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
    data = parser.read_csv(
        StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
    )
    index = Index(
        [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
    )

    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
    )
    tm.assert_frame_equal(data, expected)


@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
            dict(skiprows=[1]),
            DataFrame(
                [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
                columns=["id", "text", "num_lines"],
            ),
        ),
        (
            "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
            dict(quotechar="~", skiprows=[2]),
            DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
        ),
        (
            (
                "Text,url\n~example\n "
                "sentence\n one~,url1\n~"
                "example\n sentence\n two~,url2\n~"
                "example\n sentence\n three~,url3"
            ),
            dict(quotechar="~", skiprows=[1, 3]),
            DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
        ),
    ],
)
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


def test_skip_row_with_quote(all_parsers):
    # see gh-12775 and gh-10911
    parser = all_parsers
    data = """id,text,num_lines
1,"line '11' line 12",2
2,"line '21' line 22",2
3,"line '31' line 32",1"""

    exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])

    result = parser.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            """id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
            [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
            [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
            [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
        ),
    ],
)
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
    # see gh-12775 and gh-10911
    parser = all_parsers
    result = parser.read_csv(StringIO(data), skiprows=[1])

    expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("line_terminator", ["\n", "\r\n", "\r"])  # LF, CRLF, CR
def test_skiprows_lineterminator(all_parsers, line_terminator):
    # see gh-9079
    parser = all_parsers
    data = "\n".join(
        [
            "SMOSMANIA ThetaProbe-ML2X ",
            "2007/01/01 01:00 0.2140 U M ",
            "2007/01/01 02:00 0.2141 M O ",
            "2007/01/01 04:00 0.2142 D M ",
        ]
    )
    expected = DataFrame(
        [
            ["2007/01/01", "01:00", 0.2140, "U", "M"],
            ["2007/01/01", "02:00", 0.2141, "M", "O"],
            ["2007/01/01", "04:00", 0.2142, "D", "M"],
        ],
        columns=["date", "time", "var", "flag", "oflag"],
    )

    if parser.engine == "python" and line_terminator == "\r":
        pytest.skip("'CR' line terminator is not respected by the Python parser yet")

    data = data.replace("\n", line_terminator)
    result = parser.read_csv(
        StringIO(data),
        skiprows=1,
        delim_whitespace=True,
        names=["date", "time", "var", "flag", "oflag"],
    )
    tm.assert_frame_equal(result, expected)


def test_skiprows_infield_quote(all_parsers):
    # see gh-14459
    parser = all_parsers
    data = 'a"\nb"\na\n1'
    expected = DataFrame({"a": [1]})

    result = parser.read_csv(StringIO(data), skiprows=2)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "kwargs,expected",
    [
        (dict(), DataFrame({"1": [3, 5]})),
        (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})),
    ],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
    tm.assert_frame_equal(result, expected)
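

def _skiprows_callable_sketch():
    # A minimal illustration (not part of the original suite): the
    # callable receives each row index and a True return drops that row,
    # so rows 0 ("a"), 2 and 4 vanish, row 1 becomes the header "1", and
    # rows 3 and 5 remain as the data.
    kept = [x for x in range(6) if not x % 2 == 0]
    assert kept == [1, 3, 5]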


def test_skip_rows_skip_all(all_parsers):
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"
    msg = "No columns to parse from file"

    with pytest.raises(EmptyDataError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: True)


def test_skip_rows_bad_callable(all_parsers):
    msg = "by zero"
    parser = all_parsers
    data = "a\n1\n2\n3\n4\n5"

    with pytest.raises(ZeroDivisionError, match=msg):
        parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
@@ -0,0 +1,344 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import BytesIO, StringIO
import os

import numpy as np
from numpy import nan
import pytest

import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader

from pandas import DataFrame
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal

from pandas.io.parsers import TextFileReader, read_csv


class TestTextReader:
    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "parser", "data")
        self.csv1 = os.path.join(self.dirpath, "test1.csv")
        self.csv2 = os.path.join(self.dirpath, "test2.csv")
        self.xls1 = os.path.join(self.dirpath, "test.xls")

    def test_file_handle(self):
        with open(self.csv1, "rb") as f:
            reader = TextReader(f)
            reader.read()

    def test_string_filename(self):
        reader = TextReader(self.csv1, header=None)
        reader.read()

    def test_file_handle_mmap(self):
        with open(self.csv1, "rb") as f:
            reader = TextReader(f, memory_map=True, header=None)
            reader.read()

    def test_StringIO(self):
        with open(self.csv1, "rb") as f:
            text = f.read()
        src = BytesIO(text)
        reader = TextReader(src, header=None)
        reader.read()

    def test_string_factorize(self):
        # should this be optional?
        data = "a\nb\na\nb\na"
        reader = TextReader(StringIO(data), header=None)
        result = reader.read()
        assert len(set(map(id, result[0]))) == 2

    def test_skipinitialspace(self):
        data = "a, b\na, b\na, b\na, b"

        reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(
            result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
        )
        tm.assert_numpy_array_equal(
            result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
        )

    def test_parse_booleans(self):
        data = "True\nFalse\nTrue\nTrue"

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        assert result[0].dtype == np.bool_

    def test_delimit_whitespace(self):
        data = 'a b\na\t\t "b"\n"a"\t \t b'

        reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
        result = reader.read()

        tm.assert_numpy_array_equal(
            result[0], np.array(["a", "a", "a"], dtype=np.object_)
        )
        tm.assert_numpy_array_equal(
            result[1], np.array(["b", "b", "b"], dtype=np.object_)
        )

    def test_embedded_newline(self):
        data = 'a\n"hello\nthere"\nthis'

        reader = TextReader(StringIO(data), header=None)
        result = reader.read()

        expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
        tm.assert_numpy_array_equal(result[0], expected)

    def test_euro_decimal(self):
        data = "12345,67\n345,678"

        reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
        result = reader.read()

        expected = np.array([12345.67, 345.678])
        tm.assert_almost_equal(result[0], expected)
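
    def _decimal_comma_sketch(self):
        # A minimal illustration (not collected by pytest, since the name
        # lacks a "test" prefix): with decimal="," the tokenizer treats
        # the comma as a radix point, so "12345,67" means 12345.67.
        assert float("12345,67".replace(",", ".")) == 12345.67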

    def test_integer_thousands(self):
        data = "123,456\n12,500"

        reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
        result = reader.read()

        expected = np.array([123456, 12500], dtype=np.int64)
        tm.assert_almost_equal(result[0], expected)

    def test_integer_thousands_alt(self):
        data = "123.456\n12.500"

        reader = TextFileReader(
            StringIO(data), delimiter=":", thousands=".", header=None
        )
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

    def test_skip_bad_lines(self, capsys):
        # too many lines, see #2430 for why
        data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

        reader = TextReader(StringIO(data), delimiter=":", header=None)
        msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
        with pytest.raises(parser.ParserError, match=msg):
            reader.read()

        reader = TextReader(
            StringIO(data),
            delimiter=":",
            header=None,
            error_bad_lines=False,
            warn_bad_lines=False,
        )
        result = reader.read()
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        reader = TextReader(
            StringIO(data),
            delimiter=":",
            header=None,
            error_bad_lines=False,
            warn_bad_lines=True,
        )
        reader.read()
        captured = capsys.readouterr()

        assert "Skipping line 4" in captured.err
        assert "Skipping line 6" in captured.err

    def test_header_not_enough_lines(self):
        data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

        reader = TextReader(StringIO(data), delimiter=",", header=2)
        header = reader.header
        expected = [["a", "b", "c"]]
        assert header == expected

        recs = reader.read()
        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(recs, expected)

    def test_escapechar(self):
        data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"'

        reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
        result = reader.read()
        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(result, expected)

    def test_eof_has_eol(self):
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        pass

    def test_numpy_string_dtype(self):
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

        reader = _make_reader(dtype="S5,i4")
        result = reader.read()

        assert result[0].dtype == "S5"

        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "i4"

        reader = _make_reader(dtype="S4")
        result = reader.read()
        assert result[0].dtype == "S4"
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
        assert (result[0] == ex_values).all()
        assert result[1].dtype == "S4"

    def test_pass_dtype(self):
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(dtype={"one": "u1", 1: "S1"})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "S1"

        reader = _make_reader(dtype={"one": np.uint8, 1: object})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

        reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
        result = reader.read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

    def test_usecols(self):
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        reader = _make_reader(usecols=(1, 2))
        result = reader.read()

        exp = _make_reader().read()
        assert len(result) == 2
        assert (result[1] == exp[1]).all()
        assert (result[2] == exp[2]).all()

    def test_cr_delimited(self):
        def _test(text, **kwargs):
            nice_text = text.replace("\r", "\r\n")
            result = TextReader(StringIO(text), **kwargs).read()
            expected = TextReader(StringIO(nice_text), **kwargs).read()
            assert_array_dicts_equal(result, expected)

        data = "a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12"
        _test(data, delimiter=",")

        data = "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12"
        _test(data, delim_whitespace=True)

        data = "a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12"
        _test(data, delimiter=",")

        sample = (
            "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
            "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
            ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
        )
        _test(sample, delimiter=",")

        data = "A B C\r 2 3\r4 5 6"
        _test(data, delim_whitespace=True)

        data = "A B C\r2 3\r4 5 6"
        _test(data, delim_whitespace=True)

    def test_empty_field_eof(self):
        data = "a,b,c\n1,2,3\n4,,"

        result = TextReader(StringIO(data), delimiter=",").read()

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([["b"], [nan]], columns=["a"], index=["a", "c"])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
        c = DataFrame(
            [[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        for _ in range(100):
            df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
            assert_frame_equal(df, a)

            df = read_csv(
                StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
            )
            assert_frame_equal(df, b)

            df = read_csv(
                StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
                names=list("abcd"),
                engine="c",
            )
            assert_frame_equal(df, c)

    def test_empty_csv_input(self):
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"])
        assert isinstance(df, TextFileReader)


def assert_array_dicts_equal(left, right):
    for k, v in left.items():
        tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))
@@ -0,0 +1,123 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO

import pytest

from pandas.errors import ParserError

import pandas.util.testing as tm

import pandas.io.parsers as parsers
from pandas.io.parsers import read_csv


@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    return request.param


class TestUnsupportedFeatures:
    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        data = "a b c\n1 2 3"
        msg = "is not supported"

        for engine in ("c", "python"):
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False)

    def test_c_engine(self):
        # see gh-6607
        data = "a b c\n1 2 3"
        msg = "does not support"

        # specify C engine with unsupported options (raise)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep=r"\s")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), engine="c", skipfooter=1)

        # specify C-unsupported options without python-unsupported options
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=None, delim_whitespace=False)
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep=r"\s")
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), sep="\t", quotechar=chr(128))
        with tm.assert_produces_warning(parsers.ParserWarning):
            read_csv(StringIO(data), skipfooter=1)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = "Error tokenizing data"

        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), sep="\\s+")
        with pytest.raises(ParserError, match=msg):
            read_csv(StringIO(text), engine="c", sep="\\s+")

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands=",,")
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), thousands="")

        msg = "Only length-1 line terminators supported"
        data = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator="~~")

    def test_python_engine(self, python_engine):
        from pandas.io.parsers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for default in py_unsupported:
            msg = (
                "The {default!r} option is not supported with the {python_engine!r}"
                " engine"
            ).format(default=default, python_engine=python_engine)

            kwargs = {default: object()}
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **kwargs)
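
    def _python_unsupported_sketch(self):
        # A minimal illustration (not collected by pytest; it assumes
        # low_memory is among the Python-engine-unsupported options in
        # this pandas version): requesting it raises ValueError.
        with pytest.raises(ValueError, match="low_memory"):
            read_csv(StringIO("a\n1"), engine="python", low_memory=object())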

    def test_python_engine_file_no_next(self, python_engine):
        # see gh-16530
        class NoNextBuffer:
            def __init__(self, csv_data):
                self.data = csv_data

            def __iter__(self):
                return self

            def read(self):
                return self.data

        data = "a\n1"
        msg = "The 'python' engine cannot iterate"

        with pytest.raises(ValueError, match=msg):
            read_csv(NoNextBuffer(data), engine=python_engine)
@@ -0,0 +1,572 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO

import numpy as np
import pytest

from pandas._libs.tslib import Timestamp

from pandas import DataFrame, Index
import pandas.util.testing as tm

_msg_validate_usecols_arg = (
    "'usecols' must either be list-like "
    "of all strings, all unicode, all "
    "integers or a callable."
)
_msg_validate_usecols_names = (
    "Usecols do not match columns, columns expected but not found: {0}"
)
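
# Illustrative note (a sketch, not from the original suite): both messages
# are raised as ValueError during argument validation, before any parsing
# happens. Hypothetical one-liners showing each failure mode:
#
#     read_csv(StringIO("a,b\n1,2"), usecols=[0, "b"])    # mixed types -> arg message
#     read_csv(StringIO("a,b\n1,2"), usecols=["a", "z"])  # unknown name -> names message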


def test_raise_on_mixed_dtype_usecols(all_parsers):
    # See gh-12678
    data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
    usecols = [0, "b", 2]
    parser = all_parsers

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols=usecols)


@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_names(all_parsers):
    data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    names = ["foo", "bar"]
    result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)

    expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
    tm.assert_frame_equal(result, expected)


def test_usecols_relative_to_names2(all_parsers):
    # see gh-5766
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), names=["a", "b"], header=None, usecols=[0, 1]
    )

    expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)
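
# Illustrative note (a sketch, not from the original suite): integer entries
# in usecols index positions in the raw file, not positions within ``names``.
# In the gh-5766 case above, names=["a", "b"] labels the two selected
# columns, usecols=[0, 1] picks file columns 0 and 1, and the third file
# column is simply dropped.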


def test_usecols_name_length_conflict(all_parsers):
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
    parser = all_parsers
    msg = (
        "Number of passed names did not match number of header fields in the file"
        if parser.engine == "python"
        else "Passed header names mismatches usecols"
    )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])


def test_usecols_single_string(all_parsers):
    # see gh-20558
    parser = all_parsers
    data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000"""

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        parser.read_csv(StringIO(data), usecols="foo")


@pytest.mark.parametrize(
    "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
    # see gh-9082
    parser = all_parsers
    usecols = ["a", "c", "d"]
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_col", ["b", 0])
|
||||
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
|
||||
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
|
||||
# see gh-4201: test that index_col as integer reflects usecols
|
||||
parser = all_parsers
|
||||
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
|
||||
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_index_col_conflict2(all_parsers):
|
||||
# see gh-4201: test that index_col as integer reflects usecols
|
||||
parser = all_parsers
|
||||
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
|
||||
|
||||
expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")})
|
||||
expected = expected.set_index(["b", "c"])
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
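
# Illustrative note (a sketch, not from the original suite): after usecols
# narrows the frame to columns b and c, the integer index_col=0 refers to the
# first *selected* column (b) rather than to file column a, which is why the
# index_col="b" and index_col=0 parametrizations above agree.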


def test_usecols_implicit_index_col(all_parsers):
    # see gh-2654
    parser = all_parsers
    data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

    result = parser.read_csv(StringIO(data), usecols=["a", "b"])
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_regex_sep(all_parsers):
    # see gh-2733
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b"))

    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


def test_usecols_with_whitespace(all_parsers):
    parser = all_parsers
    data = "a b c\n4 apple bat 5.7\n8 orange cow 10"

    result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b"))
    expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Column selection by index.
        ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
        # Column selection by name.
        (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])),
    ],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
    parser = all_parsers
    data = """2,0,1
1000,2000,3000
4000,5000,6000"""

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
|
||||
def test_usecols_with_parse_dates(all_parsers, usecols):
|
||||
# see gh-9755
|
||||
data = """a,b,c,d,e
|
||||
0,1,20140101,0900,4
|
||||
0,1,20140102,1000,4"""
|
||||
parser = all_parsers
|
||||
parse_dates = [[1, 2]]
|
||||
|
||||
cols = {
|
||||
"a": [0, 0],
|
||||
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["c_d", "a"])
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
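
# Illustrative note (a sketch, not from the original suite): with usecols
# active, the positions in parse_dates=[[1, 2]] resolve against the selected
# columns (a, c, d), so c and d are merged into a single "c_d" datetime
# column that pandas places ahead of the remaining columns -- hence the
# expected column order ["c_d", "a"].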


def test_usecols_with_parse_dates2(all_parsers):
    # see gh-13604
    parser = all_parsers
    data = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65"""

    names = ["date", "values"]
    usecols = names[:]
    parse_dates = [0]

    index = Index(
        [
            Timestamp("2008-02-07 09:40"),
            Timestamp("2008-02-07 09:50"),
            Timestamp("2008-02-07 10:00"),
        ],
        name="date",
    )
    cols = {"values": [1032.43, 1042.54, 1051.65]}
    expected = DataFrame(cols, index=index)

    result = parser.read_csv(
        StringIO(data),
        parse_dates=parse_dates,
        index_col=0,
        usecols=usecols,
        header=None,
        names=names,
    )
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates3(all_parsers):
    # see gh-14792
    parser = all_parsers
    data = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""

    usecols = list("abcdefghij")
    parse_dates = [0]

    cols = {
        "a": Timestamp("2016-09-21"),
        "b": [1],
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=usecols)

    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)


def test_usecols_with_parse_dates4(all_parsers):
    data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")
    parse_dates = [[0, 1]]
    parser = all_parsers

    cols = {
        "a_b": "2016/09/21 1",
        "c": [1],
        "d": [2],
        "e": [3],
        "f": [4],
        "g": [5],
        "h": [6],
        "i": [7],
        "j": [8],
    }
    expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))

    result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
|
||||
@pytest.mark.parametrize(
|
||||
"names",
|
||||
[
|
||||
list("abcde"), # Names span all columns in original data.
|
||||
list("acd"), # Names span only the selected columns.
|
||||
],
|
||||
)
|
||||
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
|
||||
# see gh-9755
|
||||
s = """0,1,20140101,0900,4
|
||||
0,1,20140102,1000,4"""
|
||||
parse_dates = [[1, 2]]
|
||||
parser = all_parsers
|
||||
|
||||
cols = {
|
||||
"a": [0, 0],
|
||||
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
|
||||
}
|
||||
expected = DataFrame(cols, columns=["c_d", "a"])
|
||||
|
||||
result = parser.read_csv(
|
||||
StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_unicode_strings(all_parsers):
|
||||
# see gh-13219
|
||||
data = """AAA,BBB,CCC,DDD
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
|
||||
"BBB": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_usecols_with_single_byte_unicode_strings(all_parsers):
|
||||
# see gh-13219
|
||||
data = """A,B,C,D
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
|
||||
"B": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=["A", "B"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
|
||||
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
|
||||
data = """AAA,BBB,CCC,DDD
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
|
||||
parser.read_csv(StringIO(data), usecols=usecols)
|
||||
|
||||
|
||||
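
# Illustrative note (a sketch, not from the original suite): under Python 3,
# byte strings such as b"BBB" are not instances of str, so mixing them with
# text strings fails the "all strings" validation above before any column
# matching is attempted.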
@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]])
|
||||
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
|
||||
data = """あああ,いい,ううう,ええええ
|
||||
0.056674973,8,True,a
|
||||
2.613230982,2,False,b
|
||||
3.568935038,7,False,a"""
|
||||
parser = all_parsers
|
||||
|
||||
exp_data = {
|
||||
"あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002},
|
||||
"いい": {0: 8, 1: 2, 2: 7},
|
||||
}
|
||||
expected = DataFrame(exp_data)
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_usecols(all_parsers):
|
||||
data = "a,b,c\n1,2,3\n4,5,6"
|
||||
expected = DataFrame()
|
||||
parser = all_parsers
|
||||
|
||||
result = parser.read_csv(StringIO(data), usecols=set())
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_np_array_usecols(all_parsers):
|
||||
# see gh-12546
|
||||
parser = all_parsers
|
||||
data = "a,b,c\n1,2,3"
|
||||
usecols = np.array(["a", "b"])
|
||||
|
||||
expected = DataFrame([[1, 2]], columns=usecols)
|
||||
result = parser.read_csv(StringIO(data), usecols=usecols)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
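
# Illustrative note (a sketch, not from the original suite): the two tests
# above show that validation accepts any list-like container -- list, tuple,
# set, or numpy array -- provided its elements are homogeneously strings or
# integers; an empty set selects no columns and yields an empty DataFrame.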


@pytest.mark.parametrize(
    "usecols,expected",
    [
        (
            lambda x: x.upper() in ["AAA", "BBB", "DDD"],
            DataFrame(
                {
                    "AaA": {
                        0: 0.056674972999999997,
                        1: 2.6132309819999997,
                        2: 3.5689350380000002,
                    },
                    "bBb": {0: 8, 1: 2, 2: 7},
                    "ddd": {0: "a", 1: "b", 2: "a"},
                }
            ),
        ),
        (lambda x: False, DataFrame()),
    ],
)
def test_callable_usecols(all_parsers, usecols, expected):
    # see gh-14154
    data = """AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
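
# Illustrative note (a sketch, not from the original suite): a callable
# usecols is invoked once per header name and keeps the columns for which it
# returns True. The original, case-preserving names survive -- the expected
# frame carries "AaA"/"bBb"/"ddd" even though the predicate compares
# upper-cased names -- and a predicate that always returns False yields an
# empty DataFrame.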


@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    # see gh-6710
    data = "1,2\n1,2,3"
    parser = all_parsers
    names = ["a", "b", "c"]
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = parser.read_csv(StringIO(data), names=names, usecols=usecols)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "data,usecols,kwargs,expected",
    [
        # see gh-8985
        (
            "19,29,39\n" * 2 + "10,20,30,40",
            [0, 1, 2],
            dict(header=None),
            DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
        ),
        # see gh-9549
        (
            ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
            ["A", "B", "C"],
            dict(),
            DataFrame(
                {
                    "A": [1, 3, 1, 1, 1, 5],
                    "B": [2, 4, 2, 2, 2, 6],
                    "C": [3, 5, 4, 3, 3, 7],
                }
            ),
        ),
    ],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    # see gh-8985
    parser = all_parsers
    result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "usecols,kwargs,expected,msg",
    [
        (
            ["a", "b", "c", "d"],
            dict(),
            DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
            None,
        ),
        (
            ["a", "b", "c", "f"],
            dict(),
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (["a", "b", "f"], dict(), None, _msg_validate_usecols_names.format(r"\['f'\]")),
        (
            ["a", "b", "f", "g"],
            dict(),
            None,
            _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
        ),
        # see gh-14671
        (
            None,
            dict(header=0, names=["A", "B", "C", "D"]),
            DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
            None,
        ),
        (
            ["A", "B", "C", "f"],
            dict(header=0, names=["A", "B", "C", "D"]),
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (
            ["A", "B", "f"],
            dict(names=["A", "B", "C", "D"]),
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
    ],
)
def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    kwargs.update(usecols=usecols)
    parser = all_parsers

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(
    reason="see gh-16469: works on the C engine but not the Python engine", strict=False
)
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    names = ["A", "B", "C", "D"]
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
    expected = DataFrame({"A": [1, 5], "C": [3, 7]})
    tm.assert_frame_equal(result, expected)