8th day of Python challenges 111-117
@@ -0,0 +1,117 @@
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal


def test_compression_roundtrip(compression):
    df = pd.DataFrame(
        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        index=["A", "B"],
        columns=["X", "Y", "Z"],
    )

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        assert_frame_equal(df, pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode("utf8")
        assert_frame_equal(df, pd.read_json(result))


def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression="zip")

    assert_frame_equal(uncompressed_df, compressed_df)


@td.skip_if_not_us_locale
def test_with_s3_url(compression, s3_resource):
    # Bucket "pandas-test" created in tests/io/conftest.py

    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as f:
            s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)

    roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression)
    assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression):

    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient="records", lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
        assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression):

    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        df.to_json(path, orient="records", lines=True, compression=compression)

        res = pd.read_json(path, lines=True, chunksize=1, compression=compression)
        roundtripped_df = pd.concat(res)
        assert_frame_equal(df, roundtripped_df)


def test_write_unsupported_compression_type():
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")


def test_read_unsupported_compression_type():
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")


@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(compression_only, read_infer, to_infer):
    # see gh-15008
    compression = compression_only

    if compression == "zip":
        pytest.skip(
            "{compression} is not supported "
            "for to_json".format(compression=compression)
        )

    # We'll complete file extension subsequently.
    filename = "test."

    if compression == "gzip":
        filename += "gz"
    else:
        # xz --> .xz
        # bz2 --> .bz2
        filename += compression

    df = pd.DataFrame({"A": [1]})

    to_compression = "infer" if to_infer else compression
    read_compression = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        df.to_json(path, compression=to_compression)
        result = pd.read_json(path, compression=read_compression)
        tm.assert_frame_equal(result, df)
@@ -0,0 +1,717 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
import json

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype

import pandas as pd
from pandas import DataFrame
import pandas.util.testing as tm

from pandas.io.json._table_schema import (
    as_json_table_type,
    build_table_schema,
    convert_json_field_to_pandas_type,
    convert_pandas_type_to_json_field,
    set_default_names,
)


class TestBuildSchema:
    def setup_method(self, method):
        self.df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C": pd.date_range("2016-01-01", freq="d", periods=4),
                "D": pd.timedelta_range("1H", periods=4, freq="T"),
            },
            index=pd.Index(range(4), name="idx"),
        )

    def test_build_table_schema(self):
        result = build_table_schema(self.df, version=False)
        expected = {
            "fields": [
                {"name": "idx", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["idx"],
        }
        assert result == expected
        result = build_table_schema(self.df)
        assert "pandas_version" in result

    def test_series(self):
        s = pd.Series([1, 2, 3], name="foo")
        result = build_table_schema(s, version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "foo", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected
        result = build_table_schema(s)
        assert "pandas_version" in result

    def test_series_unnamed(self):
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "values", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected

    def test_multiindex(self):
        df = self.df.copy()
        idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            "fields": [
                {"name": "level_0", "type": "string"},
                {"name": "level_1", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["level_0", "level_1"],
        }
        assert result == expected

        df.index.names = ["idx0", None]
        expected["fields"][0]["name"] = "idx0"
        expected["primaryKey"] = ["idx0", "level_1"]
        result = build_table_schema(df, version=False)
        assert result == expected


class TestTableSchemaType:
    @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer"

    @pytest.mark.parametrize(
        "float_type", [np.float, np.float16, np.float32, np.float64]
    )
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1.0, 2.0, 3.0]
        assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number"

    @pytest.mark.parametrize("bool_type", [bool, np.bool])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean"

    @pytest.mark.parametrize(
        "date_data",
        [
            pd.to_datetime(["2016"]),
            pd.to_datetime(["2016"], utc=True),
            pd.Series(pd.to_datetime(["2016"])),
            pd.Series(pd.to_datetime(["2016"], utc=True)),
            pd.period_range("2016", freq="A", periods=3),
        ],
    )
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data) == "datetime"

    @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data) == "string"

    @pytest.mark.parametrize(
        "cat_data",
        [
            pd.Categorical(["a"]),
            pd.Categorical([1]),
            pd.Series(pd.Categorical([1])),
            pd.CategoricalIndex([1]),
            pd.Categorical([1]),
        ],
    )
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data) == "any"

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize("int_dtype", [np.int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == "integer"

    @pytest.mark.parametrize(
        "float_dtype", [np.float, np.float16, np.float32, np.float64]
    )
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == "number"

    @pytest.mark.parametrize("bool_dtype", [bool, np.bool])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == "boolean"

    @pytest.mark.parametrize(
        "date_dtype",
        [
            np.datetime64,
            np.dtype("<M8[ns]"),
            PeriodDtype("D"),
            DatetimeTZDtype("ns", "US/Central"),
        ],
    )
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == "datetime"

    @pytest.mark.parametrize("td_dtype", [np.timedelta64, np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == "duration"

    @pytest.mark.parametrize("str_dtype", [object])  # TODO
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == "string"

    def test_as_json_table_type_categorical_dtypes(self):
        # TODO: I think before is_categorical_dtype(Categorical)
        # returned True, but now it's False. Figure out why or
        # if it matters
        assert as_json_table_type(pd.Categorical(["a"])) == "any"
        assert as_json_table_type(CategoricalDtype()) == "any"


class TestTableOrient:
    def setup_method(self, method):
        self.df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C": pd.date_range("2016-01-01", freq="d", periods=4),
                "D": pd.timedelta_range("1H", periods=4, freq="T"),
                "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
                "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
                "G": [1.0, 2.0, 3, 4.0],
                "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
            },
            index=pd.Index(range(4), name="idx"),
        )

    def test_build_series(self):
        s = pd.Series([1, 2], name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                (
                    "data",
                    [
                        OrderedDict([("id", 0), ("a", 1)]),
                        OrderedDict([("id", 1), ("a", 2)]),
                    ],
                ),
            ]
        )
        assert result == expected

    def test_to_json(self):
        df = self.df.copy()
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {"name": "A", "type": "integer"},
            {"name": "B", "type": "string"},
            {"name": "C", "type": "datetime"},
            {"name": "D", "type": "duration"},
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "E",
                "ordered": False,
                "type": "any",
            },
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "F",
                "ordered": True,
                "type": "any",
            },
            {"name": "G", "type": "number"},
            {"name": "H", "type": "datetime", "tz": "US/Central"},
        ]

        schema = {"fields": fields, "primaryKey": ["idx"]}
        data = [
            OrderedDict(
                [
                    ("idx", 0),
                    ("A", 1),
                    ("B", "a"),
                    ("C", "2016-01-01T00:00:00.000Z"),
                    ("D", "P0DT1H0M0S"),
                    ("E", "a"),
                    ("F", "a"),
                    ("G", 1.0),
                    ("H", "2016-01-01T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 1),
                    ("A", 2),
                    ("B", "b"),
                    ("C", "2016-01-02T00:00:00.000Z"),
                    ("D", "P0DT1H1M0S"),
                    ("E", "b"),
                    ("F", "b"),
                    ("G", 2.0),
                    ("H", "2016-01-02T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 2),
                    ("A", 3),
                    ("B", "c"),
                    ("C", "2016-01-03T00:00:00.000Z"),
                    ("D", "P0DT1H2M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 3.0),
                    ("H", "2016-01-03T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 3),
                    ("A", 4),
                    ("B", "c"),
                    ("C", "2016-01-04T00:00:00.000Z"),
                    ("D", "P0DT1H3M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 4.0),
                    ("H", "2016-01-04T06:00:00.000Z"),
                ]
            ),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])
        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1.0, 2.0])
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {"name": "index", "type": "number"},
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", 1.0), ("values", 1)]),
                        OrderedDict([("index", 2.0), ("values", 1)]),
                    ],
                ),
            ]
        )
        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range("2016", freq="Q-JAN", periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"freq": "Q-JAN", "name": "index", "type": "datetime"},
            {"name": "values", "type": "integer"},
        ]

        schema = {"fields": fields, "primaryKey": ["index"]}
        data = [
            OrderedDict([("index", "2015-11-01T00:00:00.000Z"), ("values", 1)]),
            OrderedDict([("index", "2016-02-01T00:00:00.000Z"), ("values", 1)]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])
        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {
                                "name": "index",
                                "type": "any",
                                "constraints": {"enum": ["a", "b"]},
                                "ordered": False,
                            },
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", "a"), ("values", 1)]),
                        OrderedDict([("index", "b"), ("values", 1)]),
                    ],
                ),
            ]
        )
        assert result == expected

    def test_date_format_raises(self):
        with pytest.raises(ValueError):
            self.df.to_json(orient="table", date_format="epoch")

        # others work
        self.df.to_json(orient="table", date_format="iso")
        self.df.to_json(orient="table")

    @pytest.mark.parametrize("kind", [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_int(self, kind):
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Series, pd.Index])
    def test_convert_pandas_type_to_json_field_float(self, kind):
        data = [1.0, 2.0, 3.0]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize(
        "dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
    )
    @pytest.mark.parametrize("wrapper", [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(
        self, dt_args, extra_exp, wrapper
    ):
        data = [1.0, 2.0, 3.0]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name="values")
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": "datetime"}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range("2016", freq="A-DEC", periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": "datetime", "freq": "A-DEC"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
        data = ["a", "b", "c"]
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name="cats")
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name="cats")

        result = convert_pandas_type_to_json_field(arr)
        expected = {
            "name": "cats",
            "type": "any",
            "constraints": {"enum": data},
            "ordered": ordered,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "inp,exp",
        [
            ({"type": "integer"}, "int64"),
            ({"type": "number"}, "float64"),
            ({"type": "boolean"}, "bool"),
            ({"type": "duration"}, "timedelta64"),
            ({"type": "datetime"}, "datetime64[ns]"),
            ({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
            ({"type": "any"}, "object"),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": False,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=False),
            ),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": True,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=True),
            ),
            ({"type": "string"}, "object"),
        ],
    )
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {"name": "foo"}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {"type": inp}
        with pytest.raises(
            ValueError, match=("Unsupported or invalid field type: {}".format(inp))
        ):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(["a", "b", "a"]))
        s.index.name = "idx"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {
                "constraints": {"enum": ["a", "b"]},
                "name": "values",
                "ordered": False,
                "type": "any",
            },
        ]

        expected = OrderedDict(
            [
                ("schema", {"fields": fields, "primaryKey": ["idx"]}),
                (
                    "data",
                    [
                        OrderedDict([("idx", 0), ("values", "a")]),
                        OrderedDict([("idx", 1), ("values", "b")]),
                        OrderedDict([("idx", 2), ("values", "a")]),
                    ],
                ),
            ]
        )
        assert result == expected

    @pytest.mark.parametrize(
        "idx,nm,prop",
        [
            (pd.Index([1]), "index", "name"),
            (pd.Index([1], name="myname"), "myname", "name"),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
                ["level_0", "level_1"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", "n2"]
                ),
                ["n1", "n2"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", None]
                ),
                ["n1", "level_1"],
                "names",
            ),
        ],
    )
    def test_set_names_unset(self, idx, nm, prop):
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index([], name="index"),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
        ],
    )
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = pd.DataFrame(index=idx)
        df.index.name = "index"
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = pd.DataFrame(
            [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
        )
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js["schema"]["fields"][1]["name"] == 1451606400000
        assert js["schema"]["fields"][2]["name"] == 10000

    @pytest.mark.parametrize(
        "case",
        [
            pd.Series([1], index=pd.Index([1], name="a"), name="a"),
            pd.DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
            pd.DataFrame(
                {"A": [1]},
                index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
            ),
        ],
    )
    def test_overlapping_names(self, case):
        with pytest.raises(ValueError, match="Overlapping"):
            case.to_json(orient="table")

    def test_mi_falsey_name(self):
        # GH 16203
        df = pd.DataFrame(
            np.random.randn(4, 4),
            index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
        )
        result = [x["name"] for x in build_table_schema(df)["fields"]]
        assert result == ["level_0", "level_1", 0, 1, 2, 3]


class TestTableOrientReader:
    @pytest.mark.parametrize(
        "index_nm",
        [None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"ints": [1, 2, 3, 4]},
            {"objects": ["a", "b", "c", "d"]},
            {"objects": ["1", "2", "3", "4"]},
            {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
            {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
            {
                "ordered_cats": pd.Series(
                    pd.Categorical(["a", "b", "c", "c"], ordered=True)
                )
            },
            {"floats": [1.0, 2.0, 3.0, 4.0]},
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"bools": [True, False, False, True]},
        ],
    )
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_nm", [None, "idx", "index"])
    @pytest.mark.parametrize(
        "vals",
        [
            {"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")},
            {
                "timezones": pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"
                )
            },
        ],
    )
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with pytest.raises(NotImplementedError, match="can not yet read "):
            pd.read_json(out, orient="table")

    def test_comprehensive(self):
        df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C": pd.date_range("2016-01-01", freq="d", periods=4),
                # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
                "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
                "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
                "G": [1.1, 2.2, 3.3, 4.4],
                # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
                #                    tz='US/Central'),
                "I": [True, False, False, True],
            },
            index=pd.Index(range(4), name="idx"),
        )

        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize(
        "index_names",
        [[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
    )
    def test_multiindex(self, index_names):
        # GH 18912
        df = pd.DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"],
        )
        df.index.names = index_names
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    def test_empty_frame_roundtrip(self):
        # GH 21287
        df = pd.DataFrame(columns=["a", "b", "c"])
        expected = df.copy()
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(expected, result)
@@ -0,0 +1,694 @@
import json

import numpy as np
import pytest

from pandas.compat import PY36

from pandas import DataFrame, Index
import pandas.util.testing as tm

from pandas.io.json import json_normalize
from pandas.io.json._normalize import nested_to_record


@pytest.fixture
def deep_nested():
    # deeply nested data
    return [
        {
            "country": "USA",
            "states": [
                {
                    "name": "California",
                    "cities": [
                        {"name": "San Francisco", "pop": 12345},
                        {"name": "Los Angeles", "pop": 12346},
                    ],
                },
                {
                    "name": "Ohio",
                    "cities": [
                        {"name": "Columbus", "pop": 1234},
                        {"name": "Cleveland", "pop": 1236},
                    ],
                },
            ],
        },
        {
            "country": "Germany",
            "states": [
                {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
                {
                    "name": "Nordrhein-Westfalen",
                    "cities": [
                        {"name": "Duesseldorf", "pop": 1238},
                        {"name": "Koeln", "pop": 1239},
                    ],
                },
            ],
        },
    ]


@pytest.fixture
def state_data():
    return [
        {
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
            "info": {"governor": "Rick Scott"},
            "shortname": "FL",
            "state": "Florida",
        },
        {
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
            "info": {"governor": "John Kasich"},
            "shortname": "OH",
            "state": "Ohio",
        },
    ]


@pytest.fixture
def author_missing_data():
    return [
        {"info": None},
        {
            "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
            "author_name": {"first": "Jane", "last_name": "Doe"},
        },
    ]


@pytest.fixture
def missing_metadata():
    return [
        {
            "name": "Alice",
            "addresses": [
                {
                    "number": 9562,
                    "street": "Morris St.",
                    "city": "Massillon",
                    "state": "OH",
                    "zip": 44646,
                }
            ],
        },
        {
            "addresses": [
                {
                    "number": 8449,
                    "street": "Spring St.",
                    "city": "Elizabethton",
                    "state": "TN",
                    "zip": 37643,
                }
            ]
        },
    ]


@pytest.fixture
def max_level_test_input_data():
    """
    input data to test json_normalize with max_level param
    """
    return [
        {
            "CreatedBy": {"Name": "User001"},
            "Lookup": {
                "TextField": "Some text",
                "UserField": {"Id": "ID001", "Name": "Name001"},
            },
            "Image": {"a": "b"},
        }
    ]


class TestJSONNormalize:
    def test_simple_records(self):
        recs = [
            {"a": 1, "b": 2, "c": 3},
            {"a": 4, "b": 5, "c": 6},
            {"a": 7, "b": 8, "c": 9},
            {"a": 10, "b": 11, "c": 12},
        ]

        result = json_normalize(recs)
        expected = DataFrame(recs)

        tm.assert_frame_equal(result, expected)

    def test_simple_normalize(self, state_data):
        result = json_normalize(state_data[0], "counties")
        expected = DataFrame(state_data[0]["counties"])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, "counties")

        expected = []
        for rec in state_data:
            expected.extend(rec["counties"])
        expected = DataFrame(expected)

        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, "counties", meta="state")
        expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])

        tm.assert_frame_equal(result, expected)

    def test_empty_array(self):
        result = json_normalize([])
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)

    def test_simple_normalize_with_separator(self, deep_nested):
        # GH 14883
        result = json_normalize({"A": {"A": 1, "B": 2}})
        expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
        expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
        expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize(
            deep_nested,
            ["states", "cities"],
            meta=["country", ["states", "name"]],
            sep="_",
        )
        expected = Index(["name", "pop", "country", "states_name"]).sort_values()
        assert result.columns.sort_values().equals(expected)

    def test_value_array_record_prefix(self):
        # GH 21536
        result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
        expected = DataFrame([[1], [2]], columns=["Prefix.0"])
        tm.assert_frame_equal(result, expected)

    def test_nested_object_record_path(self):
        # GH 22706
        data = {
            "state": "Florida",
            "info": {
                "governor": "Rick Scott",
                "counties": [
                    {"name": "Dade", "population": 12345},
                    {"name": "Broward", "population": 40000},
                    {"name": "Palm Beach", "population": 60000},
                ],
            },
        }
        result = json_normalize(data, record_path=["info", "counties"])
        expected = DataFrame(
            [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
            columns=["name", "population"],
        )
        tm.assert_frame_equal(result, expected)

    def test_more_deeply_nested(self, deep_nested):

        result = json_normalize(
            deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
        )
        ex_data = {
            "country": ["USA"] * 4 + ["Germany"] * 3,
            "states.name": [
                "California",
                "California",
                "Ohio",
                "Ohio",
                "Bayern",
                "Nordrhein-Westfalen",
                "Nordrhein-Westfalen",
            ],
            "name": [
                "San Francisco",
                "Los Angeles",
                "Columbus",
                "Cleveland",
                "Munich",
                "Duesseldorf",
                "Koeln",
            ],
            "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
        }

        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_shallow_nested(self):
        data = [
            {
                "state": "Florida",
                "shortname": "FL",
                "info": {"governor": "Rick Scott"},
                "counties": [
                    {"name": "Dade", "population": 12345},
                    {"name": "Broward", "population": 40000},
                    {"name": "Palm Beach", "population": 60000},
                ],
            },
            {
                "state": "Ohio",
                "shortname": "OH",
                "info": {"governor": "John Kasich"},
                "counties": [
                    {"name": "Summit", "population": 1234},
                    {"name": "Cuyahoga", "population": 1337},
                ],
            },
        ]

        result = json_normalize(
            data, "counties", ["state", "shortname", ["info", "governor"]]
        )
        ex_data = {
            "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
            "state": ["Florida"] * 3 + ["Ohio"] * 2,
            "shortname": ["FL", "FL", "FL", "OH", "OH"],
            "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
            "population": [12345, 40000, 60000, 1234, 1337],
        }
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_meta_name_conflict(self):
        data = [
            {
                "foo": "hello",
                "bar": "there",
                "data": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]

        msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
        with pytest.raises(ValueError, match=msg):
            json_normalize(data, "data", meta=["foo", "bar"])

        result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")

        for val in ["metafoo", "metabar", "foo", "bar"]:
            assert val in result

    def test_meta_parameter_not_modified(self):
        # GH 18610
        data = [
            {
                "foo": "hello",
                "bar": "there",
                "data": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]

        COLUMNS = ["foo", "bar"]
        result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")

        assert COLUMNS == ["foo", "bar"]
        for val in ["metafoo", "metabar", "foo", "bar"]:
            assert val in result

    def test_record_prefix(self, state_data):
        result = json_normalize(state_data[0], "counties")
        expected = DataFrame(state_data[0]["counties"])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(
            state_data, "counties", meta="state", record_prefix="county_"
        )

        expected = []
        for rec in state_data:
            expected.extend(rec["counties"])
        expected = DataFrame(expected)
        expected = expected.rename(columns=lambda x: "county_" + x)
        expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])

        tm.assert_frame_equal(result, expected)

    def test_non_ascii_key(self):
        testjson = (
            b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
            + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
        ).decode("utf8")

        testdata = {
            b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
            "sub.A": [1, 3],
            "sub.B": [2, 4],
        }
        expected = DataFrame(testdata)

        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {
                "info": np.nan,
                "info.created_at": np.nan,
                "info.last_updated": np.nan,
                "author_name.first": np.nan,
                "author_name.last_name": np.nan,
            },
            {
                "info": None,
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
            },
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected, check_like=not PY36)

    @pytest.mark.parametrize(
        "max_level,expected",
        [
            (
                0,
                [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
            (
                1,
                [
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
        ],
    )
    def test_max_level_with_records_path(self, max_level, expected):
        # GH23843: Enhanced JSON normalize
        test_input = [
            {
                "CreatedBy": {"Name": "User001"},
                "Lookup": [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                ],
                "Image": {"a": "b"},
                "tags": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]

        result = json_normalize(
            test_input,
            record_path=["Lookup"],
            meta=[["CreatedBy"], ["Image"]],
            max_level=max_level,
        )
        expected_df = DataFrame(data=expected, columns=result.columns.values)
        tm.assert_equal(expected_df, result)


class TestNestedToRecord:
    def test_flat_stays_flat(self):
        recs = [dict(flat1=1, flat2=2), dict(flat1=3, flat2=4)]
        result = nested_to_record(recs)
        expected = recs
        assert result == expected

    def test_one_level_deep_flattens(self):
        data = dict(flat1=1, dict1=dict(c=1, d=2))

        result = nested_to_record(data)
        expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}

        assert result == expected

    def test_nested_flattens(self):
        data = dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))

        result = nested_to_record(data)
        expected = {
            "dict1.c": 1,
            "dict1.d": 2,
            "flat1": 1,
            "nested.d": 2,
            "nested.e.c": 1,
            "nested.e.d": 2,
        }

        assert result == expected

    def test_json_normalize_errors(self, missing_metadata):
        # GH14583:
        # If meta keys are not always present a new option to set
        # errors='ignore' has been implemented

        msg = "Try running with errors='ignore' as key 'name' is not always present"
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=missing_metadata,
                record_path="addresses",
                meta="name",
                errors="raise",
            )

    def test_missing_meta(self, missing_metadata):
        # GH25468
        # If metadata is nullable with errors set to ignore, the null values
        # should be numpy.nan values
        result = json_normalize(
            data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
        )
        ex_data = [
            [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
            [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
        ]
        columns = ["number", "street", "city", "state", "zip", "name"]
        expected = DataFrame(ex_data, columns=columns)
        tm.assert_frame_equal(result, expected, check_like=not PY36)

    def test_donot_drop_nonevalues(self):
        # GH21356
        data = [
            {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
            {
                "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
                "author_name": {"first": "Jane", "last_name": "Doe"},
            },
        ]
        result = nested_to_record(data)
        expected = [
            {
                "info": None,
                "author_name.first": "Smith",
                "author_name.last_name": "Appleseed",
            },
            {
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
            },
        ]

        assert result == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    }
                }
            },
        }
        result = nested_to_record(data)
        expected = {
            "id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert result == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it doesn't do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    },
                },
            },
        }
        result = nested_to_record(data)
        expected = {
            "id": None,
            "location.id": None,
            "location.country.id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "max_level, expected",
        [
            (
                None,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField.Id": "ID001",
                        "Lookup.UserField.Name": "Name001",
                        "Image.a": "b",
                    }
                ],
            ),
            (
                0,
                [
                    {
                        "CreatedBy": {"Name": "User001"},
                        "Lookup": {
                            "TextField": "Some text",
                            "UserField": {"Id": "ID001", "Name": "Name001"},
                        },
                        "Image": {"a": "b"},
                    }
                ],
            ),
            (
                1,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
                        "Image.a": "b",
                    }
                ],
            ),
        ],
    )
    def test_with_max_level(self, max_level, expected, max_level_test_input_data):
        # GH23843: Enhanced JSON normalize
        output = nested_to_record(max_level_test_input_data, max_level=max_level)
        assert output == expected

    def test_with_large_max_level(self):
        # GH23843: Enhanced JSON normalize
        max_level = 100
        input_data = [
            {
                "CreatedBy": {
                    "user": {
                        "name": {"firstname": "Leo", "LastName": "Thomson"},
                        "family_tree": {
                            "father": {
                                "name": "Father001",
                                "father": {
                                    "Name": "Father002",
                                    "father": {
                                        "name": "Father003",
                                        "father": {"Name": "Father004"},
                                    },
                                },
                            }
                        },
                    }
                }
            }
        ]
        expected = [
            {
                "CreatedBy.user.name.firstname": "Leo",
                "CreatedBy.user.name.LastName": "Thomson",
                "CreatedBy.user.family_tree.father.name": "Father001",
                "CreatedBy.user.family_tree.father.father.Name": "Father002",
                "CreatedBy.user.family_tree.father.father.father.name": "Father003",
                "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004",  # noqa: E501
            }
        ]
        output = nested_to_record(input_data, max_level=max_level)
        assert output == expected
venv/lib/python3.6/site-packages/pandas/tests/io/json/test_pandas.py — 1613 lines
File diff suppressed because it is too large
@@ -0,0 +1,176 @@
from io import StringIO

import pytest

import pandas as pd
from pandas import DataFrame, read_json
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal, ensure_clean

from pandas.io.json._json import JsonReader


@pytest.fixture
def lines_json_df():
    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    return df.to_json(lines=True, orient="records")


def test_read_jsonl():
    # GH9180
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    assert_frame_equal(result, expected)


def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    assert_frame_equal(result, expected)


def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}'
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)


@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # Basic test that read_json(chunks=True) gives the same result as
    # read_json(chunks=False)
    # GH17048: memory usage when lines=True

    unchunked = read_json(StringIO(lines_json_df), lines=True)
    reader = read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize)
    chunked = pd.concat(reader)

    assert_frame_equal(chunked, unchunked)


def test_readjson_chunksize_requires_lines(lines_json_df):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)


def test_readjson_chunks_series():
    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({"A": 1, "B": 2})

    strio = StringIO(s.to_json(lines=True, orient="records"))
    unchunked = pd.read_json(strio, lines=True, typ="Series")

    strio = StringIO(s.to_json(lines=True, orient="records"))
    chunked = pd.concat(pd.read_json(strio, lines=True, typ="Series", chunksize=1))

    assert_series_equal(chunked, unchunked)


def test_readjson_each_chunk(lines_json_df):
    # Other tests check that the final result of read_json(chunksize=True)
    # is correct. This checks the intermediate chunks.
    chunks = list(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2))
    assert chunks[0].shape == (2, 2)
    assert chunks[1].shape == (1, 2)


def test_readjson_chunks_from_file():
    with ensure_clean("test.json") as path:
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)


@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    with ensure_clean("test.json") as path:
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path,
            orient=None,
            typ="frame",
            dtype=True,
            convert_axes=True,
            convert_dates=True,
            keep_default_dates=True,
            numpy=False,
            precise_float=False,
            date_unit=None,
            encoding=None,
            lines=True,
            chunksize=chunksize,
            compression=None,
        )
        reader.read()
        msg = "didn't close stream with chunksize = {chunksize}".format(
            chunksize=chunksize
        )
        assert reader.open_stream.closed, msg


@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize)


@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}







    {"A":3,"B":6}
    """
    orig = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize)
    )
venv/lib/python3.6/site-packages/pandas/tests/io/json/test_ujson.py — 1084 lines
File diff suppressed because it is too large