8th day of python challenges 111-117
12
venv/lib/python3.6/site-packages/pandas/io/json/__init__.py
Normal file
@@ -0,0 +1,12 @@
from pandas.io.json._json import dumps, loads, read_json, to_json
from pandas.io.json._normalize import json_normalize
from pandas.io.json._table_schema import build_table_schema

__all__ = [
    "dumps",
    "loads",
    "read_json",
    "to_json",
    "json_normalize",
    "build_table_schema",
]
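For orientation, a minimal sketch of how this public API is typically consumed; illustrative only, assuming a pandas 0.25-era environment where these paths are importable.

# Hypothetical usage sketch, not part of the committed file.
from pandas.io.json import json_normalize, build_table_schema

records = [{"id": 1, "meta": {"tag": "a"}}, {"id": 2, "meta": {"tag": "b"}}]
df = json_normalize(records)        # flattens "meta" into a "meta.tag" column
schema = build_table_schema(df)     # Table Schema dict describing the frame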
1188
venv/lib/python3.6/site-packages/pandas/io/json/_json.py
Normal file
File diff suppressed because it is too large
343
venv/lib/python3.6/site-packages/pandas/io/json/_normalize.py
Normal file
@@ -0,0 +1,343 @@
# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
import copy
from typing import DefaultDict, Dict, List, Optional, Union

import numpy as np

from pandas._libs.writers import convert_json_to_lines

from pandas import DataFrame

def convert_to_line_delimits(s):
    """
    Helper function that converts JSON lists to line-delimited JSON.
    """

    # Determine whether we have a JSON list to turn into lines; otherwise
    # just return the JSON object, since only lists can be converted.
    # (Parenthesized so `not` applies to the whole "starts with [ and ends
    # with ]" test, not only to the first comparison.)
    if not (s[0] == "[" and s[-1] == "]"):
        return s
    s = s[1:-1]

    return convert_json_to_lines(s)
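A quick sketch of the intended behavior (convert_json_to_lines is a compiled pandas helper, so this assumes a working install):

# A JSON array of objects becomes newline-delimited JSON, one object per line.
s = '[{"a":1},{"a":2}]'
print(convert_to_line_delimits(s))
# {"a":1}
# {"a":2}

# Non-list input is returned unchanged.
print(convert_to_line_delimits('{"a":1}'))  # {"a":1}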
def nested_to_record(
    ds,
    prefix: str = "",
    sep: str = ".",
    level: int = 0,
    max_level: Optional[int] = None,
):
    """
    A simplified json_normalize.

    Converts a nested dict into a flat dict ("record"); unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : str, optional, default: ""
        The prefix prepended to flattened key names.
    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar

        .. versionadded:: 0.20.0

    level : int, optional, default: 0
        The number of levels in the json string.

    max_level : int, optional, default: None
        The max depth to normalize.

        .. versionadded:: 0.25.0

    Returns
    -------
    d : dict or list of dicts, matching `ds`

    Examples
    --------
    IN[52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
                                  nested=dict(e=dict(c=1, d=2), d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True
    new_ds = []
    for d in ds:
        new_d = copy.deepcopy(d)
        for k, v in d.items():
            # each key gets renamed with prefix
            if not isinstance(k, str):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + sep + k

            # flatten if type is dict and
            # current dict level < maximum level provided and
            # only dicts get recurse-flattened;
            # only at level > 0 do we rename the rest of the keys
            if not isinstance(v, dict) or (
                max_level is not None and level >= max_level
            ):
                if level != 0:  # so we skip copying for top level, common case
                    v = new_d.pop(k)
                    new_d[newkey] = v
                continue
            else:
                v = new_d.pop(k)
                new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds
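The docstring example flattens fully; the sketch below illustrates the max_level cut-off, assuming levels are counted from 0 as in the body above.

# max_level=1 flattens one level and leaves deeper dicts intact.
record = {"flat": 0, "outer": {"inner": {"deep": 1}}}
print(nested_to_record(record, max_level=1))
# {'flat': 0, 'outer.inner': {'deep': 1}}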
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects.
    record_path : str or list of str, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records.
    meta : list of paths (str or list of str), default None
        Fields to use as metadata for each record in resulting table.
    meta_prefix : str, default None
        If not None, prefix metadata column names with the dotted path,
        e.g. foo.bar.field if meta is ['foo', 'bar'].
    record_prefix : str, default None
        If not None, prefix record column names with the dotted path,
        e.g. foo.bar.field if path to records is ['foo', 'bar'].
    errors : {'raise', 'ignore'}, default 'raise'
        Configures error handling.

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present.
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present.

        .. versionadded:: 0.20.0

    sep : str, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.

        .. versionadded:: 0.20.0

    max_level : int, default None
        Max number of levels (depth of dict) to normalize.
        If None, normalizes all levels.

        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
        Normalized semi-structured JSON data as a flat table.

    Examples
    --------
    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=0)
                             fitness   id        name
    0  {'height': 130, 'weight': 60}  1.0   Cole Volk
    1  {'height': 130, 'weight': 60}  NaN    Mose Reg
    2  {'height': 130, 'weight': 60}  2.0  Faye Raker

    Normalizes nested data up to level 1.

    >>> data = [{'id': 1,
    ...          'name': "Cole Volk",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'name': "Mose Reg",
    ...          'fitness': {'height': 130, 'weight': 60}},
    ...         {'id': 2, 'name': 'Faye Raker',
    ...          'fitness': {'height': 130, 'weight': 60}}]
    >>> json_normalize(data, max_level=1)
       fitness.height  fitness.weight   id        name
    0             130              60  1.0   Cole Volk
    1             130              60  NaN    Mose Reg
    2             130              60  2.0  Faye Raker

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {'governor': 'Rick Scott'},
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                       {'name': 'Broward', 'population': 40000},
    ...                       {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {'governor': 'John Kasich'},
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                            ['info', 'governor']])
    >>> result
             name  population    state shortname info.governor
    0        Dade       12345  Florida        FL    Rick Scott
    1     Broward       40000  Florida        FL    Rick Scott
    2  Palm Beach       60000  Florida        FL    Rick Scott
    3      Summit        1234     Ohio        OH   John Kasich
    4    Cuyahoga        1337     Ohio        OH   John Kasich

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
       Prefix.0
    0         1
    1         2

    Returns normalized data with columns prefixed with the given string.
    """
    def _pull_field(js, spec):
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result
    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # Flatten only if some record value is itself a dict. (The original
        # `any([...] for y in data)` wrapped each test in a list, which is
        # truthy whenever non-empty, so it triggered for any flat record too.)
        if any(isinstance(x, dict) for y in data for x in y.values()):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep, max_level=max_level)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []  # type: List
    lengths = []

    meta_vals = defaultdict(list)  # type: DefaultDict
    meta_keys = [sep.join(val) for val in meta]
    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])
                recs = [
                    nested_to_record(r, sep=sep, max_level=max_level)
                    if isinstance(r, dict)
                    else r
                    for r in recs
                ]

                # For repeating the metadata later
                lengths.append(len(recs))
                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == "ignore":
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with errors='ignore' as key "
                                    "{err} is not always present".format(err=e)
                                )
                    meta_vals[key].append(meta_val)
                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError(
                "Conflicting metadata name {name}, "
                "need distinguishing prefix".format(name=k)
            )
        result[k] = np.array(v, dtype=object).repeat(lengths)
    return result
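One case the docstring does not cover is a meta key missing from some records; a small sketch of the errors= handling above, using made-up data:

# With errors='ignore', a missing meta key becomes NaN instead of raising.
data = [
    {"state": "Florida", "counties": [{"name": "Dade"}]},
    {"counties": [{"name": "Summit"}]},  # no 'state' key here
]
df = json_normalize(data, "counties", ["state"], errors="ignore")
# df["state"] -> ['Florida', nan]; errors='raise' would raise KeyError.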
338
venv/lib/python3.6/site-packages/pandas/io/json/_table_schema.py
Normal file
@@ -0,0 +1,338 @@
"""
Table Schema builders

http://specs.frictionlessdata.io/json-table-schema/
"""
import warnings

import pandas._libs.json as json

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_dtype,
    is_datetime64tz_dtype,
    is_integer_dtype,
    is_numeric_dtype,
    is_period_dtype,
    is_string_dtype,
    is_timedelta64_dtype,
)

from pandas import DataFrame
from pandas.api.types import CategoricalDtype
import pandas.core.common as com

loads = json.loads

def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data type

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================
    """
    if is_integer_dtype(x):
        return "integer"
    elif is_bool_dtype(x):
        return "boolean"
    elif is_numeric_dtype(x):
        return "number"
    elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
        return "datetime"
    elif is_timedelta64_dtype(x):
        return "duration"
    elif is_categorical_dtype(x):
        return "any"
    elif is_string_dtype(x):
        return "string"
    else:
        return "any"
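A short sketch of the mapping, assuming NumPy dtypes as input:

import numpy as np
as_json_table_type(np.dtype("int64"))           # 'integer'
as_json_table_type(np.dtype("datetime64[ns]"))  # 'datetime'
as_json_table_type(np.dtype("object"))          # 'string'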
def set_default_names(data):
    """Sets index names to 'index' for regular, or 'level_x' for Multi"""
    if com._all_not_none(*data.index.names):
        nms = data.index.names
        if len(nms) == 1 and data.index.name == "index":
            warnings.warn("Index name of 'index' is not round-trippable")
        elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
            warnings.warn(
                "Index names beginning with 'level_' are not round-trippable"
            )
        return data

    data = data.copy()
    if data.index.nlevels > 1:
        names = [
            name if name is not None else "level_{}".format(i)
            for i, name in enumerate(data.index.names)
        ]
        data.index.names = names
    else:
        data.index.name = data.index.name or "index"
    return data
def convert_pandas_type_to_json_field(arr, dtype=None):
    """Build a Table Schema field descriptor for a pandas array or Index."""
    dtype = dtype or arr.dtype
    if arr.name is None:
        name = "values"
    else:
        name = arr.name
    field = {"name": name, "type": as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, "categories"):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field["constraints"] = {"enum": list(cats)}
        field["ordered"] = ordered
    elif is_period_dtype(arr):
        field["freq"] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, "dt"):
            field["tz"] = arr.dt.tz.zone
        else:
            field["tz"] = arr.tz.zone
    return field
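A sketch of the field descriptor this produces for a categorical column (assumes pandas imported as pd):

import pandas as pd
s = pd.Series(["a", "b", "a"], dtype="category", name="letters")
convert_pandas_type_to_json_field(s)
# {'name': 'letters', 'type': 'any',
#  'constraints': {'enum': ['a', 'b']}, 'ordered': False}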
def convert_json_field_to_pandas_type(field):
    """
    Converts a JSON field descriptor into its corresponding NumPy / pandas type

    Parameters
    ----------
    field
        A JSON field descriptor

    Returns
    -------
    dtype

    Raises
    ------
    ValueError
        If the type of the provided field is unknown or currently unsupported

    Examples
    --------
    >>> convert_json_field_to_pandas_type({'name': 'an_int',
    ...                                    'type': 'integer'})
    'int64'
    >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
    ...                                    'type': 'any',
    ...                                    'constraints': {'enum': [
    ...                                        'a', 'b', 'c']},
    ...                                    'ordered': True})
    'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
    ...                                    'type': 'datetime'})
    'datetime64[ns]'
    >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
    ...                                    'type': 'datetime',
    ...                                    'tz': 'US/Central'})
    'datetime64[ns, US/Central]'
    """
    typ = field["type"]
    if typ == "string":
        return "object"
    elif typ == "integer":
        return "int64"
    elif typ == "number":
        return "float64"
    elif typ == "boolean":
        return "bool"
    elif typ == "duration":
        return "timedelta64"
    elif typ == "datetime":
        if field.get("tz"):
            return "datetime64[ns, {tz}]".format(tz=field["tz"])
        else:
            return "datetime64[ns]"
    elif typ == "any":
        if "constraints" in field and "ordered" in field:
            return CategoricalDtype(
                categories=field["constraints"]["enum"], ordered=field["ordered"]
            )
        else:
            return "object"

    raise ValueError("Unsupported or invalid field type: {}".format(typ))
def build_table_schema(data, index=True, primary_key=None, version=True):
    """
    Create a Table schema from ``data``.

    Parameters
    ----------
    data : Series, DataFrame
    index : bool, default True
        Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
        The default `None` will set `'primaryKey'` to the index
        level or levels if the index is unique.
    version : bool, default True
        Whether to include a field `pandas_version` with the version
        of pandas that generated the schema.

    Returns
    -------
    schema : dict

    Notes
    -----
    See `as_json_table_type` for conversion types.
    Timedeltas are converted to ISO8601 duration format with
    9 decimal places after the seconds field for nanosecond precision.

    Categoricals are converted to the `any` dtype, and use the `enum` field
    constraint to list the allowed values. The `ordered` attribute is included
    in an `ordered` field.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {'A': [1, 2, 3],
    ...      'B': ['a', 'b', 'c'],
    ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
    ...      }, index=pd.Index(range(3), name='idx'))
    >>> build_table_schema(df)
    {'fields': [{'name': 'idx', 'type': 'integer'},
    {'name': 'A', 'type': 'integer'},
    {'name': 'B', 'type': 'string'},
    {'name': 'C', 'type': 'datetime'}],
    'pandas_version': '0.20.0',
    'primaryKey': ['idx']}
    """
    if index is True:
        data = set_default_names(data)

    schema = {}
    fields = []

    if index:
        if data.index.nlevels > 1:
            for level in data.index.levels:
                fields.append(convert_pandas_type_to_json_field(level))
        else:
            fields.append(convert_pandas_type_to_json_field(data.index))

    if data.ndim > 1:
        for column, s in data.items():
            fields.append(convert_pandas_type_to_json_field(s))
    else:
        fields.append(convert_pandas_type_to_json_field(data))

    schema["fields"] = fields
    if index and data.index.is_unique and primary_key is None:
        if data.index.nlevels == 1:
            schema["primaryKey"] = [data.index.name]
        else:
            schema["primaryKey"] = data.index.names
    elif primary_key is not None:
        schema["primaryKey"] = primary_key

    if version:
        schema["pandas_version"] = "0.20.0"
    return schema
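In practice this schema is what DataFrame.to_json(orient='table') embeds next to the data; a brief sketch:

import pandas as pd
df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
payload = df.to_json(orient="table")  # embeds build_table_schema(df) output
# payload looks like '{"schema": {...}, "data": [...]}'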
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json : str
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values, as
        dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame` to ``None`` when said string is encountered with a
    normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
    applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # Cannot directly use astype with timezone data on object; raise for now
    if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()):
        raise NotImplementedError('table="orient" can not yet read timezone data')

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data'
        )

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x for x in df.index.names
            ]

    return df
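Round-trip sketch: read_json(..., orient='table') routes through this parser, assuming no timezone or timedelta columns (which raise NotImplementedError above):

import pandas as pd
df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
payload = df.to_json(orient="table")
assert pd.read_json(payload, orient="table").equals(df)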