8th day of Python challenges: 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions

View File

@@ -0,0 +1,263 @@
# flake8: noqa
__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
hard_dependencies = ("numpy", "pytz", "dateutil")
missing_dependencies = []
for dependency in hard_dependencies:
try:
__import__(dependency)
except ImportError as e:
missing_dependencies.append("{0}: {1}".format(dependency, str(e)))
if missing_dependencies:
raise ImportError(
"Unable to import required dependencies:\n" + "\n".join(missing_dependencies)
)
del hard_dependencies, dependency, missing_dependencies
# numpy compat
from pandas.compat.numpy import (
_np_version_under1p14,
_np_version_under1p15,
_np_version_under1p16,
_np_version_under1p17,
)
try:
from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
except ImportError as e: # pragma: no cover
# hack: parse the module name out of the message (using re would be overkill)
module = str(e).replace("cannot import name ", "")
raise ImportError(
"C extension: {0} not built. If you want to import "
"pandas from the source directory, you may need to run "
"'python setup.py build_ext --inplace --force' to build "
"the C extensions first.".format(module)
)
from datetime import datetime
from pandas._config import (
get_option,
set_option,
reset_option,
describe_option,
option_context,
options,
)
# let init-time option registration happen
import pandas.core.config_init
from pandas.core.api import (
# dtype
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
CategoricalDtype,
PeriodDtype,
IntervalDtype,
DatetimeTZDtype,
# missing
isna,
isnull,
notna,
notnull,
# indexes
Index,
CategoricalIndex,
Int64Index,
UInt64Index,
RangeIndex,
Float64Index,
MultiIndex,
IntervalIndex,
TimedeltaIndex,
DatetimeIndex,
PeriodIndex,
IndexSlice,
# tseries
NaT,
Period,
period_range,
Timedelta,
timedelta_range,
Timestamp,
date_range,
bdate_range,
Interval,
interval_range,
DateOffset,
# conversion
to_numeric,
to_datetime,
to_timedelta,
# misc
np,
Grouper,
factorize,
unique,
value_counts,
NamedAgg,
array,
Categorical,
set_eng_float_format,
Series,
DataFrame,
)
from pandas.core.sparse.api import (
SparseArray,
SparseDataFrame,
SparseSeries,
SparseDtype,
)
from pandas.tseries.api import infer_freq
from pandas.tseries import offsets
from pandas.core.computation.api import eval
from pandas.core.reshape.api import (
concat,
lreshape,
melt,
wide_to_long,
merge,
merge_asof,
merge_ordered,
crosstab,
pivot,
pivot_table,
get_dummies,
cut,
qcut,
)
from pandas.util._print_versions import show_versions
from pandas.io.api import (
# excel
ExcelFile,
ExcelWriter,
read_excel,
# packers
read_msgpack,
to_msgpack,
# parsers
read_csv,
read_fwf,
read_table,
# pickle
read_pickle,
to_pickle,
# pytables
HDFStore,
read_hdf,
# sql
read_sql,
read_sql_query,
read_sql_table,
# misc
read_clipboard,
read_parquet,
read_feather,
read_gbq,
read_html,
read_json,
read_stata,
read_sas,
read_spss,
)
from pandas.util._tester import test
import pandas.testing
import pandas.arrays
# use the closest tagged version if possible
from ._version import get_versions
v = get_versions()
__version__ = v.get("closest-tag", v["version"])
__git_version__ = v.get("full-revisionid")
del get_versions, v
# GH 27101
# TODO: remove Panel compat in 1.0
if pandas.compat.PY37:
def __getattr__(name):
if name == "Panel":
import warnings
warnings.warn(
"The Panel class is removed from pandas. Accessing it "
"from the top-level namespace will also be removed in "
"the next version",
FutureWarning,
stacklevel=2,
)
class Panel:
pass
return Panel
raise AttributeError("module 'pandas' has no attribute '{}'".format(name))
else:
class Panel:
pass
# module level doc-string
__doc__ = """
pandas - a powerful data analysis and manipulation library for Python
=====================================================================
**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.
Main Features
-------------
Here are just a few of the things that pandas does well:
- Easy handling of missing data in floating point as well as non-floating
point data.
- Size mutability: columns can be inserted and deleted from DataFrame and
higher dimensional objects
- Automatic and explicit data alignment: objects can be explicitly aligned
to a set of labels, or the user can simply ignore the labels and let
`Series`, `DataFrame`, etc. automatically align the data for you in
computations.
- Powerful, flexible group by functionality to perform split-apply-combine
operations on data sets, for both aggregating and transforming data.
- Make it easy to convert ragged, differently-indexed data in other Python
and NumPy data structures into DataFrame objects.
- Intelligent label-based slicing, fancy indexing, and subsetting of large
data sets.
- Intuitive merging and joining data sets.
- Flexible reshaping and pivoting of data sets.
- Hierarchical labeling of axes (possible to have multiple labels per tick).
- Robust IO tools for loading data from flat files (CSV and delimited),
Excel files, databases, and saving/loading data from the ultrafast HDF5
format.
- Time series-specific functionality: date range generation and frequency
conversion, moving window statistics, moving window linear regressions,
date shifting and lagging, etc.
"""

View File

@@ -0,0 +1,28 @@
"""
pandas._config is considered explicitly upstream of everything else in pandas,
and should have no intra-pandas dependencies.
Importing `dates` and `display` ensures that keys needed by _libs
are initialized.
"""
__all__ = [
"config",
"detect_console_encoding",
"get_option",
"set_option",
"reset_option",
"describe_option",
"option_context",
"options",
]
from pandas._config import config
from pandas._config import dates # noqa:F401
from pandas._config.config import (
describe_option,
get_option,
option_context,
options,
reset_option,
set_option,
)
from pandas._config.display import detect_console_encoding

View File

@@ -0,0 +1,847 @@
"""
The config module holds package-wide configurables and provides
a uniform API for working with them.
Overview
========
This module supports the following requirements:
- options are referenced using keys in dot.notation, e.g. "x.y.option-z".
- keys are case-insensitive.
- functions should accept partial/regex keys, when unambiguous.
- options can be registered by modules at import time.
- options can be registered at init-time (via core.config_init)
- options have a default value, and (optionally) a description and
validation function associated with them.
- options can be deprecated, in which case referencing them
should produce a warning.
- deprecated options can optionally be rerouted to a replacement
so that accessing a deprecated option reroutes to a differently
named option.
- options can be reset to their default value.
- all options can be reset to their default value at once.
- all options in a certain sub-namespace can be reset at once.
- the user can set / get / reset or ask for the description of an option.
- a developer can register and mark an option as deprecated.
- you can register a callback to be invoked when the option value
is set or reset. Changing the stored value is considered misuse, but
is not verboten.
Implementation
==============
- Data is stored using nested dictionaries, and should be accessed
through the provided API.
- "Registered options" and "Deprecated options" have metadata associated
with them, which are stored in auxiliary dictionaries keyed on the
fully-qualified key, e.g. "x.y.z.option".
- the config_init module is imported by the package's __init__.py file.
Placing any register_option() calls there will ensure those options
are available as soon as pandas is loaded. If you use register_option
in a module, it will only be available after that module is imported,
which you should be aware of.
- `config_prefix` is a context manager (for use with the `with` keyword)
which can save developers some typing, see the docstring.
"""
from collections import namedtuple
from contextlib import contextmanager
import re
from typing import Dict, List
import warnings
DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver")
RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb")
# holds deprecated option metadata
_deprecated_options = {} # type: Dict[str, DeprecatedOption]
# holds registered option metadata
_registered_options = {} # type: Dict[str, RegisteredOption]
# holds the current values for registered options
_global_config = {} # type: Dict[str, str]
# keys which have a special meaning
_reserved_keys = ["all"] # type: List[str]
class OptionError(AttributeError, KeyError):
"""Exception for pandas.options, backwards compatible with KeyError
checks
"""
#
# User API
def _get_single_key(pat, silent):
keys = _select_options(pat)
if len(keys) == 0:
if not silent:
_warn_if_deprecated(pat)
raise OptionError("No such keys(s): {pat!r}".format(pat=pat))
if len(keys) > 1:
raise OptionError("Pattern matched multiple keys")
key = keys[0]
if not silent:
_warn_if_deprecated(key)
key = _translate_key(key)
return key
def _get_option(pat, silent=False):
key = _get_single_key(pat, silent)
# walk the nested dict
root, k = _get_root(key)
return root[k]
def _set_option(*args, **kwargs):
# must have at least 1 arg; deal with constraints later
nargs = len(args)
if not nargs or nargs % 2 != 0:
raise ValueError("Must provide an even number of non-keyword " "arguments")
# default to false
silent = kwargs.pop("silent", False)
if kwargs:
msg = '_set_option() got an unexpected keyword argument "{kwarg}"'
raise TypeError(msg.format(kwarg=list(kwargs.keys())[0]))
for k, v in zip(args[::2], args[1::2]):
key = _get_single_key(k, silent)
o = _get_registered_option(key)
if o and o.validator:
o.validator(v)
# walk the nested dict
root, k = _get_root(key)
root[k] = v
if o.cb:
if silent:
with warnings.catch_warnings(record=True):
o.cb(key)
else:
o.cb(key)
def _describe_option(pat="", _print_desc=True):
keys = _select_options(pat)
if len(keys) == 0:
raise OptionError("No such keys(s)")
s = ""
for k in keys: # filter by pat
s += _build_option_description(k)
if _print_desc:
print(s)
else:
return s
def _reset_option(pat, silent=False):
keys = _select_options(pat)
if len(keys) == 0:
raise OptionError("No such keys(s)")
if len(keys) > 1 and len(pat) < 4 and pat != "all":
raise ValueError(
"You must specify at least 4 characters when "
"resetting multiple keys, use the special keyword "
'"all" to reset all the options to their default '
"value"
)
for k in keys:
_set_option(k, _registered_options[k].defval, silent=silent)
def get_default_val(pat):
key = _get_single_key(pat, silent=True)
return _get_registered_option(key).defval
class DictWrapper:
""" provide attribute-style access to a nested dict"""
def __init__(self, d, prefix=""):
object.__setattr__(self, "d", d)
object.__setattr__(self, "prefix", prefix)
def __setattr__(self, key, val):
prefix = object.__getattribute__(self, "prefix")
if prefix:
prefix += "."
prefix += key
# you can't set new keys
# and you can't overwrite subtrees
if key in self.d and not isinstance(self.d[key], dict):
_set_option(prefix, val)
else:
raise OptionError("You can only set the value of existing options")
def __getattr__(self, key):
prefix = object.__getattribute__(self, "prefix")
if prefix:
prefix += "."
prefix += key
try:
v = object.__getattribute__(self, "d")[key]
except KeyError:
raise OptionError("No such option")
if isinstance(v, dict):
return DictWrapper(v, prefix)
else:
return _get_option(prefix)
def __dir__(self):
return list(self.d.keys())
# For user convenience, we'd like to have the available options described
# in the docstring. For dev convenience we'd like to generate the docstrings
# dynamically instead of maintaining them by hand. To this end, we use the
# class below which wraps functions inside a callable, and converts
# __doc__ into a property function. The docstrings below are templates
# using the py2.6+ advanced formatting syntax to plug in a concise list
# of options, and option descriptions.
class CallableDynamicDoc:
def __init__(self, func, doc_tmpl):
self.__doc_tmpl__ = doc_tmpl
self.__func__ = func
def __call__(self, *args, **kwds):
return self.__func__(*args, **kwds)
@property
def __doc__(self):
opts_desc = _describe_option("all", _print_desc=False)
opts_list = pp_options_list(list(_registered_options.keys()))
return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)
_get_option_tmpl = """
get_option(pat)
Retrieves the value of the specified option.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp which should match a single option.
Note: partial matches are supported for convenience, but unless you use the
full option name (e.g. x.y.z.option_name), your code may break in future
versions if new options with similar names are introduced.
Returns
-------
result : the value of the option
Raises
------
OptionError : if no such option exists
Notes
-----
The available options with their descriptions:
{opts_desc}
"""
_set_option_tmpl = """
set_option(pat, value)
Sets the value of the specified option.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp which should match a single option.
Note: partial matches are supported for convenience, but unless you use the
full option name (e.g. x.y.z.option_name), your code may break in future
versions if new options with similar names are introduced.
value : object
New value of option.
Returns
-------
None
Raises
------
OptionError if no such option exists
Notes
-----
The available options with their descriptions:
{opts_desc}
"""
_describe_option_tmpl = """
describe_option(pat, _print_desc=True)
Prints the description for one or more registered options.
Call with no arguments to get a listing for all registered options.
Available options:
{opts_list}
Parameters
----------
pat : str
Regexp pattern. All matching keys will have their description displayed.
_print_desc : bool, default True
If True (default) the description(s) will be printed to stdout.
Otherwise, the description(s) will be returned as a unicode string
(for testing).
Returns
-------
None by default; the description(s) as a unicode string if _print_desc
is False.
Notes
-----
The available options with their descriptions:
{opts_desc}
"""
_reset_option_tmpl = """
reset_option(pat)
Reset one or more options to their default value.
Pass "all" as argument to reset all options.
Available options:
{opts_list}
Parameters
----------
pat : str/regex
If specified, only options matching `prefix*` will be reset.
Note: partial matches are supported for convenience, but unless you
use the full option name (e.g. x.y.z.option_name), your code may break
in future versions if new options with similar names are introduced.
Returns
-------
None
Notes
-----
The available options with their descriptions:
{opts_desc}
"""
# bind the functions with their docstrings into a Callable
# and use that as the functions exposed in pd.api
get_option = CallableDynamicDoc(_get_option, _get_option_tmpl)
set_option = CallableDynamicDoc(_set_option, _set_option_tmpl)
reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl)
describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl)
options = DictWrapper(_global_config)
#
# Functions for use by pandas developers, in addition to the user API
class option_context:
"""
Context manager to temporarily set options in the `with` statement context.
You need to invoke as ``option_context(pat, val, [(pat, val), ...])``.
Examples
--------
>>> with option_context('display.max_rows', 10, 'display.max_columns', 5):
... ...
"""
def __init__(self, *args):
if not (len(args) % 2 == 0 and len(args) >= 2):
raise ValueError(
"Need to invoke as" " option_context(pat, val, [(pat, val), ...])."
)
self.ops = list(zip(args[::2], args[1::2]))
def __enter__(self):
self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops]
for pat, val in self.ops:
_set_option(pat, val, silent=True)
def __exit__(self, *args):
if self.undo:
for pat, val in self.undo:
_set_option(pat, val, silent=True)
def register_option(key, defval, doc="", validator=None, cb=None):
"""Register an option in the package-wide pandas config object
Parameters
----------
key - a fully-qualified key, e.g. "x.y.option-z".
defval - the default value of the option
doc - a string description of the option
validator - a function of a single argument, should raise `ValueError` if
called with a value which is not a legal value for the option.
cb - a function of a single argument "key", which is called
immediately after an option value is set/reset. key is
the full name of the option.
Returns
-------
Nothing.
Raises
------
ValueError if `validator` is specified and `defval` is not a valid value.
"""
import tokenize
import keyword
key = key.lower()
if key in _registered_options:
msg = "Option '{key}' has already been registered"
raise OptionError(msg.format(key=key))
if key in _reserved_keys:
msg = "Option '{key}' is a reserved key"
raise OptionError(msg.format(key=key))
# the default value should be legal
if validator:
validator(defval)
# walk the nested dict, creating dicts as needed along the path
path = key.split(".")
for k in path:
if not bool(re.match("^" + tokenize.Name + "$", k)):
raise ValueError("{k} is not a valid identifier".format(k=k))
if keyword.iskeyword(k):
raise ValueError("{k} is a python keyword".format(k=k))
cursor = _global_config
msg = "Path prefix to option '{option}' is already an option"
for i, p in enumerate(path[:-1]):
if not isinstance(cursor, dict):
raise OptionError(msg.format(option=".".join(path[:i])))
if p not in cursor:
cursor[p] = {}
cursor = cursor[p]
if not isinstance(cursor, dict):
raise OptionError(msg.format(option=".".join(path[:-1])))
cursor[path[-1]] = defval # initialize
# save the option metadata
_registered_options[key] = RegisteredOption(
key=key, defval=defval, doc=doc, validator=validator, cb=cb
)
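# A hedged usage sketch of register_option; the "display.sketch.color" key is
# hypothetical, invented here for illustration only:
#
#   >>> register_option(
#   ...     "display.sketch.color", "red", doc="example option",
#   ...     validator=is_str,                # defined at the bottom of this module
#   ...     cb=lambda key: print(key, "was set"),
#   ... )
#   >>> _set_option("display.sketch.color", "blue")  # validated, then cb fires
#   display.sketch.color was set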
def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
"""
Mark option `key` as deprecated. If code attempts to access this option,
a warning will be produced, using `msg` if given, or a default message
if not.
If `rkey` is given, any access to the key will be re-routed to `rkey`.
Neither the existence of `key` nor that of `rkey` is checked. If they
do not exist, any subsequent access will fail as usual, after the
deprecation warning is given.
Parameters
----------
key - the name of the option to be deprecated. Must be a fully-qualified
option name (e.g. "x.y.z.rkey").
msg - (Optional) a warning message to output when the key is referenced.
If no message is given, a default message will be emitted.
rkey - (Optional) the name of an option to reroute access to.
If specified, any reference to `key` will be re-routed to `rkey`,
including set/get/reset.
rkey must be a fully-qualified option name (e.g. "x.y.z.rkey").
Used by the default message if no `msg` is specified.
removal_ver - (Optional) specifies the version in which this option will
be removed. Used by the default message if no `msg`
is specified.
Returns
-------
Nothing
Raises
------
OptionError - if key has already been deprecated.
"""
key = key.lower()
if key in _deprecated_options:
msg = "Option '{key}' has already been defined as deprecated."
raise OptionError(msg.format(key=key))
_deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
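# A hedged sketch of rerouting, again with hypothetical keys: once deprecated,
# get/set on the old name are redirected to `rkey`, warning on each access:
#
#   >>> register_option("display.sketch.new_key", 10)
#   >>> deprecate_option("display.sketch.old_key",
#   ...                  rkey="display.sketch.new_key", removal_ver="1.0")
#   >>> _get_option("display.sketch.old_key")   # FutureWarning, then reads rkey
#   10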
#
# functions internal to the module
def _select_options(pat):
"""returns a list of keys matching `pat`
if pat=="all", returns all registered options
"""
# short-circuit for exact key
if pat in _registered_options:
return [pat]
# else look through all of them
keys = sorted(_registered_options.keys())
if pat == "all": # reserved key
return keys
return [k for k in keys if re.search(pat, k, re.I)]
def _get_root(key):
path = key.split(".")
cursor = _global_config
for p in path[:-1]:
cursor = cursor[p]
return cursor, path[-1]
def _is_deprecated(key):
""" Returns True if the given option has been deprecated """
key = key.lower()
return key in _deprecated_options
def _get_deprecated_option(key):
"""
Retrieves the metadata for a deprecated option, if `key` is deprecated.
Returns
-------
DeprecatedOption (namedtuple) if key is deprecated, None otherwise
"""
try:
d = _deprecated_options[key]
except KeyError:
return None
else:
return d
def _get_registered_option(key):
"""
Retrieves the option metadata if `key` is a registered option.
Returns
-------
RegisteredOption (namedtuple) if key is a registered option, None otherwise
"""
return _registered_options.get(key)
def _translate_key(key):
"""
If key is deprecated and a replacement key is defined, returns the
replacement key; otherwise returns `key` as-is.
"""
d = _get_deprecated_option(key)
if d:
return d.rkey or key
else:
return key
def _warn_if_deprecated(key):
"""
Checks if `key` is a deprecated option and if so, prints a warning.
Returns
-------
bool - True if `key` is deprecated, False otherwise.
"""
d = _get_deprecated_option(key)
if d:
if d.msg:
print(d.msg)
warnings.warn(d.msg, FutureWarning)
else:
msg = "'{key}' is deprecated".format(key=key)
if d.removal_ver:
msg += " and will be removed in {version}".format(version=d.removal_ver)
if d.rkey:
msg += ", please use '{rkey}' instead.".format(rkey=d.rkey)
else:
msg += ", please refrain from using it."
warnings.warn(msg, FutureWarning)
return True
return False
def _build_option_description(k):
""" Builds a formatted description of a registered option and prints it """
o = _get_registered_option(k)
d = _get_deprecated_option(k)
s = "{k} ".format(k=k)
if o.doc:
s += "\n".join(o.doc.strip().split("\n"))
else:
s += "No description available."
if o:
s += "\n [default: {default}] [currently: {current}]".format(
default=o.defval, current=_get_option(k, True)
)
if d:
s += "\n (Deprecated"
s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "")
s += ")"
return s
def pp_options_list(keys, width=80, _print=False):
""" Builds a concise listing of available options, grouped by prefix """
from textwrap import wrap
from itertools import groupby
def pp(name, ks):
pfx = "- " + name + ".[" if name else ""
ls = wrap(
", ".join(ks),
width,
initial_indent=pfx,
subsequent_indent=" ",
break_long_words=False,
)
if ls and ls[-1] and name:
ls[-1] = ls[-1] + "]"
return ls
ls = []
singles = [x for x in sorted(keys) if x.find(".") < 0]
if singles:
ls += pp("", singles)
keys = [x for x in keys if x.find(".") >= 0]
for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]):
ks = [x[len(k) + 1 :] for x in list(g)]
ls += pp(k, ks)
s = "\n".join(ls)
if _print:
print(s)
else:
return s
#
# helpers
@contextmanager
def config_prefix(prefix):
"""contextmanager for multiple invocations of API with a common prefix
supported API functions: (register / get / set)_option
Warning: This is not thread-safe, and won't work properly if you import
the API functions into your module using the "from x import y" construct.
Example:
import pandas._config.config as cf
with cf.config_prefix("display.font"):
cf.register_option("color", "red")
cf.register_option("size", " 5 pt")
cf.set_option("size", " 6 pt")
cf.get_option("size")
...
will register options "display.font.color", "display.font.size", set the
value of "display.font.size"... and so on.
"""
# Note: reset_option relies on set_option, and on key directly
# it does not fit in to this monkey-patching scheme
global register_option, get_option, set_option, reset_option
def wrap(func):
def inner(key, *args, **kwds):
pkey = "{prefix}.{key}".format(prefix=prefix, key=key)
return func(pkey, *args, **kwds)
return inner
_register_option = register_option
_get_option = get_option
_set_option = set_option
set_option = wrap(set_option)
get_option = wrap(get_option)
register_option = wrap(register_option)
yield None
set_option = _set_option
get_option = _get_option
register_option = _register_option
# These factories and methods are handy for use as the validator
# arg in register_option
def is_type_factory(_type):
"""
Parameters
----------
`_type` - a type to be compared against (e.g. type(x) == `_type`)
Returns
-------
validator - a function of a single argument x, which raises
ValueError if type(x) is not equal to `_type`
"""
def inner(x):
if type(x) != _type:
msg = "Value must have type '{typ!s}'"
raise ValueError(msg.format(typ=_type))
return inner
def is_instance_factory(_type):
"""
Parameters
----------
`_type` - the type to be checked against
Returns
-------
validator - a function of a single argument x, which raises
ValueError if x is not an instance of `_type`
"""
if isinstance(_type, (tuple, list)):
_type = tuple(_type)
type_repr = "|".join(map(str, _type))
else:
type_repr = "'{typ}'".format(typ=_type)
def inner(x):
if not isinstance(x, _type):
msg = "Value must be an instance of {type_repr}"
raise ValueError(msg.format(type_repr=type_repr))
return inner
def is_one_of_factory(legal_values):
callables = [c for c in legal_values if callable(c)]
legal_values = [c for c in legal_values if not callable(c)]
def inner(x):
if x not in legal_values:
if not any(c(x) for c in callables):
uvals = [str(lval) for lval in legal_values]
pp_values = "|".join(uvals)
msg = "Value must be one of {pp_values}"
if len(callables):
msg += " or a callable"
raise ValueError(msg.format(pp_values=pp_values))
return inner
# common type validators, for convenience
# usage: register_option(... , validator = is_int)
is_int = is_type_factory(int)
is_bool = is_type_factory(bool)
is_float = is_type_factory(float)
is_str = is_type_factory(str)
is_text = is_instance_factory((str, bytes))
def is_callable(obj):
"""
Parameters
----------
`obj` - the object to be checked
Returns
-------
validator - returns True if object is callable
raises ValueError otherwise.
"""
if not callable(obj):
raise ValueError("Value must be a callable")
return True
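To close out the module, here is a small, self-contained sketch of how the pieces above combine: a hypothetical option registered under a prefix and guarded by one of the convenience validators (the "display.sketch" keys are invented for illustration):

from pandas._config import config as cf

# Register a hypothetical option; is_int rejects non-int defaults and values.
with cf.config_prefix("display.sketch"):
    cf.register_option("max_items", 25, doc="example option", validator=cf.is_int)

cf.set_option("display.sketch.max_items", 50)      # full key outside the prefix
assert cf.get_option("display.sketch.max_items") == 50
assert cf.options.display.sketch.max_items == 50   # attribute-style access
cf.reset_option("display.sketch.max_items")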

View File

@@ -0,0 +1,23 @@
"""
config for datetime formatting
"""
from pandas._config import config as cf
pc_date_dayfirst_doc = """
: boolean
When True, prints and parses dates with the day first, e.g. 20/01/2005
"""
pc_date_yearfirst_doc = """
: boolean
When True, prints and parses dates with the year first, e.g. 2005/01/20
"""
with cf.config_prefix("display"):
# Needed upstream of `_libs` because these are used in tslibs.parsing
cf.register_option(
"date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
)
cf.register_option(
"date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
)

View File

@@ -0,0 +1,56 @@
"""
Unopinionated display configuration.
"""
import locale
import sys
from pandas._config import config as cf
# -----------------------------------------------------------------------------
# Global formatting options
_initial_defencoding = None
def detect_console_encoding():
"""
Try to find the most capable encoding supported by the console.
Slightly modified from the way IPython handles the same issue.
"""
global _initial_defencoding
encoding = None
try:
encoding = sys.stdout.encoding or sys.stdin.encoding
except (AttributeError, IOError):
pass
# try again for something better
if not encoding or "ascii" in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except Exception:
pass
# when all else fails, this will usually be "ascii"
if not encoding or "ascii" in encoding.lower():
encoding = sys.getdefaultencoding()
# GH#3360, save the reported defencoding at import time
# MPL backends may change it. Make available for debugging.
if not _initial_defencoding:
_initial_defencoding = sys.getdefaultencoding()
return encoding
pc_encoding_doc = """
: str/unicode
Defaults to the detected encoding of the console.
Specifies the encoding to be used for strings returned by to_string;
these are generally strings meant to be displayed on the console.
"""
with cf.config_prefix("display"):
cf.register_option(
"encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
)

View File

@@ -0,0 +1,162 @@
"""
Helpers for configuring locale settings.
Name `localization` is chosen to avoid overlap with builtin `locale` module.
"""
from contextlib import contextmanager
import locale
import re
import subprocess
from pandas._config.config import options
@contextmanager
def set_locale(new_locale, lc_var=locale.LC_ALL):
"""
Context manager for temporarily setting a locale.
Parameters
----------
new_locale : str or tuple
A string of the form <language_country>.<encoding>. For example, to set
the current locale to US English with a UTF8 encoding, you would pass
"en_US.UTF-8".
lc_var : int, default `locale.LC_ALL`
The category of the locale being set.
Notes
-----
This is useful when you want to run a particular block of code under a
particular locale, without globally setting the locale. This probably isn't
thread-safe.
"""
current_locale = locale.getlocale()
try:
locale.setlocale(lc_var, new_locale)
normalized_locale = locale.getlocale()
if all(x is not None for x in normalized_locale):
yield ".".join(normalized_locale)
else:
yield new_locale
finally:
locale.setlocale(lc_var, current_locale)
def can_set_locale(lc, lc_var=locale.LC_ALL):
"""
Check to see if we can set a locale, and subsequently get the locale,
without raising an Exception.
Parameters
----------
lc : str
The locale to attempt to set.
lc_var : int, default `locale.LC_ALL`
The category of the locale being set.
Returns
-------
is_valid : bool
Whether the passed locale can be set
"""
try:
with set_locale(lc, lc_var=lc_var):
pass
except (ValueError, locale.Error):
# horrible name for an Exception subclass
return False
else:
return True
def _valid_locales(locales, normalize):
"""
Return a list of normalized locales that do not throw an ``Exception``
when set.
Parameters
----------
locales : str
A string where each locale is separated by a newline.
normalize : bool
Whether to call ``locale.normalize`` on each locale.
Returns
-------
valid_locales : list
A list of valid locales.
"""
if normalize:
normalizer = lambda x: locale.normalize(x.strip())
else:
normalizer = lambda x: x.strip()
return list(filter(can_set_locale, map(normalizer, locales)))
def _default_locale_getter():
try:
raw_locales = subprocess.check_output(["locale -a"], shell=True)
except subprocess.CalledProcessError as e:
raise type(e)(
"{exception}, the 'locale -a' command cannot be found "
"on your system".format(exception=e)
)
return raw_locales
def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter):
"""
Get all the locales that are available on the system.
Parameters
----------
prefix : str
If not ``None`` then return only those locales with the prefix
provided. For example to get all English language locales (those that
start with ``"en"``), pass ``prefix="en"``.
normalize : bool
Call ``locale.normalize`` on the resulting list of available locales.
If ``True``, only locales that can be set without throwing an
``Exception`` are returned.
locale_getter : callable
The function to use to retrieve the current locales. This should return
a string with each locale separated by a newline character.
Returns
-------
locales : list of strings
A list of locale strings that can be set with ``locale.setlocale()``.
For example::
locale.setlocale(locale.LC_ALL, locale_string)
On error will return None (no locale available, e.g. Windows)
"""
try:
raw_locales = locale_getter()
except Exception:
return None
try:
# raw_locales is "\n" separated list of locales
# it may contain non-decodable parts, so split
# extract what we can and then rejoin.
raw_locales = raw_locales.split(b"\n")
out_locales = []
for x in raw_locales:
out_locales.append(str(x, encoding=options.display.encoding))
except TypeError:
pass
if prefix is None:
return _valid_locales(out_locales, normalize)
pattern = re.compile("{prefix}.*".format(prefix=prefix))
found = pattern.findall("\n".join(out_locales))
return _valid_locales(found, normalize)
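A short, hedged sketch of these helpers used together; locale availability varies by system, so the example guards on can_set_locale:

import locale

from pandas._config import localization

# Temporarily switch locale only if the platform actually provides it.
if localization.can_set_locale("en_US.UTF-8"):
    with localization.set_locale("en_US.UTF-8", lc_var=locale.LC_ALL):
        pass  # code here runs under the temporary locale
# get_locales() lists what is installed (or returns None on error, e.g. Windows).
print(localization.get_locales(prefix="en"))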

View File

@@ -0,0 +1,11 @@
# flake8: noqa
from .tslibs import (
NaT,
NaTType,
OutOfBoundsDatetime,
Period,
Timedelta,
Timestamp,
iNaT,
)

View File

@@ -0,0 +1,9 @@
# flake8: noqa
from .conversion import localize_pydatetime, normalize_date
from .nattype import NaT, NaTType, iNaT, is_null_datetimelike
from .np_datetime import OutOfBoundsDatetime
from .period import IncompatibleFrequency, Period
from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta
from .timestamps import Timestamp
from .tzconversion import tz_convert_single

View File

@@ -0,0 +1,34 @@
from pathlib import Path
from typing import IO, AnyStr, TypeVar, Union
import numpy as np
from pandas._libs import Timestamp
from pandas._libs.tslibs.period import Period
from pandas._libs.tslibs.timedeltas import Timedelta
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCExtensionArray,
ABCIndexClass,
ABCSeries,
ABCSparseSeries,
)
AnyArrayLike = TypeVar(
"AnyArrayLike",
ABCExtensionArray,
ABCIndexClass,
ABCSeries,
ABCSparseSeries,
np.ndarray,
)
ArrayLike = TypeVar("ArrayLike", ABCExtensionArray, np.ndarray)
DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta)
Dtype = Union[str, np.dtype, ExtensionDtype]
FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
FrameOrSeries = TypeVar("FrameOrSeries", ABCSeries, ABCDataFrame)
Scalar = Union[str, int, float]
Axis = Union[str, int]

View File

@@ -0,0 +1,23 @@
# This file was generated by 'versioneer.py' (0.15) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.
from warnings import catch_warnings
with catch_warnings(record=True):
import json
import sys
version_json = '''
{
"dirty": false,
"error": null,
"full-revisionid": "d1accd032b648c9affd6dce1f81feb9c99422483",
"version": "0.25.0"
}
''' # END VERSION_JSON
def get_versions():
return json.loads(version_json)

View File

@@ -0,0 +1,2 @@
""" public toolkit API """
from . import extensions, types # noqa

View File

@@ -0,0 +1,13 @@
"""Public API for extending pandas objects."""
from pandas.core.dtypes.dtypes import ( # noqa: F401
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.accessor import ( # noqa: F401
register_dataframe_accessor,
register_index_accessor,
register_series_accessor,
)
from pandas.core.algorithms import take # noqa: F401
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401

View File

@@ -0,0 +1,12 @@
""" public toolkit API """
from pandas._libs.lib import infer_dtype # noqa: F401
from pandas.core.dtypes.api import * # noqa: F403, F401
from pandas.core.dtypes.concat import union_categoricals # noqa: F401
from pandas.core.dtypes.dtypes import ( # noqa: F401
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)

View File

@@ -0,0 +1,26 @@
"""
All of pandas' ExtensionArrays.
See :ref:`extending.extension-types` for more.
"""
from pandas.core.arrays import (
Categorical,
DatetimeArray,
IntegerArray,
IntervalArray,
PandasArray,
PeriodArray,
SparseArray,
TimedeltaArray,
)
__all__ = [
"Categorical",
"DatetimeArray",
"IntegerArray",
"IntervalArray",
"PandasArray",
"PeriodArray",
"SparseArray",
"TimedeltaArray",
]

View File

@@ -0,0 +1,66 @@
"""
compat
======
Cross-compatible functions for different versions of Python.
Other items:
* platform checker
"""
import platform
import struct
import sys
PY36 = sys.version_info >= (3, 6)
PY37 = sys.version_info >= (3, 7)
PYPY = platform.python_implementation() == "PyPy"
# ----------------------------------------------------------------------------
# functions largely based / taken from the six module
# Much of the code in this module comes from Benjamin Peterson's six library.
# The license for this library can be found in LICENSES/SIX and the code can be
# found at https://bitbucket.org/gutworth/six
def set_function_name(f, name, cls):
"""
Bind the name/qualname attributes of the function
"""
f.__name__ = name
f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name)
f.__module__ = cls.__module__
return f
def raise_with_traceback(exc, traceback=Ellipsis):
"""
Raise exception with existing traceback.
If traceback is not passed, uses sys.exc_info() to get traceback.
"""
if traceback == Ellipsis:
_, _, traceback = sys.exc_info()
raise exc.with_traceback(traceback)
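# A hedged sketch of the intended use: re-raise a different exception while
# preserving the traceback of the one currently being handled:
#
#   >>> try:
#   ...     {}["missing"]
#   ... except KeyError:
#   ...     raise_with_traceback(ValueError("lookup failed"))
#   Traceback (most recent call last):
#       ...
#   ValueError: lookup failed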
# https://github.com/pandas-dev/pandas/pull/9123
def is_platform_little_endian():
""" am I little endian """
return sys.byteorder == "little"
def is_platform_windows():
return sys.platform == "win32" or sys.platform == "cygwin"
def is_platform_linux():
return sys.platform == "linux2"
def is_platform_mac():
return sys.platform == "darwin"
def is_platform_32bit():
return struct.calcsize("P") * 8 < 64

View File

@@ -0,0 +1,111 @@
import distutils.version
import importlib
import types
import warnings
# Update install.rst when updating versions!
VERSIONS = {
"bs4": "4.6.0",
"bottleneck": "1.2.1",
"fastparquet": "0.2.1",
"gcsfs": "0.2.2",
"lxml.etree": "3.8.0",
"matplotlib": "2.2.2",
"numexpr": "2.6.2",
"odfpy": "1.3.0",
"openpyxl": "2.4.8",
"pandas_gbq": "0.8.0",
"pyarrow": "0.9.0",
"pytables": "3.4.2",
"s3fs": "0.0.8",
"scipy": "0.19.0",
"sqlalchemy": "1.1.4",
"tables": "3.4.2",
"xarray": "0.8.2",
"xlrd": "1.1.0",
"xlwt": "1.2.0",
"xlsxwriter": "0.9.8",
}
message = (
"Missing optional dependency '{name}'. {extra} "
"Use pip or conda to install {name}."
)
version_message = (
"Pandas requires version '{minimum_version}' or newer of '{name}' "
"(version '{actual_version}' currently installed)."
)
def _get_version(module: types.ModuleType) -> str:
version = getattr(module, "__version__", None)
if version is None:
# xlrd uses a capitalized attribute name
version = getattr(module, "__VERSION__", None)
if version is None:
raise ImportError("Can't determine version for {}".format(module.__name__))
return version
def import_optional_dependency(
name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise"
):
"""
Import an optional dependency.
By default, if a dependency is missing, an ImportError with a nice
message will be raised. If a dependency is present but too old,
we raise.
Parameters
----------
name : str
The module name. This should be top-level only, so that the
version may be checked.
extra : str
Additional text to include in the ImportError message.
raise_on_missing : bool, default True
Whether to raise if the optional dependency is not found.
When False and the module is not present, None is returned.
on_version : str {'raise', 'warn', 'ignore'}
What to do when a dependency's version is too old.
* raise : Raise an ImportError
* warn : Warn that the version is too old. Returns None
* ignore: Return the module, even if the version is too old.
It's expected that users validate the version locally when
using ``on_version="ignore"`` (see. ``io/html.py``)
Returns
-------
maybe_module : Optional[ModuleType]
The imported module, when found and the version is correct.
None is returned when the package is not found and `raise_on_missing`
is False, or when the package's version is too old and `on_version`
is ``'warn'``.
"""
try:
module = importlib.import_module(name)
except ImportError:
if raise_on_missing:
raise ImportError(message.format(name=name, extra=extra)) from None
else:
return None
minimum_version = VERSIONS.get(name)
if minimum_version:
version = _get_version(module)
if distutils.version.LooseVersion(version) < minimum_version:
assert on_version in {"warn", "raise", "ignore"}
msg = version_message.format(
minimum_version=minimum_version, name=name, actual_version=version
)
if on_version == "warn":
warnings.warn(msg, UserWarning)
return None
elif on_version == "raise":
raise ImportError(msg)
return module
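A brief, hedged sketch of calling this helper; the exact outcome depends on what happens to be installed locally:

from pandas.compat._optional import import_optional_dependency

# Returns the module when present and new enough; with raise_on_missing=False
# it returns None (instead of raising) when the package is absent.
maybe_lxml = import_optional_dependency("lxml.etree", raise_on_missing=False)
if maybe_lxml is None:
    print("lxml not installed; falling back to another parser")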

View File

@@ -0,0 +1,23 @@
from collections import ChainMap
class DeepChainMap(ChainMap):
def __setitem__(self, key, value):
for mapping in self.maps:
if key in mapping:
mapping[key] = value
return
self.maps[0][key] = value
def __delitem__(self, key):
for mapping in self.maps:
if key in mapping:
del mapping[key]
return
raise KeyError(key)
# override because the m parameter is introduced in Python 3.4
def new_child(self, m=None):
if m is None:
m = {}
return self.__class__(m, *self.maps)
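A minimal sketch of the write-through behaviour, assuming this class is importable as pandas.compat.chainmap.DeepChainMap:

from pandas.compat.chainmap import DeepChainMap

child, parent = {"a": 1}, {"b": 2}
dcm = DeepChainMap(child, parent)
dcm["b"] = 20  # updates the mapping that already holds "b" (the parent)
dcm["c"] = 3   # unknown keys go to the first (child) mapping
del dcm["b"]   # deletes from whichever mapping holds the key
assert child == {"a": 1, "c": 3} and parent == {}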

View File

@@ -0,0 +1,74 @@
""" support numpy compatibility across versions """
from distutils.version import LooseVersion
import re
import numpy as np
# numpy versioning
_np_version = np.__version__
_nlv = LooseVersion(_np_version)
_np_version_under1p14 = _nlv < LooseVersion("1.14")
_np_version_under1p15 = _nlv < LooseVersion("1.15")
_np_version_under1p16 = _nlv < LooseVersion("1.16")
_np_version_under1p17 = _nlv < LooseVersion("1.17")
_is_numpy_dev = ".dev" in str(_nlv)
if _nlv < "1.13.3":
raise ImportError(
"this version of pandas is incompatible with "
"numpy < 1.13.3\n"
"your numpy version is {0}.\n"
"Please upgrade numpy to >= 1.13.3 to use "
"this pandas version".format(_np_version)
)
_tz_regex = re.compile("[+-]0000$")
def tz_replacer(s):
if isinstance(s, str):
if s.endswith("Z"):
s = s[:-1]
elif _tz_regex.search(s):
s = s[:-5]
return s
def np_datetime64_compat(s, *args, **kwargs):
"""
provide compat for construction of strings to numpy datetime64's with
tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation
warning, when we need to pass '2015-01-01 09:00:00'
"""
s = tz_replacer(s)
return np.datetime64(s, *args, **kwargs)
def np_array_datetime64_compat(arr, *args, **kwargs):
"""
provide compat for construction of an array of strings to a
np.array(..., dtype=np.datetime64(..))
tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation
warning, when we need to pass '2015-01-01 09:00:00'
"""
# is_list_like
if hasattr(arr, "__iter__") and not isinstance(arr, (str, bytes)):
arr = [tz_replacer(s) for s in arr]
else:
arr = tz_replacer(arr)
return np.array(arr, *args, **kwargs)
__all__ = [
"np",
"_np_version",
"_np_version_under1p14",
"_np_version_under1p15",
"_np_version_under1p16",
"_np_version_under1p17",
"_is_numpy_dev",
]
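A small, hedged sketch of the trailing-timezone stripping performed above, assuming this module is importable as pandas.compat.numpy:

import numpy as np

from pandas.compat.numpy import np_datetime64_compat

# A trailing 'Z' (or '+0000'/'-0000') suffix would trigger the NumPy >= 1.11
# deprecation warning; the compat wrapper strips it before construction.
value = np_datetime64_compat("2015-01-01 09:00:00Z")
assert value == np.datetime64("2015-01-01 09:00:00")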

View File

@@ -0,0 +1,424 @@
"""
For compatibility with numpy libraries, pandas functions or
methods have to accept '*args' and '**kwargs' parameters to
accommodate numpy arguments that are not actually used or
respected in the pandas implementation.
To ensure that users do not abuse these parameters, validation
is performed in 'validators.py' to make sure that any extra
parameters passed correspond ONLY to those in the numpy signature.
Part of that validation includes whether or not the user attempted
to pass in non-default values for these extraneous parameters. As we
want to discourage users from relying on these parameters when calling
the pandas implementation, we want them only to pass in the default values
for these parameters.
This module provides a set of commonly used default arguments for functions
and methods that are spread throughout the codebase. This module will make it
easier to adjust to future upstream changes in the analogous numpy signatures.
"""
from collections import OrderedDict
from distutils.version import LooseVersion
from typing import Any, Dict, Optional, Union
from numpy import __version__ as _np_version, ndarray
from pandas._libs.lib import is_bool, is_integer
from pandas.errors import UnsupportedFunctionCall
from pandas.util._validators import (
validate_args,
validate_args_and_kwargs,
validate_kwargs,
)
class CompatValidator:
def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None):
self.fname = fname
self.method = method
self.defaults = defaults
self.max_fname_arg_count = max_fname_arg_count
def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None):
if args or kwargs:
fname = self.fname if fname is None else fname
max_fname_arg_count = (
self.max_fname_arg_count
if max_fname_arg_count is None
else max_fname_arg_count
)
method = self.method if method is None else method
if method == "args":
validate_args(fname, args, max_fname_arg_count, self.defaults)
elif method == "kwargs":
validate_kwargs(fname, kwargs, self.defaults)
elif method == "both":
validate_args_and_kwargs(
fname, args, kwargs, max_fname_arg_count, self.defaults
)
else:
raise ValueError(
"invalid validation method " "'{method}'".format(method=method)
)
ARGMINMAX_DEFAULTS = dict(out=None)
validate_argmin = CompatValidator(
ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1
)
validate_argmax = CompatValidator(
ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1
)
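# A hedged, doctest-style sketch of how these validators behave when a pandas
# method that mirrors a numpy signature forwards its *args/**kwargs here:
#
#   >>> validate_argmin((), {"out": None})   # numpy defaults: accepted, no-op
#   >>> validate_argmin((), {"out": []})     # non-default 'out' is rejected
#   Traceback (most recent call last):
#       ...
#   ValueError: the 'out' parameter is not supported ...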
def process_skipna(skipna, args):
if isinstance(skipna, ndarray) or skipna is None:
args = (skipna,) + args
skipna = True
return skipna, args
def validate_argmin_with_skipna(skipna, args, kwargs):
"""
If 'Series.argmin' is called via the 'numpy' library,
the third parameter in its signature is 'out', which
takes either an ndarray or 'None', so check if the
'skipna' parameter is either an instance of ndarray or
is None, since 'skipna' itself should be a boolean
"""
skipna, args = process_skipna(skipna, args)
validate_argmin(args, kwargs)
return skipna
def validate_argmax_with_skipna(skipna, args, kwargs):
"""
If 'Series.argmax' is called via the 'numpy' library,
the third parameter in its signature is 'out', which
takes either an ndarray or 'None', so check if the
'skipna' parameter is either an instance of ndarray or
is None, since 'skipna' itself should be a boolean
"""
skipna, args = process_skipna(skipna, args)
validate_argmax(args, kwargs)
return skipna
ARGSORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]]
ARGSORT_DEFAULTS["axis"] = -1
ARGSORT_DEFAULTS["kind"] = "quicksort"
ARGSORT_DEFAULTS["order"] = None
if LooseVersion(_np_version) >= LooseVersion("1.17.0"):
# GH-26361. NumPy added radix sort and changed default to None.
ARGSORT_DEFAULTS["kind"] = None
validate_argsort = CompatValidator(
ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both"
)
# two different signatures of argsort; this second validation is
# for when the `kind` param is supported
ARGSORT_DEFAULTS_KIND = OrderedDict() # type: OrderedDict[str, Optional[int]]
ARGSORT_DEFAULTS_KIND["axis"] = -1
ARGSORT_DEFAULTS_KIND["order"] = None
validate_argsort_kind = CompatValidator(
ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both"
)
def validate_argsort_with_ascending(ascending, args, kwargs):
"""
If 'Categorical.argsort' is called via the 'numpy' library, the
first parameter in its signature is 'axis', which takes either
an integer or 'None', so check if the 'ascending' parameter has
either integer type or is None, since 'ascending' itself should
be a boolean
"""
if is_integer(ascending) or ascending is None:
args = (ascending,) + args
ascending = True
validate_argsort_kind(args, kwargs, max_fname_arg_count=3)
return ascending
CLIP_DEFAULTS = dict(out=None)  # type: Dict[str, Any]
validate_clip = CompatValidator(
CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3
)
def validate_clip_with_axis(axis, args, kwargs):
"""
If 'NDFrame.clip' is called via the numpy library, the third
parameter in its signature is 'out', which can take an ndarray,
so check if the 'axis' parameter is an instance of ndarray, since
'axis' itself should either be an integer or None
"""
if isinstance(axis, ndarray):
args = (axis,) + args
axis = None
validate_clip(args, kwargs)
return axis
COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any]
COMPRESS_DEFAULTS["axis"] = None
COMPRESS_DEFAULTS["out"] = None
validate_compress = CompatValidator(
COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1
)
CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any]
CUM_FUNC_DEFAULTS["dtype"] = None
CUM_FUNC_DEFAULTS["out"] = None
validate_cum_func = CompatValidator(
CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1
)
validate_cumsum = CompatValidator(
CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1
)
def validate_cum_func_with_skipna(skipna, args, kwargs, name):
"""
If this function is called via the 'numpy' library, the third
parameter in its signature is 'dtype', which takes either a
'numpy' dtype or 'None', so check if the 'skipna' parameter is
a boolean or not
"""
if not is_bool(skipna):
args = (skipna,) + args
skipna = True
validate_cum_func(args, kwargs, fname=name)
return skipna
ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]]
ALLANY_DEFAULTS["dtype"] = None
ALLANY_DEFAULTS["out"] = None
ALLANY_DEFAULTS["keepdims"] = False
validate_all = CompatValidator(
ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1
)
validate_any = CompatValidator(
ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1
)
LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False)
validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs")
MINMAX_DEFAULTS = dict(out=None, keepdims=False)
validate_min = CompatValidator(
MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1
)
validate_max = CompatValidator(
MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1
)
RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str]
validate_reshape = CompatValidator(
RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1
)
REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any]
validate_repeat = CompatValidator(
REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1
)
ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any]
validate_round = CompatValidator(
ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1
)
SORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]]
SORT_DEFAULTS["axis"] = -1
SORT_DEFAULTS["kind"] = "quicksort"
SORT_DEFAULTS["order"] = None
validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs")
STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]]
STAT_FUNC_DEFAULTS["dtype"] = None
STAT_FUNC_DEFAULTS["out"] = None
PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
SUM_DEFAULTS["keepdims"] = False
SUM_DEFAULTS["initial"] = None
MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
MEDIAN_DEFAULTS["overwrite_input"] = False
MEDIAN_DEFAULTS["keepdims"] = False
STAT_FUNC_DEFAULTS["keepdims"] = False
validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs")
validate_sum = CompatValidator(
SUM_DEFAULTS, fname="sum", method="both", max_fname_arg_count=1
)
validate_prod = CompatValidator(
PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1
)
validate_mean = CompatValidator(
STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1
)
validate_median = CompatValidator(
MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1
)
STAT_DDOF_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]]
STAT_DDOF_FUNC_DEFAULTS["dtype"] = None
STAT_DDOF_FUNC_DEFAULTS["out"] = None
STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False
validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs")
TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]]
TAKE_DEFAULTS["out"] = None
TAKE_DEFAULTS["mode"] = "raise"
validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs")
def validate_take_with_convert(convert, args, kwargs):
"""
If this function is called via the 'numpy' library, the third
parameter in its signature is 'axis', which takes either an
ndarray or 'None', so check if the 'convert' parameter is either
an instance of ndarray or is None
"""
if isinstance(convert, ndarray) or convert is None:
args = (convert,) + args
convert = True
validate_take(args, kwargs, max_fname_arg_count=3, method="both")
return convert
TRANSPOSE_DEFAULTS = dict(axes=None)
validate_transpose = CompatValidator(
TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0
)
def validate_window_func(name, args, kwargs):
numpy_args = ("axis", "dtype", "out")
msg = (
"numpy operations are not "
"valid with window objects. "
"Use .{func}() directly instead ".format(func=name)
)
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
for arg in numpy_args:
if arg in kwargs:
raise UnsupportedFunctionCall(msg)
def validate_rolling_func(name, args, kwargs):
numpy_args = ("axis", "dtype", "out")
msg = (
"numpy operations are not "
"valid with window objects. "
"Use .rolling(...).{func}() instead ".format(func=name)
)
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
for arg in numpy_args:
if arg in kwargs:
raise UnsupportedFunctionCall(msg)
def validate_expanding_func(name, args, kwargs):
numpy_args = ("axis", "dtype", "out")
msg = (
"numpy operations are not "
"valid with window objects. "
"Use .expanding(...).{func}() instead ".format(func=name)
)
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
for arg in numpy_args:
if arg in kwargs:
raise UnsupportedFunctionCall(msg)
def validate_groupby_func(name, args, kwargs, allowed=None):
"""
'args' and 'kwargs' should be empty, except for allowed kwargs, because
all of their necessary parameters are explicitly listed in the
function signature
"""
if allowed is None:
allowed = []
kwargs = set(kwargs) - set(allowed)
if len(args) + len(kwargs) > 0:
raise UnsupportedFunctionCall(
(
"numpy operations are not valid "
"with groupby. Use .groupby(...)."
"{func}() instead".format(func=name)
)
)
RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var")
def validate_resampler_func(method, args, kwargs):
"""
'args' and 'kwargs' should be empty because all of
their necessary parameters are explicitly listed in
the function signature
"""
if len(args) + len(kwargs) > 0:
if method in RESAMPLER_NUMPY_OPS:
raise UnsupportedFunctionCall(
(
"numpy operations are not valid "
"with resample. Use .resample(...)."
"{func}() instead".format(func=method)
)
)
else:
raise TypeError("too many arguments passed in")
def validate_minmax_axis(axis):
"""
Ensure that the axis argument passed to min, max, argmin, or argmax is
zero or None, as otherwise it will be incorrectly ignored.
Parameters
----------
axis : int or None
Raises
------
ValueError
"""
ndim = 1 # hard-coded for Index
if axis is None:
return
if axis >= ndim or (axis < 0 and ndim + axis < 0):
raise ValueError(
"`axis` must be fewer than the number of "
"dimensions ({ndim})".format(ndim=ndim)
)

View File

@@ -0,0 +1,221 @@
"""
Support pre-0.12 series pickle compatibility.
"""
import copy
import pickle as pkl
import sys
import pandas # noqa
from pandas import Index
def load_reduce(self):
stack = self.stack
args = stack.pop()
func = stack[-1]
if len(args) and type(args[0]) is type:
n = args[0].__name__ # noqa
try:
stack[-1] = func(*args)
return
except Exception as e:
# If we have a deprecated function,
# try to replace and try again.
msg = "_reconstruct: First argument must be a sub-type of ndarray"
if msg in str(e):
try:
cls = args[0]
stack[-1] = object.__new__(cls)
return
except TypeError:
pass
# try to re-encode the arguments
if getattr(self, "encoding", None) is not None:
args = tuple(
arg.encode(self.encoding) if isinstance(arg, str) else arg
for arg in args
)
try:
stack[-1] = func(*args)
return
except TypeError:
pass
# unknown exception, re-raise
if getattr(self, "is_verbose", None):
print(sys.exc_info())
print(func, args)
raise
# If classes are moved, provide compat here.
_class_locations_map = {
("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"),
# 15477
#
# TODO: When FrozenNDArray is removed, add
# the following lines for compat:
#
# ('pandas.core.base', 'FrozenNDArray'):
# ('numpy', 'ndarray'),
# ('pandas.core.indexes.frozen', 'FrozenNDArray'):
# ('numpy', 'ndarray'),
#
# Afterwards, remove the current entry
# for `pandas.core.base.FrozenNDArray`.
("pandas.core.base", "FrozenNDArray"): (
"pandas.core.indexes.frozen",
"FrozenNDArray",
),
("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"),
# 10890
("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"),
("pandas.sparse.series", "SparseTimeSeries"): (
"pandas.core.sparse.series",
"SparseSeries",
),
# 12588, extensions moving
("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"),
("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"),
# 18543 moving period
("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"),
("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"),
# 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype
("pandas.tslib", "__nat_unpickle"): (
"pandas._libs.tslibs.nattype",
"__nat_unpickle",
),
("pandas._libs.tslib", "__nat_unpickle"): (
"pandas._libs.tslibs.nattype",
"__nat_unpickle",
),
# 15998 top-level dirs moving
("pandas.sparse.array", "SparseArray"): (
"pandas.core.arrays.sparse",
"SparseArray",
),
("pandas.sparse.series", "SparseSeries"): (
"pandas.core.sparse.series",
"SparseSeries",
),
("pandas.sparse.frame", "SparseDataFrame"): (
"pandas.core.sparse.frame",
"SparseDataFrame",
),
("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"),
("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"),
("pandas.indexes.numeric", "Int64Index"): (
"pandas.core.indexes.numeric",
"Int64Index",
),
("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"),
("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"),
("pandas.tseries.index", "_new_DatetimeIndex"): (
"pandas.core.indexes.datetimes",
"_new_DatetimeIndex",
),
("pandas.tseries.index", "DatetimeIndex"): (
"pandas.core.indexes.datetimes",
"DatetimeIndex",
),
("pandas.tseries.period", "PeriodIndex"): (
"pandas.core.indexes.period",
"PeriodIndex",
),
# 19269, arrays moving
("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"),
# 19939, add timedeltaindex, float64index compat from 15998 move
("pandas.tseries.tdi", "TimedeltaIndex"): (
"pandas.core.indexes.timedeltas",
"TimedeltaIndex",
),
("pandas.indexes.numeric", "Float64Index"): (
"pandas.core.indexes.numeric",
"Float64Index",
),
}
# our Unpickler sub-class to override methods and some dispatcher
# functions for compat
class Unpickler(pkl._Unpickler): # type: ignore
def find_class(self, module, name):
# override superclass
key = (module, name)
module, name = _class_locations_map.get(key, key)
return super().find_class(module, name)
Unpickler.dispatch = copy.copy(Unpickler.dispatch)
Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce
def load_newobj(self):
args = self.stack.pop()
cls = self.stack[-1]
# compat
if issubclass(cls, Index):
obj = object.__new__(cls)
else:
obj = cls.__new__(cls, *args)
self.stack[-1] = obj
Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj
def load_newobj_ex(self):
kwargs = self.stack.pop()
args = self.stack.pop()
cls = self.stack.pop()
# compat
if issubclass(cls, Index):
obj = object.__new__(cls)
else:
obj = cls.__new__(cls, *args, **kwargs)
self.append(obj)
try:
Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex
except (AttributeError, KeyError):
pass
def load(fh, encoding=None, is_verbose=False):
"""load a pickle, with a provided encoding
if compat is True:
fake the old class hierarchy
if it works, then return the new type objects
Parameters
----------
fh : a filelike object
encoding : an optional encoding
is_verbose : show exception output
"""
try:
fh.seek(0)
if encoding is not None:
up = Unpickler(fh, encoding=encoding)
else:
up = Unpickler(fh)
up.is_verbose = is_verbose
return up.load()
except (ValueError, TypeError):
raise
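if __name__ == "__main__":
    # Sanity sketch, not part of pandas: find_class consults
    # _class_locations_map, so a legacy module path resolves to the
    # class's current home.
    import io

    up = Unpickler(io.BytesIO(b""))
    print(up.find_class("pandas.indexes.base", "Index"))
    # <class 'pandas.core.indexes.base.Index'>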

View File

@@ -0,0 +1,843 @@
from datetime import date, time, timedelta, timezone
from decimal import Decimal
import operator
import os
from dateutil.tz import tzlocal, tzutc
import hypothesis
from hypothesis import strategies as st
import numpy as np
import pytest
from pytz import FixedOffset, utc
import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame
from pandas.core import ops
import pandas.util.testing as tm
hypothesis.settings.register_profile(
"ci",
# Hypothesis timing checks are tuned for scalars by default, so we bump
# them from 200ms to 500ms per test case as the global default. If this
# is too short for a specific test, (a) try to make it faster, and (b)
# if it really is slow add `@settings(deadline=...)` with a working value,
# or `deadline=None` to entirely disable timeouts for that test.
deadline=500,
suppress_health_check=(hypothesis.HealthCheck.too_slow,),
)
hypothesis.settings.load_profile("ci")
def pytest_addoption(parser):
parser.addoption("--skip-slow", action="store_true", help="skip slow tests")
parser.addoption("--skip-network", action="store_true", help="skip network tests")
parser.addoption("--skip-db", action="store_true", help="skip db tests")
parser.addoption(
"--run-high-memory", action="store_true", help="run high memory tests"
)
parser.addoption("--only-slow", action="store_true", help="run only slow tests")
parser.addoption(
"--strict-data-files",
action="store_true",
help="Fail if a test is skipped for missing data file.",
)
def pytest_runtest_setup(item):
if "slow" in item.keywords and item.config.getoption("--skip-slow"):
pytest.skip("skipping due to --skip-slow")
if "slow" not in item.keywords and item.config.getoption("--only-slow"):
pytest.skip("skipping due to --only-slow")
if "network" in item.keywords and item.config.getoption("--skip-network"):
pytest.skip("skipping due to --skip-network")
if "db" in item.keywords and item.config.getoption("--skip-db"):
pytest.skip("skipping due to --skip-db")
if "high_memory" in item.keywords and not item.config.getoption(
"--run-high-memory"
):
pytest.skip("skipping high memory test since --run-high-memory was not set")
# Configurations for all tests and all test modules
@pytest.fixture(autouse=True)
def configure_tests():
pd.set_option("chained_assignment", "raise")
# For running doctests: make np and pd names available
@pytest.fixture(autouse=True)
def add_imports(doctest_namespace):
doctest_namespace["np"] = np
doctest_namespace["pd"] = pd
@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"])
def spmatrix(request):
from scipy import sparse
return getattr(sparse, request.param + "_matrix")
@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: "axis {!r}".format(x))
def axis(request):
"""
Fixture for returning the axis numbers of a DataFrame.
"""
return request.param
axis_frame = axis
@pytest.fixture(params=[0, "index"], ids=lambda x: "axis {!r}".format(x))
def axis_series(request):
"""
Fixture for returning the axis numbers of a Series.
"""
return request.param
@pytest.fixture
def ip():
"""
Get an instance of IPython.InteractiveShell.
Will raise a skip if IPython is not installed.
"""
pytest.importorskip("IPython", minversion="6.0.0")
from IPython.core.interactiveshell import InteractiveShell
return InteractiveShell()
@pytest.fixture(params=[True, False, None])
def observed(request):
""" pass in the observed keyword to groupby for [True, False]
This indicates whether categoricals should return values for
values which are not in the grouper [False / None], or only values which
appear in the grouper [True]. [None] is supported for future compatibility
if we decide to change the default (and would need to warn if this
parameter is not passed)"""
return request.param
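# Illustrative only, not part of pandas: a test that requests the
# `observed` fixture is run three times, once per parameter value.
def _example_observed_usage(observed):
    df = pd.DataFrame(
        {
            "key": pd.Categorical(["a", "a"], categories=["a", "b"]),
            "val": [1, 2],
        }
    )
    # observed=True drops the unused category "b" from the result index.
    return df.groupby("key", observed=observed)["val"].sum()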
@pytest.fixture(params=[True, False, None])
def ordered_fixture(request):
"""Boolean 'ordered' parameter for Categorical."""
return request.param
_all_arithmetic_operators = [
"__add__",
"__radd__",
"__sub__",
"__rsub__",
"__mul__",
"__rmul__",
"__floordiv__",
"__rfloordiv__",
"__truediv__",
"__rtruediv__",
"__pow__",
"__rpow__",
"__mod__",
"__rmod__",
]
@pytest.fixture(params=_all_arithmetic_operators)
def all_arithmetic_operators(request):
"""
Fixture for dunder names for common arithmetic operations
"""
return request.param
@pytest.fixture(
params=[
operator.add,
ops.radd,
operator.sub,
ops.rsub,
operator.mul,
ops.rmul,
operator.truediv,
ops.rtruediv,
operator.floordiv,
ops.rfloordiv,
operator.mod,
ops.rmod,
operator.pow,
ops.rpow,
]
)
def all_arithmetic_functions(request):
"""
Fixture for operator and roperator arithmetic functions.
Note: This includes divmod and rdivmod, whereas all_arithmetic_operators
does not.
"""
return request.param
_all_numeric_reductions = [
"sum",
"max",
"min",
"mean",
"prod",
"std",
"var",
"median",
"kurt",
"skew",
]
@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
"""
Fixture for numeric reduction names
"""
return request.param
_all_boolean_reductions = ["all", "any"]
@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
"""
Fixture for boolean reduction names
"""
return request.param
_cython_table = pd.core.base.SelectionMixin._cython_table.items()
@pytest.fixture(params=list(_cython_table))
def cython_table_items(request):
    """
    Fixture yielding (function, name) pairs from SelectionMixin._cython_table
    """
    return request.param
def _get_cython_table_params(ndframe, func_names_and_expected):
"""combine frame, functions from SelectionMixin._cython_table
keys and expected result.
Parameters
----------
ndframe : DataFrame or Series
func_names_and_expected : Sequence of two items
The first item is a name of a NDFrame method ('sum', 'prod') etc.
The second item is the expected return value
Returns
-------
results : list
List of three items (DataFrame, function, expected result)
"""
results = []
for func_name, expected in func_names_and_expected:
results.append((ndframe, func_name, expected))
results += [
(ndframe, func, expected)
for func, name in _cython_table
if name == func_name
]
return results
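if __name__ == "__main__":
    # Sketch, not part of pandas: expanding ('sum', 3) also pulls in the
    # equivalent callables (e.g. np.sum) registered in the cython table.
    for item in _get_cython_table_params(pd.Series([1, 2]), [("sum", 3)]):
        print(item)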
@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"])
def all_compare_operators(request):
"""
Fixture for dunder names for common compare operations
* >=
* >
* ==
* !=
* <
* <=
"""
return request.param
@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"])
def compare_operators_no_eq_ne(request):
"""
Fixture for dunder names for compare operations except == and !=
* >=
* >
* <
* <=
"""
return request.param
@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"])
def compression(request):
"""
Fixture for trying common compression types in compression tests
"""
return request.param
@pytest.fixture(params=["gzip", "bz2", "zip", "xz"])
def compression_only(request):
"""
Fixture for trying common compression types in compression tests excluding
uncompressed case
"""
return request.param
@pytest.fixture(params=[True, False])
def writable(request):
"""
    Fixture parametrizing whether an array is writable
"""
return request.param
@pytest.fixture(scope="module")
def datetime_tz_utc():
return timezone.utc
@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc])
def utc_fixture(request):
"""
Fixture to provide variants of UTC timezone strings and tzinfo objects
"""
return request.param
@pytest.fixture(params=["inner", "outer", "left", "right"])
def join_type(request):
"""
Fixture for trying all types of join operations
"""
return request.param
@pytest.fixture
def strict_data_files(pytestconfig):
return pytestconfig.getoption("--strict-data-files")
@pytest.fixture
def datapath(strict_data_files):
"""Get the path to a data file.
Parameters
----------
path : str
Path to the file, relative to ``pandas/tests/``
Returns
-------
path : path including ``pandas/tests``.
Raises
------
ValueError
If the path doesn't exist and the --strict-data-files option is set.
"""
BASE_PATH = os.path.join(os.path.dirname(__file__), "tests")
def deco(*args):
path = os.path.join(BASE_PATH, *args)
if not os.path.exists(path):
if strict_data_files:
msg = "Could not find file {} and --strict-data-files is set."
raise ValueError(msg.format(path))
else:
msg = "Could not find {}."
pytest.skip(msg.format(path))
return path
return deco
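# Illustrative only, not part of pandas: a test requests the fixture and
# joins path components under pandas/tests.
def _example_datapath_usage(datapath):
    return datapath("data", "iris.csv")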
@pytest.fixture
def iris(datapath):
"""The iris dataset as a DataFrame."""
return pd.read_csv(datapath("data", "iris.csv"))
@pytest.fixture(params=["nlargest", "nsmallest"])
def nselect_method(request):
"""
Fixture for trying all nselect methods
"""
return request.param
@pytest.fixture(params=["left", "right", "both", "neither"])
def closed(request):
"""
Fixture for trying all interval closed parameters
"""
return request.param
@pytest.fixture(params=["left", "right", "both", "neither"])
def other_closed(request):
"""
Secondary closed fixture to allow parametrizing over all pairs of closed
"""
return request.param
@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")])
def nulls_fixture(request):
"""
Fixture for each null type in pandas
"""
return request.param
nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture
@pytest.fixture(params=[None, np.nan, pd.NaT])
def unique_nulls_fixture(request):
"""
Fixture for each null type in pandas, each null type exactly once
"""
return request.param
# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture
TIMEZONES = [
None,
"UTC",
"US/Eastern",
"Asia/Tokyo",
"dateutil/US/Pacific",
"dateutil/Asia/Singapore",
tzutc(),
tzlocal(),
FixedOffset(300),
FixedOffset(0),
FixedOffset(-300),
timezone.utc,
timezone(timedelta(hours=1)),
timezone(timedelta(hours=-1), name="foo"),
]
TIMEZONE_IDS = [repr(i) for i in TIMEZONES]
@td.parametrize_fixture_doc(str(TIMEZONE_IDS))
@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS)
def tz_naive_fixture(request):
"""
Fixture for trying timezones including default (None): {0}
"""
return request.param
@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:]))
@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:])
def tz_aware_fixture(request):
"""
Fixture for trying explicit timezones: {0}
"""
return request.param
# Generate cartesian product of tz_aware_fixture:
tz_aware_fixture2 = tz_aware_fixture
# ----------------------------------------------------------------
# Dtypes
# ----------------------------------------------------------------
UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"]
SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"]
SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"]
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES
ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES
FLOAT_DTYPES = [float, "float32", "float64"]
COMPLEX_DTYPES = [complex, "complex64", "complex128"]
STRING_DTYPES = [str, "str", "U"]
DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"]
TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"]
BOOL_DTYPES = [bool, "bool"]
BYTES_DTYPES = [bytes, "bytes"]
OBJECT_DTYPES = [object, "object"]
ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
ALL_NUMPY_DTYPES = (
ALL_REAL_DTYPES
+ COMPLEX_DTYPES
+ STRING_DTYPES
+ DATETIME64_DTYPES
+ TIMEDELTA64_DTYPES
+ BOOL_DTYPES
+ OBJECT_DTYPES
+ BYTES_DTYPES
)
@pytest.fixture(params=STRING_DTYPES)
def string_dtype(request):
"""Parametrized fixture for string dtypes.
* str
* 'str'
* 'U'
"""
return request.param
@pytest.fixture(params=BYTES_DTYPES)
def bytes_dtype(request):
"""Parametrized fixture for bytes dtypes.
* bytes
* 'bytes'
"""
return request.param
@pytest.fixture(params=OBJECT_DTYPES)
def object_dtype(request):
"""Parametrized fixture for object dtypes.
* object
* 'object'
"""
return request.param
@pytest.fixture(params=DATETIME64_DTYPES)
def datetime64_dtype(request):
"""Parametrized fixture for datetime64 dtypes.
* 'datetime64[ns]'
* 'M8[ns]'
"""
return request.param
@pytest.fixture(params=TIMEDELTA64_DTYPES)
def timedelta64_dtype(request):
"""Parametrized fixture for timedelta64 dtypes.
* 'timedelta64[ns]'
* 'm8[ns]'
"""
return request.param
@pytest.fixture(params=FLOAT_DTYPES)
def float_dtype(request):
"""
Parameterized fixture for float dtypes.
* float
* 'float32'
* 'float64'
"""
return request.param
@pytest.fixture(params=COMPLEX_DTYPES)
def complex_dtype(request):
"""
Parameterized fixture for complex dtypes.
* complex
* 'complex64'
* 'complex128'
"""
return request.param
@pytest.fixture(params=SIGNED_INT_DTYPES)
def sint_dtype(request):
"""
Parameterized fixture for signed integer dtypes.
* int
* 'int8'
* 'int16'
* 'int32'
* 'int64'
"""
return request.param
@pytest.fixture(params=UNSIGNED_INT_DTYPES)
def uint_dtype(request):
"""
Parameterized fixture for unsigned integer dtypes.
* 'uint8'
* 'uint16'
* 'uint32'
* 'uint64'
"""
return request.param
@pytest.fixture(params=ALL_INT_DTYPES)
def any_int_dtype(request):
"""
Parameterized fixture for any integer dtype.
* int
* 'int8'
* 'uint8'
* 'int16'
* 'uint16'
* 'int32'
* 'uint32'
* 'int64'
* 'uint64'
"""
return request.param
@pytest.fixture(params=ALL_REAL_DTYPES)
def any_real_dtype(request):
"""
Parameterized fixture for any (purely) real numeric dtype.
* int
* 'int8'
* 'uint8'
* 'int16'
* 'uint16'
* 'int32'
* 'uint32'
* 'int64'
* 'uint64'
* float
* 'float32'
* 'float64'
"""
return request.param
@pytest.fixture(params=ALL_NUMPY_DTYPES)
def any_numpy_dtype(request):
"""
Parameterized fixture for all numpy dtypes.
* bool
* 'bool'
* int
* 'int8'
* 'uint8'
* 'int16'
* 'uint16'
* 'int32'
* 'uint32'
* 'int64'
* 'uint64'
* float
* 'float32'
* 'float64'
* complex
* 'complex64'
* 'complex128'
* str
* 'str'
* 'U'
* bytes
* 'bytes'
* 'datetime64[ns]'
* 'M8[ns]'
* 'timedelta64[ns]'
* 'm8[ns]'
* object
* 'object'
"""
return request.param
# categoricals are handled separately
_any_skipna_inferred_dtype = [
("string", ["a", np.nan, "c"]),
("bytes", [b"a", np.nan, b"c"]),
("empty", [np.nan, np.nan, np.nan]),
("empty", []),
("mixed-integer", ["a", np.nan, 2]),
("mixed", ["a", np.nan, 2.0]),
("floating", [1.0, np.nan, 2.0]),
("integer", [1, np.nan, 2]),
("mixed-integer-float", [1, np.nan, 2.0]),
("decimal", [Decimal(1), np.nan, Decimal(2)]),
("boolean", [True, np.nan, False]),
("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
# The following two dtypes are commented out due to GH 23554
# ('complex', [1 + 1j, np.nan, 2 + 2j]),
# ('timedelta64', [np.timedelta64(1, 'D'),
# np.nan, np.timedelta64(2, 'D')]),
("timedelta", [timedelta(1), np.nan, timedelta(2)]),
("time", [time(1), np.nan, time(2)]),
("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]),
("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]),
]
ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id
@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
"""
Fixture for all inferred dtypes from _libs.lib.infer_dtype
The covered (inferred) types are:
* 'string'
* 'empty'
* 'bytes'
* 'mixed'
* 'mixed-integer'
* 'mixed-integer-float'
* 'floating'
* 'integer'
* 'decimal'
* 'boolean'
* 'datetime64'
* 'datetime'
* 'date'
* 'timedelta'
* 'time'
* 'period'
* 'interval'
Returns
-------
inferred_dtype : str
The string for the inferred dtype from _libs.lib.infer_dtype
values : np.ndarray
An array of object dtype that will be inferred to have
`inferred_dtype`
Examples
--------
>>> import pandas._libs.lib as lib
>>>
>>> def test_something(any_skipna_inferred_dtype):
... inferred_dtype, values = any_skipna_inferred_dtype
... # will pass
... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
"""
inferred_dtype, values = request.param
values = np.array(values, dtype=object) # object dtype to avoid casting
# correctness of inference tested in tests/dtypes/test_inference.py
return inferred_dtype, values
@pytest.fixture(
params=[
getattr(pd.offsets, o)
for o in pd.offsets.__all__
if issubclass(getattr(pd.offsets, o), pd.offsets.Tick)
]
)
def tick_classes(request):
"""
Fixture for Tick based datetime offsets available for a time series.
"""
return request.param
# ----------------------------------------------------------------
# Global setup for tests using Hypothesis
# Registering these strategies makes them globally available via st.from_type,
# which is used for offsets in tests/tseries/offsets/test_offsets_properties.py
for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split():
cls = getattr(pd.tseries.offsets, name)
st.register_type_strategy(
cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans())
)
for name in "YearBegin YearEnd BYearBegin BYearEnd".split():
cls = getattr(pd.tseries.offsets, name)
st.register_type_strategy(
cls,
st.builds(
cls,
n=st.integers(-5, 5),
normalize=st.booleans(),
month=st.integers(min_value=1, max_value=12),
),
)
for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split():
cls = getattr(pd.tseries.offsets, name)
st.register_type_strategy(
cls,
st.builds(
cls,
n=st.integers(-24, 24),
normalize=st.booleans(),
startingMonth=st.integers(min_value=1, max_value=12),
),
)
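if __name__ == "__main__":
    # Sketch, not part of pandas: once registered, Hypothesis can build
    # offsets straight from the type.
    print(st.from_type(pd.tseries.offsets.MonthEnd).example())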
@pytest.fixture
def float_frame():
"""
Fixture for DataFrame of floats with index of unique strings
Columns are ['A', 'B', 'C', 'D'].
A B C D
P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465
qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901
tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433
wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651
M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938
QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053
r78Jwns6dn -0.653707 0.883127 0.682199 0.206159
... ... ... ... ...
IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316
lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999
qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121
yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962
65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987
eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871
xSucinXxuV -1.263557 0.252799 -0.552247 0.400426
[30 rows x 4 columns]
"""
return DataFrame(tm.getSeriesData())

View File

@@ -0,0 +1,307 @@
"""
accessor.py contains base classes for implementing accessor properties
that can be mixed into or pinned onto other pandas classes.
"""
from typing import Set
import warnings
from pandas.util._decorators import Appender
class DirNamesMixin:
_accessors = set() # type: Set[str]
_deprecations = frozenset(
["asobject", "base", "data", "flags", "itemsize", "strides"]
)
def _dir_deletions(self):
"""
Delete unwanted __dir__ for this object.
"""
return self._accessors | self._deprecations
def _dir_additions(self):
"""
Add additional __dir__ for this object.
"""
rv = set()
for accessor in self._accessors:
try:
getattr(self, accessor)
rv.add(accessor)
except AttributeError:
pass
return rv
def __dir__(self):
"""
Provide method name lookup and completion
Only provide 'public' methods.
"""
rv = set(dir(type(self)))
rv = (rv - self._dir_deletions()) | self._dir_additions()
return sorted(rv)
class PandasDelegate:
"""
An abstract base class for delegating methods/properties.
"""
def _delegate_property_get(self, name, *args, **kwargs):
raise TypeError("You cannot access the " "property {name}".format(name=name))
def _delegate_property_set(self, name, value, *args, **kwargs):
raise TypeError("The property {name} cannot be set".format(name=name))
def _delegate_method(self, name, *args, **kwargs):
raise TypeError("You cannot call method {name}".format(name=name))
@classmethod
def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False):
"""
Add accessors to cls from the delegate class.
Parameters
----------
cls : the class to add the methods/properties to
delegate : the class to get methods/properties & doc-strings
accessors : string list of accessors to add
typ : 'property' or 'method'
overwrite : boolean, default False
overwrite the method/property in the target class if it exists.
"""
def _create_delegator_property(name):
def _getter(self):
return self._delegate_property_get(name)
def _setter(self, new_values):
return self._delegate_property_set(name, new_values)
_getter.__name__ = name
_setter.__name__ = name
return property(
fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__
)
def _create_delegator_method(name):
def f(self, *args, **kwargs):
return self._delegate_method(name, *args, **kwargs)
f.__name__ = name
f.__doc__ = getattr(delegate, name).__doc__
return f
for name in accessors:
if typ == "property":
f = _create_delegator_property(name)
else:
f = _create_delegator_method(name)
# don't overwrite existing methods/properties
if overwrite or not hasattr(cls, name):
setattr(cls, name, f)
def delegate_names(delegate, accessors, typ, overwrite=False):
"""
Add delegated names to a class using a class decorator. This provides
an alternative usage to directly calling `_add_delegate_accessors`
below a class definition.
Parameters
----------
delegate : object
the class to get methods/properties & doc-strings
accessors : Sequence[str]
List of accessor to add
typ : {'property', 'method'}
overwrite : boolean, default False
overwrite the method/property in the target class if it exists
Returns
-------
callable
A class decorator.
Examples
--------
@delegate_names(Categorical, ["categories", "ordered"], "property")
class CategoricalAccessor(PandasDelegate):
[...]
"""
def add_delegate_accessors(cls):
cls._add_delegate_accessors(delegate, accessors, typ, overwrite=overwrite)
return cls
return add_delegate_accessors
# Ported with modifications from xarray
# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning
class CachedAccessor:
"""
Custom property-like object (descriptor) for caching accessors.
Parameters
----------
name : str
The namespace this will be accessed under, e.g. ``df.foo``
accessor : cls
The class with the extension methods. The class' __init__ method
should expect one of a ``Series``, ``DataFrame`` or ``Index`` as
the single argument ``data``
"""
def __init__(self, name, accessor):
self._name = name
self._accessor = accessor
def __get__(self, obj, cls):
if obj is None:
            # we're accessing the attribute of the class, e.g., DataFrame.geo
return self._accessor
accessor_obj = self._accessor(obj)
# Replace the property with the accessor object. Inspired by:
# http://www.pydanny.com/cached-property.html
# We need to use object.__setattr__ because we overwrite __setattr__ on
# NDFrame
object.__setattr__(obj, self._name, accessor_obj)
return accessor_obj
def _register_accessor(name, cls):
def decorator(accessor):
if hasattr(cls, name):
warnings.warn(
"registration of accessor {!r} under name {!r} for type "
"{!r} is overriding a preexisting attribute with the same "
"name.".format(accessor, name, cls),
UserWarning,
stacklevel=2,
)
setattr(cls, name, CachedAccessor(name, accessor))
cls._accessors.add(name)
return accessor
return decorator
_doc = """
Register a custom accessor on %(klass)s objects.
Parameters
----------
name : str
Name under which the accessor should be registered. A warning is issued
if this name conflicts with a preexisting attribute.
Returns
-------
callable
A class decorator.
See Also
--------
%(others)s
Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be
.. code-block:: python
def __init__(self, pandas_object): # noqa: E999
...
For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.
>>> pd.Series(['a', 'b']).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values
Examples
--------
In your library code::
import pandas as pd
@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))
def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass
Back in an interactive IPython session:
>>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
... 'latitude': np.linspace(0, 20)})
>>> ds.geo.center
(5.0, 10.0)
>>> ds.geo.plot()
# plots data on a map
"""
@Appender(
_doc
% dict(
klass="DataFrame",
others=("register_series_accessor, " "register_index_accessor"),
)
)
def register_dataframe_accessor(name):
from pandas import DataFrame
return _register_accessor(name, DataFrame)
@Appender(
_doc
% dict(
klass="Series",
others=("register_dataframe_accessor, " "register_index_accessor"),
)
)
def register_series_accessor(name):
from pandas import Series
return _register_accessor(name, Series)
@Appender(
_doc
% dict(
klass="Index",
others=("register_dataframe_accessor, " "register_series_accessor"),
)
)
def register_index_accessor(name):
from pandas import Index
return _register_accessor(name, Index)
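if __name__ == "__main__":
    # Behavioral sketch, not part of pandas: CachedAccessor builds the
    # accessor once per instance, then caches it on the instance, so the
    # second attribute access never re-runs __init__.
    class _CountingAccessor:
        built = 0

        def __init__(self, data):
            type(self).built += 1
            self._data = data

    class _Host:
        demo = CachedAccessor("demo", _CountingAccessor)

    h = _Host()
    h.demo
    h.demo
    assert _CountingAccessor.built == 1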

File diff suppressed because it is too large.

View File

@@ -0,0 +1,55 @@
# flake8: noqa
import numpy as np
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.algorithms import factorize, unique, value_counts
from pandas.core.dtypes.missing import isna, isnull, notna, notnull
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
PeriodDtype,
IntervalDtype,
DatetimeTZDtype,
)
from pandas.core.arrays import Categorical, array
from pandas.core.groupby import Grouper, NamedAgg
from pandas.io.formats.format import set_eng_float_format
from pandas.core.index import (
Index,
CategoricalIndex,
Int64Index,
UInt64Index,
RangeIndex,
Float64Index,
MultiIndex,
IntervalIndex,
TimedeltaIndex,
DatetimeIndex,
PeriodIndex,
NaT,
)
from pandas.core.indexes.period import Period, period_range
from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range
from pandas.core.indexes.interval import Interval, interval_range
from pandas.core.series import Series
from pandas.core.frame import DataFrame
# TODO: Remove import when statsmodels updates #18264
from pandas.core.reshape.reshape import get_dummies
from pandas.core.indexing import IndexSlice
from pandas.core.tools.numeric import to_numeric
from pandas.tseries.offsets import DateOffset
from pandas.core.tools.datetimes import to_datetime
from pandas.core.tools.timedeltas import to_timedelta

View File

@@ -0,0 +1,446 @@
import inspect
import warnings
import numpy as np
from pandas._libs import reduction
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
is_dict_like,
is_extension_type,
is_list_like,
is_sequence,
)
from pandas.core.dtypes.generic import ABCSeries
from pandas.io.formats.printing import pprint_thing
def frame_apply(
obj,
func,
axis=0,
broadcast=None,
raw=False,
reduce=None,
result_type=None,
ignore_failures=False,
args=None,
kwds=None,
):
""" construct and return a row or column based frame apply object """
axis = obj._get_axis_number(axis)
if axis == 0:
klass = FrameRowApply
elif axis == 1:
klass = FrameColumnApply
return klass(
obj,
func,
broadcast=broadcast,
raw=raw,
reduce=reduce,
result_type=result_type,
ignore_failures=ignore_failures,
args=args,
kwds=kwds,
)
class FrameApply:
def __init__(
self,
obj,
func,
broadcast,
raw,
reduce,
result_type,
ignore_failures,
args,
kwds,
):
self.obj = obj
self.raw = raw
self.ignore_failures = ignore_failures
self.args = args or ()
self.kwds = kwds or {}
if result_type not in [None, "reduce", "broadcast", "expand"]:
raise ValueError(
"invalid value for result_type, must be one "
"of {None, 'reduce', 'broadcast', 'expand'}"
)
if broadcast is not None:
warnings.warn(
"The broadcast argument is deprecated and will "
"be removed in a future version. You can specify "
"result_type='broadcast' to broadcast the result "
"to the original dimensions",
FutureWarning,
stacklevel=4,
)
if broadcast:
result_type = "broadcast"
if reduce is not None:
warnings.warn(
"The reduce argument is deprecated and will "
"be removed in a future version. You can specify "
"result_type='reduce' to try to reduce the result "
"to the original dimensions",
FutureWarning,
stacklevel=4,
)
if reduce:
if result_type is not None:
raise ValueError("cannot pass both reduce=True and result_type")
result_type = "reduce"
self.result_type = result_type
# curry if needed
if (kwds or args) and not isinstance(func, (np.ufunc, str)):
def f(x):
return func(x, *args, **kwds)
else:
f = func
self.f = f
# results
self.result = None
self.res_index = None
self.res_columns = None
@property
def columns(self):
return self.obj.columns
@property
def index(self):
return self.obj.index
@cache_readonly
def values(self):
return self.obj.values
@cache_readonly
def dtypes(self):
return self.obj.dtypes
@property
def agg_axis(self):
return self.obj._get_agg_axis(self.axis)
def get_result(self):
""" compute the results """
# dispatch to agg
if is_list_like(self.f) or is_dict_like(self.f):
return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds)
# all empty
if len(self.columns) == 0 and len(self.index) == 0:
return self.apply_empty_result()
# string dispatch
if isinstance(self.f, str):
# Support for `frame.transform('method')`
# Some methods (shift, etc.) require the axis argument, others
# don't, so inspect and insert if necessary.
func = getattr(self.obj, self.f)
sig = inspect.getfullargspec(func)
if "axis" in sig.args:
self.kwds["axis"] = self.axis
return func(*self.args, **self.kwds)
# ufunc
elif isinstance(self.f, np.ufunc):
with np.errstate(all="ignore"):
results = self.obj._data.apply("apply", func=self.f)
return self.obj._constructor(
data=results, index=self.index, columns=self.columns, copy=False
)
# broadcasting
if self.result_type == "broadcast":
return self.apply_broadcast()
# one axis empty
elif not all(self.obj.shape):
return self.apply_empty_result()
# raw
elif self.raw and not self.obj._is_mixed_type:
return self.apply_raw()
return self.apply_standard()
def apply_empty_result(self):
"""
we have an empty result; at least 1 axis is 0
we will try to apply the function to an empty
series in order to see if this is a reduction function
"""
# we are not asked to reduce or infer reduction
# so just return a copy of the existing object
if self.result_type not in ["reduce", None]:
return self.obj.copy()
# we may need to infer
reduce = self.result_type == "reduce"
from pandas import Series
if not reduce:
EMPTY_SERIES = Series([])
try:
r = self.f(EMPTY_SERIES, *self.args, **self.kwds)
reduce = not isinstance(r, Series)
except Exception:
pass
if reduce:
return self.obj._constructor_sliced(np.nan, index=self.agg_axis)
else:
return self.obj.copy()
def apply_raw(self):
""" apply to the values as a numpy array """
try:
result = reduction.reduce(self.values, self.f, axis=self.axis)
except Exception:
result = np.apply_along_axis(self.f, self.axis, self.values)
# TODO: mixed type case
if result.ndim == 2:
return self.obj._constructor(result, index=self.index, columns=self.columns)
else:
return self.obj._constructor_sliced(result, index=self.agg_axis)
def apply_broadcast(self, target):
result_values = np.empty_like(target.values)
# axis which we want to compare compliance
result_compare = target.shape[0]
for i, col in enumerate(target.columns):
res = self.f(target[col])
ares = np.asarray(res).ndim
# must be a scalar or 1d
if ares > 1:
raise ValueError("too many dims to broadcast")
elif ares == 1:
# must match return dim
if result_compare != len(res):
raise ValueError("cannot broadcast result")
result_values[:, i] = res
# we *always* preserve the original index / columns
result = self.obj._constructor(
result_values, index=target.index, columns=target.columns
)
return result
def apply_standard(self):
# try to reduce first (by default)
# this only matters if the reduction in values is of different dtype
# e.g. if we want to apply to a SparseFrame, then can't directly reduce
# we cannot reduce using non-numpy dtypes,
# as demonstrated in gh-12244
if (
self.result_type in ["reduce", None]
and not self.dtypes.apply(is_extension_type).any()
):
# Create a dummy Series from an empty array
from pandas import Series
values = self.values
index = self.obj._get_axis(self.axis)
labels = self.agg_axis
empty_arr = np.empty(len(index), dtype=values.dtype)
dummy = Series(empty_arr, index=index, dtype=values.dtype)
try:
result = reduction.reduce(
values, self.f, axis=self.axis, dummy=dummy, labels=labels
)
return self.obj._constructor_sliced(result, index=labels)
except Exception:
pass
# compute the result using the series generator
self.apply_series_generator()
# wrap results
return self.wrap_results()
def apply_series_generator(self):
series_gen = self.series_generator
res_index = self.result_index
i = None
keys = []
results = {}
if self.ignore_failures:
successes = []
for i, v in enumerate(series_gen):
try:
results[i] = self.f(v)
keys.append(v.name)
successes.append(i)
except Exception:
pass
# so will work with MultiIndex
if len(successes) < len(res_index):
res_index = res_index.take(successes)
else:
try:
for i, v in enumerate(series_gen):
results[i] = self.f(v)
keys.append(v.name)
except Exception as e:
if hasattr(e, "args"):
# make sure i is defined
if i is not None:
k = res_index[i]
e.args = e.args + ("occurred at index %s" % pprint_thing(k),)
raise
self.results = results
self.res_index = res_index
self.res_columns = self.result_columns
def wrap_results(self):
results = self.results
# see if we can infer the results
if len(results) > 0 and is_sequence(results[0]):
return self.wrap_results_for_axis()
# dict of scalars
result = self.obj._constructor_sliced(results)
result.index = self.res_index
return result
class FrameRowApply(FrameApply):
axis = 0
def apply_broadcast(self):
return super().apply_broadcast(self.obj)
@property
def series_generator(self):
return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
@property
def result_index(self):
return self.columns
@property
def result_columns(self):
return self.index
def wrap_results_for_axis(self):
""" return the results for the rows """
results = self.results
result = self.obj._constructor(data=results)
if not isinstance(results[0], ABCSeries):
try:
result.index = self.res_columns
except ValueError:
pass
try:
result.columns = self.res_index
except ValueError:
pass
return result
class FrameColumnApply(FrameApply):
axis = 1
def apply_broadcast(self):
result = super().apply_broadcast(self.obj.T)
return result.T
@property
def series_generator(self):
constructor = self.obj._constructor_sliced
return (
constructor(arr, index=self.columns, name=name)
for i, (arr, name) in enumerate(zip(self.values, self.index))
)
@property
def result_index(self):
return self.index
@property
def result_columns(self):
return self.columns
def wrap_results_for_axis(self):
""" return the results for the columns """
results = self.results
# we have requested to expand
if self.result_type == "expand":
result = self.infer_to_same_shape()
# we have a non-series and don't want inference
elif not isinstance(results[0], ABCSeries):
from pandas import Series
result = Series(results)
result.index = self.res_index
# we may want to infer results
else:
result = self.infer_to_same_shape()
return result
def infer_to_same_shape(self):
""" infer the results to the same shape as the input object """
results = self.results
result = self.obj._constructor(data=results)
result = result.T
# set the index
result.index = self.res_index
# infer dtypes
result = result.infer_objects()
return result
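if __name__ == "__main__":
    # Usage sketch, not part of pandas' public API: frame_apply is the
    # internal engine behind DataFrame.apply; axis=0 hands each column
    # to func and collects the results.
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    op = frame_apply(df, func=lambda col: col.sum(), axis=0)
    print(op.get_result())  # Series with a=3, b=7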

View File

@@ -0,0 +1,14 @@
from .array_ import array # noqa: F401
from .base import ( # noqa: F401
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
)
from .categorical import Categorical # noqa: F401
from .datetimes import DatetimeArray # noqa: F401
from .integer import IntegerArray, integer_array # noqa: F401
from .interval import IntervalArray # noqa: F401
from .numpy_ import PandasArray, PandasDtype # noqa: F401
from .period import PeriodArray, period_array # noqa: F401
from .sparse import SparseArray # noqa: F401
from .timedeltas import TimedeltaArray # noqa: F401

View File

@@ -0,0 +1,196 @@
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""
from typing import Tuple
import numpy as np
from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
from pandas.tseries.offsets import DateOffset, Tick, generate_range
def generate_regular_range(
start: Timestamp, end: Timestamp, periods: int, freq: DateOffset
) -> Tuple[np.ndarray, str]:
"""
Generate a range of dates with the spans between dates described by
the given `freq` DateOffset.
Parameters
----------
start : Timestamp or None
first point of produced date range
end : Timestamp or None
last point of produced date range
periods : int
number of periods in produced date range
freq : DateOffset
describes space between dates in produced date range
Returns
-------
ndarray[np.int64] representing nanosecond unix timestamps
"""
if isinstance(freq, Tick):
stride = freq.nanos
if periods is None:
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1
# end.tz == start.tz by this point due to _generate implementation
tz = start.tz
elif start is not None:
b = Timestamp(start).value
e = _generate_range_overflow_safe(b, periods, stride, side="start")
tz = start.tz
elif end is not None:
e = Timestamp(end).value + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
tz = end.tz
else:
raise ValueError(
"at least 'start' or 'end' should be specified "
"if a 'period' is given."
)
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
else:
tz = None
# start and end should have the same timezone by this point
if start is not None:
tz = start.tz
elif end is not None:
tz = end.tz
xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
values = np.array([x.value for x in xdr], dtype=np.int64)
return values, tz
def _generate_range_overflow_safe(
endpoint: int, periods: int, stride: int, side: str = "start"
) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.
Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to
Returns
-------
other_end : int
Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ["start", "end"]
i64max = np.uint64(np.iinfo(np.int64).max)
msg = (
"Cannot generate range with {side}={endpoint} and "
"periods={periods}".format(side=side, endpoint=endpoint, periods=periods)
)
with np.errstate(over="raise"):
        # if periods * stride cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError:
raise OutOfBoundsDatetime(msg)
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
elif (endpoint > 0 and side == "start" and stride > 0) or (
endpoint < 0 and side == "end" and stride > 0
):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
elif side == "end" and endpoint > i64max and endpoint - stride <= i64max:
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(
endpoint - stride, periods - 1, stride, side
)
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side)
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
def _generate_range_overflow_safe_signed(
endpoint: int, periods: int, stride: int, side: str
) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ["start", "end"]
if side == "end":
stride *= -1
with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
return np.int64(endpoint) + addend
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with reversed signed we risk OverflowError
pass
# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
assert endpoint >= 0
result = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(np.iinfo(np.int64).max)
assert result > i64max
if result <= i64max + np.uint64(stride):
return result
raise OutOfBoundsDatetime(
"Cannot generate range with "
"{side}={endpoint} and "
"periods={periods}".format(side=side, endpoint=endpoint, periods=periods)
)
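if __name__ == "__main__":
    # Sketch, not part of pandas' public API: five daily points starting
    # at 2019-01-01, returned as int64 nanosecond timestamps plus tz.
    from pandas.tseries.offsets import Day

    vals, tz = generate_regular_range(
        Timestamp("2019-01-01"), None, periods=5, freq=Day()
    )
    print(vals, tz)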

View File

@@ -0,0 +1,281 @@
from typing import Optional, Sequence, Union, cast
import numpy as np
from pandas._libs import lib, tslibs
from pandas.core.dtypes.common import (
is_datetime64_ns_dtype,
is_extension_array_dtype,
is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype, registry
from pandas.core.dtypes.generic import ABCExtensionArray
def array(
data: Sequence[object],
dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
copy: bool = True,
) -> ABCExtensionArray:
"""
Create an array.
.. versionadded:: 0.24.0
Parameters
----------
data : Sequence of objects
The scalars inside `data` should be instances of the
scalar type for `dtype`. It's expected that `data`
represents a 1-dimensional array of data.
When `data` is an Index or Series, the underlying array
will be extracted from `data`.
dtype : str, np.dtype, or ExtensionDtype, optional
The dtype to use for the array. This may be a NumPy
dtype or an extension type registered with pandas using
:meth:`pandas.api.extensions.register_extension_dtype`.
If not specified, there are two possibilities:
1. When `data` is a :class:`Series`, :class:`Index`, or
:class:`ExtensionArray`, the `dtype` will be taken
from the data.
2. Otherwise, pandas will attempt to infer the `dtype`
from the data.
Note that when `data` is a NumPy array, ``data.dtype`` is
*not* used for inferring the array type. This is because
NumPy cannot represent all the types of data that can be
held in extension arrays.
Currently, pandas will infer an extension dtype for sequences of
============================== =====================================
Scalar Type Array Type
============================== =====================================
:class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
:class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
:class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
:class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
============================== =====================================
For all other cases, NumPy's usual inference rules will be used.
copy : bool, default True
Whether to copy the data, even if not necessary. Depending
on the type of `data`, creating the new array may require
copying data, even if ``copy=False``.
Returns
-------
ExtensionArray
The newly created array.
Raises
------
ValueError
When `data` is not 1-dimensional.
See Also
--------
numpy.array : Construct a NumPy array.
Series : Construct a pandas Series.
Index : Construct a pandas Index.
arrays.PandasArray : ExtensionArray wrapping a NumPy array.
Series.array : Extract the array stored within a Series.
Notes
-----
Omitting the `dtype` argument means pandas will attempt to infer the
best array type from the values in the data. As new array types are
added by pandas and 3rd party libraries, the "best" array type may
change. We recommend specifying `dtype` to ensure that
1. the correct array type for the data is returned
2. the returned array type doesn't change as new extension types
are added by pandas and third-party libraries
Additionally, if the underlying memory representation of the returned
array matters, we recommend specifying the `dtype` as a concrete object
rather than a string alias or allowing it to be inferred. For example,
a future version of pandas or a 3rd-party library may include a
dedicated ExtensionArray for string data. In this event, the following
would no longer return a :class:`arrays.PandasArray` backed by a NumPy
array.
>>> pd.array(['a', 'b'], dtype=str)
<PandasArray>
['a', 'b']
Length: 2, dtype: str32
This would instead return the new ExtensionArray dedicated for string
data. If you really need the new array to be backed by a NumPy array,
specify that in the dtype.
>>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
<PandasArray>
['a', 'b']
Length: 2, dtype: str32
Or use the dedicated constructor for the array you're expecting, and
wrap that in a PandasArray
>>> pd.array(np.array(['a', 'b'], dtype='<U1'))
<PandasArray>
['a', 'b']
Length: 2, dtype: str32
Finally, Pandas has arrays that mostly overlap with NumPy
* :class:`arrays.DatetimeArray`
* :class:`arrays.TimedeltaArray`
When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
rather than a ``PandasArray``. This is for symmetry with the case of
timezone-aware data, which NumPy does not natively support.
>>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
<DatetimeArray>
['2015-01-01 00:00:00', '2016-01-01 00:00:00']
Length: 2, dtype: datetime64[ns]
>>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
<TimedeltaArray>
['01:00:00', '02:00:00']
Length: 2, dtype: timedelta64[ns]
Examples
--------
If a dtype is not specified, `data` is passed through to
:meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.
>>> pd.array([1, 2])
<PandasArray>
[1, 2]
Length: 2, dtype: int64
Or the NumPy dtype can be specified
>>> pd.array([1, 2], dtype=np.dtype("int32"))
<PandasArray>
[1, 2]
Length: 2, dtype: int32
You can use the string alias for `dtype`
>>> pd.array(['a', 'b', 'a'], dtype='category')
[a, b, a]
Categories (2, object): [a, b]
Or specify the actual dtype
>>> pd.array(['a', 'b', 'a'],
... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
[a, b, a]
Categories (3, object): [a < b < c]
Because omitting the `dtype` passes the data through to NumPy,
a mixture of valid integers and NA will return a floating-point
NumPy array.
>>> pd.array([1, 2, np.nan])
<PandasArray>
[1.0, 2.0, nan]
Length: 3, dtype: float64
To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
the dtype:
>>> pd.array([1, 2, np.nan], dtype='Int64')
<IntegerArray>
[1, 2, NaN]
Length: 3, dtype: Int64
Pandas will infer an ExtensionArray for some types of data:
>>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
<PeriodArray>
['2000-01-01', '2000-01-01']
Length: 2, dtype: period[D]
`data` must be 1-dimensional. A ValueError is raised when the input
has the wrong dimensionality.
>>> pd.array(1)
Traceback (most recent call last):
...
ValueError: Cannot pass scalar '1' to 'pandas.array'.
"""
from pandas.core.arrays import (
period_array,
ExtensionArray,
IntervalArray,
PandasArray,
DatetimeArray,
TimedeltaArray,
)
from pandas.core.internals.arrays import extract_array
if lib.is_scalar(data):
msg = "Cannot pass scalar '{}' to 'pandas.array'."
raise ValueError(msg.format(data))
data = extract_array(data, extract_numpy=True)
if dtype is None and isinstance(data, ExtensionArray):
dtype = data.dtype
# this returns None for not-found dtypes.
if isinstance(dtype, str):
dtype = registry.find(dtype) or dtype
if is_extension_array_dtype(dtype):
cls = cast(ExtensionDtype, dtype).construct_array_type()
return cls._from_sequence(data, dtype=dtype, copy=copy)
if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=False)
if inferred_dtype == "period":
try:
return period_array(data, copy=copy)
except tslibs.IncompatibleFrequency:
# We may have a mixture of frequencies.
# We choose to return an ndarray, rather than raising.
pass
elif inferred_dtype == "interval":
try:
return IntervalArray(data, copy=copy)
except ValueError:
# We may have a mixture of `closed` here.
# We choose to return an ndarray, rather than raising.
pass
elif inferred_dtype.startswith("datetime"):
# datetime, datetime64
try:
return DatetimeArray._from_sequence(data, copy=copy)
except ValueError:
# Mixture of timezones, fall back to PandasArray
pass
elif inferred_dtype.startswith("timedelta"):
# timedelta, timedelta64
return TimedeltaArray._from_sequence(data, copy=copy)
# TODO(BooleanArray): handle this type
# Pandas overrides NumPy for
# 1. datetime64[ns]
# 2. timedelta64[ns]
# so that a DatetimeArray is returned.
if is_datetime64_ns_dtype(dtype):
return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
elif is_timedelta64_ns_dtype(dtype):
return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
return result
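if __name__ == "__main__":
    # Dispatch sketch, not part of pandas: string aliases are resolved
    # through the extension-dtype registry before falling back to NumPy.
    print(type(array([1, None], dtype="Int64")))  # IntegerArray
    print(type(array([1.0, 2.0])))  # PandasArray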

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@@ -0,0 +1,867 @@
import numbers
from typing import Type
import warnings
import numpy as np
from pandas._libs import lib
from pandas.compat import set_function_name
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
from pandas.core import nanops, ops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.tools.numeric import to_numeric
class _IntegerDtype(ExtensionDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
_IntegerDtype. For example we have Int8Dtype to represent signed int 8s.
The attributes name & type are set when these subclasses are created.
"""
name = None # type: str
base = None
type = None # type: Type
na_value = np.nan
def __repr__(self):
sign = "U" if self.is_unsigned_integer else ""
return "{sign}Int{size}Dtype()".format(sign=sign, size=8 * self.itemsize)
@cache_readonly
def is_signed_integer(self):
return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self):
return self.kind == "u"
@property
def _is_numeric(self):
return True
@cache_readonly
def numpy_dtype(self):
""" Return an instance of our numpy dtype """
return np.dtype(self.type)
@cache_readonly
def kind(self):
return self.numpy_dtype.kind
@cache_readonly
def itemsize(self):
""" Return the number of bytes in this dtype """
return self.numpy_dtype.itemsize
@classmethod
def construct_array_type(cls):
"""Return the array type associated with this dtype
Returns
-------
type
"""
return IntegerArray
def integer_array(values, dtype=None, copy=False):
"""
Infer and return an integer array of the values.
Parameters
----------
values : 1D list-like
dtype : dtype, optional
dtype to coerce
copy : boolean, default False
Returns
-------
IntegerArray
Raises
------
TypeError if incompatible types
"""
values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
return IntegerArray(values, mask)
def safe_cast(values, dtype, copy):
"""
Safely cast the values to the dtype if they
are equivalent, meaning floats must be equivalent to the
ints.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError(
"cannot safely cast non-equivalent {} to {}".format(
values.dtype, np.dtype(dtype)
)
)
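if __name__ == "__main__":
    # Sketch, not part of pandas: floats that are exactly representable
    # as ints cast fine; lossy values raise TypeError.
    print(safe_cast(np.array([1.0, 2.0]), np.int64, copy=False))
    try:
        safe_cast(np.array([1.5]), np.int64, copy=False)
    except TypeError as err:
        print(err)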
def coerce_to_array(values, dtype, mask=None, copy=False):
"""
Coerce the input values array to numpy arrays with a mask
Parameters
----------
values : 1D list-like
dtype : integer dtype
mask : boolean 1D array, optional
copy : boolean, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
    # if values is an integer numpy array, preserve its dtype
if dtype is None and hasattr(values, "dtype"):
if is_integer_dtype(values.dtype):
dtype = values.dtype
if dtype is not None:
if isinstance(dtype, str) and (
dtype.startswith("Int") or dtype.startswith("UInt")
):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not issubclass(type(dtype), _IntegerDtype):
try:
dtype = _dtypes[str(np.dtype(dtype))]
except KeyError:
raise ValueError("invalid dtype specified {}".format(dtype))
if isinstance(values, IntegerArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
elif inferred_type not in [
"floating",
"integer",
"mixed-integer",
"mixed-integer-float",
]:
raise TypeError(
"{} cannot be converted to an IntegerDtype".format(values.dtype)
)
elif is_bool_dtype(values) and is_integer_dtype(dtype):
values = np.array(values, dtype=int, copy=copy)
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(
"{} cannot be converted to an IntegerDtype".format(values.dtype)
)
if mask is None:
mask = isna(values)
else:
assert len(mask) == len(values)
if not values.ndim == 1:
raise TypeError("values must be a 1D list-like")
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = np.dtype("int64")
else:
dtype = dtype.type
# if we are float, let's make sure that we can
# safely cast
    # we copy as we need to coerce here
if mask.any():
values = values.copy()
values[mask] = 1
values = safe_cast(values, dtype, copy=False)
else:
values = safe_cast(values, dtype, copy=False)
return values, mask
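# Illustrative sketch of the (values, mask) contract (comment only): missing
# entries are filled with 1 in the data and flagged True in the mask.
# >>> coerce_to_array([1, np.nan, 3], dtype="Int8")
# (array([1, 1, 3], dtype=int8), array([False,  True, False]))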
class IntegerArray(ExtensionArray, ExtensionOpsMixin):
"""
Array of integer (optional missing) values.
.. versionadded:: 0.24.0
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Attributes
----------
None
Methods
-------
None
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: UInt16
"""
@cache_readonly
def dtype(self):
return _dtypes[str(self._data.dtype)]
def __init__(self, values, mask, copy=False):
if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)):
raise TypeError(
"values should be integer numpy array. Use "
"the 'integer_array' function instead"
)
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
raise TypeError(
"mask should be boolean numpy array. Use "
"the 'integer_array' function instead"
)
if copy:
values = values.copy()
mask = mask.copy()
self._data = values
self._mask = mask
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
return integer_array(scalars, dtype=dtype, copy=copy)
@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype, copy)
@classmethod
def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)
def _formatter(self, boxed=False):
def fmt(x):
if isna(x):
return "NaN"
return str(x)
return fmt
def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]
return type(self)(self._data[item], self._mask[item])
def _coerce_to_ndarray(self):
"""
        coerce to an ndarray of object dtype
"""
# TODO(jreback) make this better
data = self._data.astype(object)
data[self._mask] = self._na_value
return data
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
def __array__(self, dtype=None):
"""
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self._coerce_to_ndarray()
_HANDLED_TYPES = (np.ndarray, numbers.Number)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# For IntegerArray inputs, we apply the ufunc to ._data
# and mask the result.
if method == "reduce":
# Not clear how to handle missing values in reductions. Raise.
raise NotImplementedError("The 'reduce' method is not supported.")
out = kwargs.get("out", ())
for x in inputs + out:
if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
return NotImplemented
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
mask = np.zeros(len(self), dtype=bool)
inputs2 = []
for x in inputs:
if isinstance(x, IntegerArray):
mask |= x._mask
inputs2.append(x._data)
else:
inputs2.append(x)
def reconstruct(x):
# we don't worry about scalar `x` here, since we
# raise for reduce up above.
if is_integer_dtype(x.dtype):
m = mask.copy()
return IntegerArray(x, m)
else:
x[mask] = np.nan
return x
result = getattr(ufunc, method)(*inputs2, **kwargs)
if isinstance(result, tuple):
            # re-box each element of a multi-output ufunc result
            return tuple(reconstruct(x) for x in result)
else:
return reconstruct(result)
def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
yield self.dtype.na_value
else:
yield self._data[i]
def take(self, indexer, allow_fill=False, fill_value=None):
from pandas.api.extensions import take
# we always fill with 1 internally
# to avoid upcasting
data_fill_value = 1 if isna(fill_value) else fill_value
result = take(
self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
)
mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
# if we are filling
# we only fill where the indexer is null
# not existing missing values
# TODO(jreback) what if we have a non-na float as a fill value?
if allow_fill and notna(fill_value):
fill_mask = np.asarray(indexer) == -1
result[fill_mask] = fill_value
mask = mask ^ fill_mask
return type(self)(result, mask, copy=False)
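    # Usage sketch for take (illustrative comment only, assuming the platform
    # default of int64): -1 entries become missing when allow_fill=True.
    # >>> arr = integer_array([1, 2, 3])
    # >>> arr.take([0, -1], allow_fill=True)
    # <IntegerArray>
    # [1, NaN]
    # Length: 2, dtype: Int64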
def copy(self):
data, mask = self._data, self._mask
data = data.copy()
mask = mask.copy()
return type(self)(data, mask, copy=False)
def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
value, mask = coerce_to_array(value, dtype=self.dtype)
if _is_scalar:
value = value[0]
mask = mask[0]
self._data[key] = value
self._mask[key] = mask
def __len__(self):
return len(self._data)
@property
def nbytes(self):
return self._data.nbytes + self._mask.nbytes
def isna(self):
return self._mask
@property
def _na_value(self):
return np.nan
@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask)
def astype(self, dtype, copy=True):
"""
Cast to a NumPy array or IntegerArray with 'dtype'.
Parameters
----------
dtype : str or dtype
Typecode or data-type to which the array is cast.
copy : bool, default True
Whether to copy the data, even if not necessary. If False,
a copy is made only if the old dtype does not match the
new dtype.
Returns
-------
array : ndarray or IntegerArray
            NumPy ndarray or IntegerArray with 'dtype' for its dtype.
Raises
------
TypeError
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype, copy=False)
return type(self)(result, mask=self._mask, copy=False)
# coerce
data = self._coerce_to_ndarray()
        return astype_nansafe(data, dtype, copy=False)
@property
def _ndarray_values(self) -> np.ndarray:
"""Internal pandas method for lossy conversion to a NumPy ndarray.
This method is not part of the pandas interface.
The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.
"""
return self._data
def value_counts(self, dropna=True):
"""
        Returns a Series containing counts of each unique value.
Parameters
----------
dropna : boolean, default True
Don't include counts of NaN.
Returns
-------
counts : Series
See Also
--------
Series.value_counts
"""
from pandas import Index, Series
# compute counts on the data with no nans
data = self._data[~self._mask]
value_counts = Index(data).value_counts()
array = value_counts.values
# TODO(extension)
# if we have allow Index to hold an ExtensionArray
# this is easier
index = value_counts.index.astype(object)
# if we want nans, count the mask
if not dropna:
# TODO(extension)
# appending to an Index *always* infers
# w/o passing the dtype
array = np.append(array, [self._mask.sum()])
index = Index(
np.concatenate([index.values, np.array([np.nan], dtype=object)]),
dtype=object,
)
return Series(array, index=index)
def _values_for_argsort(self) -> np.ndarray:
"""Return values for sorting.
Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.
See Also
--------
ExtensionArray.argsort
"""
data = self._data.copy()
data[self._mask] = data.min() - 1
return data
@classmethod
def _create_comparison_method(cls, op):
def cmp_method(self, other):
op_name = op.__name__
mask = None
if isinstance(other, (ABCSeries, ABCIndexClass)):
# Rely on pandas to unbox and dispatch to us.
return NotImplemented
if isinstance(other, IntegerArray):
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other)
if other.ndim > 0 and len(self) != len(other):
raise ValueError("Lengths must match to compare")
other = lib.item_from_zerodim(other)
# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)
# nans propagate
if mask is None:
mask = self._mask
else:
mask = self._mask | mask
result[mask] = op_name == "ne"
return result
name = "__{name}__".format(name=op.__name__)
return set_function_name(cmp_method, name, cls)
def _reduce(self, name, skipna=True, **kwargs):
data = self._data
mask = self._mask
# coerce to a nan-aware float if needed
if mask.any():
data = self._data.astype("float64")
data[mask] = self._na_value
op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask)
# if we have a boolean op, don't coerce
if name in ["any", "all"]:
pass
# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name in ["sum", "min", "max", "prod"] and notna(result):
int_result = int(result)
if int_result == result:
result = int_result
return result
def _maybe_mask_result(self, result, mask, other, op_name):
"""
Parameters
----------
result : array-like
mask : array-like bool
other : scalar or array-like
op_name : str
"""
# may need to fill infs
# and mask wraparound
if is_float_dtype(result):
mask |= (result == np.inf) | (result == -np.inf)
# if we have a float operand we are by-definition
# a float result
# or our op is a divide
if (is_float_dtype(other) or is_float(other)) or (
op_name in ["rtruediv", "truediv"]
):
result[mask] = np.nan
return result
return type(self)(result, mask, copy=False)
@classmethod
def _create_arithmetic_method(cls, op):
def integer_arithmetic_method(self, other):
op_name = op.__name__
mask = None
if isinstance(other, (ABCSeries, ABCIndexClass)):
# Rely on pandas to unbox and dispatch to us.
return NotImplemented
if getattr(other, "ndim", 0) > 1:
raise NotImplementedError("can only perform ops with 1-d structures")
if isinstance(other, IntegerArray):
other, mask = other._data, other._mask
elif getattr(other, "ndim", None) == 0:
other = other.item()
elif is_list_like(other):
other = np.asarray(other)
if not other.ndim:
other = other.item()
elif other.ndim == 1:
if not (is_float_dtype(other) or is_integer_dtype(other)):
raise TypeError("can only perform ops with numeric values")
else:
if not (is_float(other) or is_integer(other)):
raise TypeError("can only perform ops with numeric values")
# nans propagate
if mask is None:
mask = self._mask
else:
mask = self._mask | mask
# 1 ** np.nan is 1. So we have to unmask those.
if op_name == "pow":
mask = np.where(self == 1, False, mask)
elif op_name == "rpow":
mask = np.where(other == 1, False, mask)
with np.errstate(all="ignore"):
result = op(self._data, other)
# divmod returns a tuple
if op_name == "divmod":
div, mod = result
return (
self._maybe_mask_result(div, mask, other, "floordiv"),
self._maybe_mask_result(mod, mask, other, "mod"),
)
return self._maybe_mask_result(result, mask, other, op_name)
name = "__{name}__".format(name=op.__name__)
return set_function_name(integer_arithmetic_method, name, cls)
IntegerArray._add_arithmetic_ops()
IntegerArray._add_comparison_ops()
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Attributes
----------
None
Methods
-------
None
"""
# create the Dtype
Int8Dtype = register_extension_dtype(
type(
"Int8Dtype",
(_IntegerDtype,),
{
"type": np.int8,
"name": "Int8",
"__doc__": _dtype_docstring.format(dtype="int8"),
},
)
)
Int16Dtype = register_extension_dtype(
type(
"Int16Dtype",
(_IntegerDtype,),
{
"type": np.int16,
"name": "Int16",
"__doc__": _dtype_docstring.format(dtype="int16"),
},
)
)
Int32Dtype = register_extension_dtype(
type(
"Int32Dtype",
(_IntegerDtype,),
{
"type": np.int32,
"name": "Int32",
"__doc__": _dtype_docstring.format(dtype="int32"),
},
)
)
Int64Dtype = register_extension_dtype(
type(
"Int64Dtype",
(_IntegerDtype,),
{
"type": np.int64,
"name": "Int64",
"__doc__": _dtype_docstring.format(dtype="int64"),
},
)
)
UInt8Dtype = register_extension_dtype(
type(
"UInt8Dtype",
(_IntegerDtype,),
{
"type": np.uint8,
"name": "UInt8",
"__doc__": _dtype_docstring.format(dtype="uint8"),
},
)
)
UInt16Dtype = register_extension_dtype(
type(
"UInt16Dtype",
(_IntegerDtype,),
{
"type": np.uint16,
"name": "UInt16",
"__doc__": _dtype_docstring.format(dtype="uint16"),
},
)
)
UInt32Dtype = register_extension_dtype(
type(
"UInt32Dtype",
(_IntegerDtype,),
{
"type": np.uint32,
"name": "UInt32",
"__doc__": _dtype_docstring.format(dtype="uint32"),
},
)
)
UInt64Dtype = register_extension_dtype(
type(
"UInt64Dtype",
(_IntegerDtype,),
{
"type": np.uint64,
"name": "UInt64",
"__doc__": _dtype_docstring.format(dtype="uint64"),
},
)
)
_dtypes = {
"int8": Int8Dtype(),
"int16": Int16Dtype(),
"int32": Int32Dtype(),
"int64": Int64Dtype(),
"uint8": UInt8Dtype(),
"uint16": UInt16Dtype(),
"uint32": UInt32Dtype(),
"uint64": UInt64Dtype(),
}
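# Illustrative round-trip through the registry (comment only): this is the
# lookup coerce_to_array performs after lower-casing aliases like "Int64".
# >>> _dtypes[str(np.dtype("int64"))]
# Int64Dtype()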

File diff suppressed because it is too large


@@ -0,0 +1,469 @@
import numbers
import numpy as np
from numpy.lib.mixins import NDArrayOperatorsMixin
from pandas._libs import lib
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender
from pandas.util._validators import validate_fillna_kwargs
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like, is_list_like
from pandas import compat
from pandas.core import nanops
from pandas.core.algorithms import searchsorted
from pandas.core.missing import backfill_1d, pad_1d
from .base import ExtensionArray, ExtensionOpsMixin
class PandasDtype(ExtensionDtype):
"""
A Pandas ExtensionDtype for NumPy dtypes.
.. versionadded:: 0.24.0
This is mostly for internal compatibility, and is not especially
useful on its own.
Parameters
----------
dtype : numpy.dtype
"""
_metadata = ("_dtype",)
def __init__(self, dtype):
dtype = np.dtype(dtype)
self._dtype = dtype
self._name = dtype.name
self._type = dtype.type
def __repr__(self):
return "PandasDtype({!r})".format(self.name)
@property
def numpy_dtype(self):
"""The NumPy dtype this PandasDtype wraps."""
return self._dtype
@property
def name(self):
return self._name
@property
def type(self):
return self._type
@property
def _is_numeric(self):
# exclude object, str, unicode, void.
return self.kind in set("biufc")
@property
def _is_boolean(self):
return self.kind == "b"
@classmethod
def construct_from_string(cls, string):
return cls(np.dtype(string))
    @classmethod
    def construct_array_type(cls):
return PandasArray
@property
def kind(self):
return self._dtype.kind
@property
def itemsize(self):
"""The element size of this data-type object."""
return self._dtype.itemsize
class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin):
"""
A pandas ExtensionArray for NumPy data.
    .. versionadded:: 0.24.0
This is mostly for internal compatibility, and is not especially
useful on its own.
Parameters
----------
values : ndarray
The NumPy ndarray to wrap. Must be 1-dimensional.
copy : bool, default False
Whether to copy `values`.
Attributes
----------
None
Methods
-------
None
"""
# If you're wondering why pd.Series(cls) doesn't put the array in an
# ExtensionBlock, search for `ABCPandasArray`. We check for
    # that _typ to ensure that users don't unnecessarily use EAs inside
# pandas internals, which turns off things like block consolidation.
_typ = "npy_extension"
__array_priority__ = 1000
# ------------------------------------------------------------------------
# Constructors
def __init__(self, values, copy=False):
if isinstance(values, type(self)):
values = values._ndarray
if not isinstance(values, np.ndarray):
raise ValueError("'values' must be a NumPy array.")
if values.ndim != 1:
raise ValueError("PandasArray must be 1-dimensional.")
if copy:
values = values.copy()
self._ndarray = values
self._dtype = PandasDtype(values.dtype)
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
if isinstance(dtype, PandasDtype):
dtype = dtype._dtype
result = np.asarray(scalars, dtype=dtype)
if copy and result is scalars:
result = result.copy()
return cls(result)
@classmethod
def _from_factorized(cls, values, original):
return cls(values)
@classmethod
def _concat_same_type(cls, to_concat):
return cls(np.concatenate(to_concat))
# ------------------------------------------------------------------------
# Data
@property
def dtype(self):
return self._dtype
# ------------------------------------------------------------------------
# NumPy Array Interface
def __array__(self, dtype=None):
return np.asarray(self._ndarray, dtype=dtype)
_HANDLED_TYPES = (np.ndarray, numbers.Number)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# Lightly modified version of
# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\
# numpy.lib.mixins.NDArrayOperatorsMixin.html
# The primary modification is not boxing scalar return values
# in PandasArray, since pandas' ExtensionArrays are 1-d.
out = kwargs.get("out", ())
for x in inputs + out:
# Only support operations with instances of _HANDLED_TYPES.
# Use PandasArray instead of type(self) for isinstance to
# allow subclasses that don't override __array_ufunc__ to
# handle PandasArray objects.
if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)):
return NotImplemented
# Defer to the implementation of the ufunc on unwrapped values.
inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs)
if out:
kwargs["out"] = tuple(
x._ndarray if isinstance(x, PandasArray) else x for x in out
)
result = getattr(ufunc, method)(*inputs, **kwargs)
if type(result) is tuple and len(result):
# multiple return values
if not lib.is_scalar(result[0]):
# re-box array-like results
return tuple(type(self)(x) for x in result)
else:
# but not scalar reductions
return result
elif method == "at":
# no return value
return None
else:
# one return value
if not lib.is_scalar(result):
# re-box array-like results, but not scalar reductions
result = type(self)(result)
return result
# ------------------------------------------------------------------------
# Pandas ExtensionArray Interface
def __getitem__(self, item):
if isinstance(item, type(self)):
item = item._ndarray
result = self._ndarray[item]
if not lib.is_scalar(item):
result = type(self)(result)
return result
def __setitem__(self, key, value):
from pandas.core.internals.arrays import extract_array
value = extract_array(value, extract_numpy=True)
if not lib.is_scalar(key) and is_list_like(key):
key = np.asarray(key)
if not lib.is_scalar(value):
value = np.asarray(value)
values = self._ndarray
t = np.result_type(value, values)
if t != self._ndarray.dtype:
values = values.astype(t, casting="safe")
values[key] = value
self._dtype = PandasDtype(t)
self._ndarray = values
else:
self._ndarray[key] = value
def __len__(self):
return len(self._ndarray)
@property
def nbytes(self):
return self._ndarray.nbytes
def isna(self):
from pandas import isna
return isna(self._ndarray)
def fillna(self, value=None, method=None, limit=None):
# TODO(_values_for_fillna): remove this
value, method = validate_fillna_kwargs(value, method)
mask = self.isna()
if is_array_like(value):
if len(value) != len(self):
raise ValueError(
"Length of 'value' does not match. Got ({}) "
" expected {}".format(len(value), len(self))
)
value = value[mask]
if mask.any():
if method is not None:
func = pad_1d if method == "pad" else backfill_1d
new_values = func(self._ndarray, limit=limit, mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
return new_values
def take(self, indices, allow_fill=False, fill_value=None):
from pandas.core.algorithms import take
result = take(
self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
)
return type(self)(result)
def copy(self):
return type(self)(self._ndarray.copy())
def _values_for_argsort(self):
return self._ndarray
def _values_for_factorize(self):
return self._ndarray, -1
def unique(self):
from pandas import unique
return type(self)(unique(self._ndarray))
# ------------------------------------------------------------------------
# Reductions
def _reduce(self, name, skipna=True, **kwargs):
meth = getattr(self, name, None)
if meth:
return meth(skipna=skipna, **kwargs)
else:
msg = "'{}' does not implement reduction '{}'"
raise TypeError(msg.format(type(self).__name__, name))
def any(self, axis=None, out=None, keepdims=False, skipna=True):
nv.validate_any((), dict(out=out, keepdims=keepdims))
return nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
def all(self, axis=None, out=None, keepdims=False, skipna=True):
nv.validate_all((), dict(out=out, keepdims=keepdims))
return nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
def min(self, axis=None, out=None, keepdims=False, skipna=True):
nv.validate_min((), dict(out=out, keepdims=keepdims))
return nanops.nanmin(self._ndarray, axis=axis, skipna=skipna)
def max(self, axis=None, out=None, keepdims=False, skipna=True):
nv.validate_max((), dict(out=out, keepdims=keepdims))
return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna)
def sum(
self,
axis=None,
dtype=None,
out=None,
keepdims=False,
initial=None,
skipna=True,
min_count=0,
):
nv.validate_sum(
(), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
)
return nanops.nansum(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
def prod(
self,
axis=None,
dtype=None,
out=None,
keepdims=False,
initial=None,
skipna=True,
min_count=0,
):
nv.validate_prod(
(), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
)
return nanops.nanprod(
self._ndarray, axis=axis, skipna=skipna, min_count=min_count
)
def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims))
return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
def median(
self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True
):
nv.validate_median(
(), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims)
)
return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
(), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std"
)
return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
(), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var"
)
return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
(), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem"
)
return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
(), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt"
)
return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_stat_ddof_func(
(), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew"
)
return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
# ------------------------------------------------------------------------
# Additional Methods
def to_numpy(self, dtype=None, copy=False):
"""
Convert the PandasArray to a :class:`numpy.ndarray`.
By default, this requires no coercion or copying of data.
Parameters
----------
dtype : numpy.dtype
The NumPy dtype to pass to :func:`numpy.asarray`.
copy : bool, default False
Whether to copy the underlying data.
Returns
-------
ndarray
"""
result = np.asarray(self._ndarray, dtype=dtype)
if copy and result is self._ndarray:
result = result.copy()
return result
@Appender(ExtensionArray.searchsorted.__doc__)
def searchsorted(self, value, side="left", sorter=None):
return searchsorted(self.to_numpy(), value, side=side, sorter=sorter)
# ------------------------------------------------------------------------
# Ops
def __invert__(self):
return type(self)(~self._ndarray)
@classmethod
def _create_arithmetic_method(cls, op):
def arithmetic_method(self, other):
if isinstance(other, (ABCIndexClass, ABCSeries)):
return NotImplemented
elif isinstance(other, cls):
other = other._ndarray
with np.errstate(all="ignore"):
result = op(self._ndarray, other)
if op is divmod:
a, b = result
return cls(a), cls(b)
return cls(result)
return compat.set_function_name(
arithmetic_method, "__{}__".format(op.__name__), cls
)
_create_comparison_method = _create_arithmetic_method
PandasArray._add_arithmetic_ops()
PandasArray._add_comparison_ops()
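# Minimal usage sketch (illustrative comment only): PandasArray wraps a 1-d
# ndarray and picks up ops and reductions from the mixins registered above.
# >>> arr = PandasArray(np.array([1, 2, 3]))
# >>> arr.sum()
# 6
# >>> arr + 1
# <PandasArray>
# [2, 3, 4]
# Length: 3, dtype: int64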

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,502 @@
"""
Misc tools for implementing data structures
Note: pandas.core.common is *not* part of the public API.
"""
import collections
from collections import OrderedDict, abc
from datetime import datetime, timedelta
from functools import partial
import inspect
from typing import Any, Iterable, Union
import numpy as np
from pandas._libs import lib, tslibs
from pandas.compat import PY36
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_extension_array_dtype,
is_integer,
)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import _iterable_not_string
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
class SettingWithCopyError(ValueError):
pass
class SettingWithCopyWarning(Warning):
pass
def flatten(l):
"""
Flatten an arbitrarily nested sequence.
Parameters
----------
l : sequence
        The non-string sequence to flatten
    Notes
    -----
    Strings are not treated as nested sequences.
Returns
-------
flattened : generator
"""
for el in l:
if _iterable_not_string(el):
for s in flatten(el):
yield s
else:
yield el
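# Illustrative sketch (comment only): strings are yielded whole, nested
# sequences are flattened recursively.
# >>> list(flatten([1, [2, (3, 4)], "ab"]))
# [1, 2, 3, 4, 'ab']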
def consensus_name_attr(objs):
name = objs[0].name
for obj in objs[1:]:
try:
if obj.name != name:
name = None
except ValueError:
name = None
return name
def maybe_box(indexer, values, obj, key):
# if we have multiples coming back, box em
if isinstance(values, np.ndarray):
return obj[indexer.get_loc(key)]
# return the value
return values
def maybe_box_datetimelike(value):
# turn a datetime like into a Timestamp/timedelta as needed
if isinstance(value, (np.datetime64, datetime)):
value = tslibs.Timestamp(value)
elif isinstance(value, (np.timedelta64, timedelta)):
value = tslibs.Timedelta(value)
return value
values_from_object = lib.values_from_object
def is_bool_indexer(key: Any) -> bool:
"""
Check whether `key` is a valid boolean indexer.
Parameters
----------
key : Any
Only list-likes may be considered boolean indexers.
All other types are not considered a boolean indexer.
For array-like input, boolean ndarrays or ExtensionArrays
with ``_is_boolean`` set are considered boolean indexers.
Returns
-------
bool
Raises
------
ValueError
When the array is an object-dtype ndarray or ExtensionArray
and contains missing values.
"""
na_msg = "cannot index with vector containing NA / NaN values"
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
is_array_like(key) and is_extension_array_dtype(key.dtype)
):
if key.dtype == np.object_:
key = np.asarray(values_from_object(key))
if not lib.is_bool_array(key):
if isna(key).any():
raise ValueError(na_msg)
return False
return True
elif is_bool_dtype(key.dtype):
# an ndarray with bool-dtype by definition has no missing values.
# So we only need to check for NAs in ExtensionArrays
if is_extension_array_dtype(key.dtype):
if np.any(key.isna()):
raise ValueError(na_msg)
return True
elif isinstance(key, list):
try:
arr = np.asarray(key)
return arr.dtype == np.bool_ and len(arr) == len(key)
except TypeError: # pragma: no cover
return False
return False
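# Illustrative sketch of the accepted and rejected shapes (comment only):
# >>> is_bool_indexer(np.array([True, False]))
# True
# >>> is_bool_indexer([True, None])
# False
# >>> is_bool_indexer(np.array([True, np.nan], dtype=object))  # raises ValueError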
def cast_scalar_indexer(val):
"""
To avoid numpy DeprecationWarnings, cast float to integer where valid.
Parameters
----------
val : scalar
Returns
-------
outval : scalar
"""
# assumes lib.is_scalar(val)
if lib.is_float(val) and val == int(val):
return int(val)
return val
def _not_none(*args):
"""
Returns a generator consisting of the arguments that are not None.
"""
return (arg for arg in args if arg is not None)
def _any_none(*args):
"""
Returns a boolean indicating if any argument is None.
"""
for arg in args:
if arg is None:
return True
return False
def _all_none(*args):
"""
Returns a boolean indicating if all arguments are None.
"""
for arg in args:
if arg is not None:
return False
return True
def _any_not_none(*args):
"""
Returns a boolean indicating if any argument is not None.
"""
for arg in args:
if arg is not None:
return True
return False
def _all_not_none(*args):
"""
Returns a boolean indicating if all arguments are not None.
"""
for arg in args:
if arg is None:
return False
return True
def count_not_none(*args):
"""
Returns the count of arguments that are not None.
"""
return sum(x is not None for x in args)
def try_sort(iterable):
listed = list(iterable)
try:
return sorted(listed)
except Exception:
return listed
def dict_keys_to_ordered_list(mapping):
# when pandas drops support for Python < 3.6, this function
# can be replaced by a simple list(mapping.keys())
if PY36 or isinstance(mapping, OrderedDict):
keys = list(mapping.keys())
else:
keys = try_sort(mapping)
return keys
def asarray_tuplesafe(values, dtype=None):
if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
values = list(values)
elif isinstance(values, ABCIndexClass):
return values.values
if isinstance(values, list) and dtype in [np.object_, object]:
return construct_1d_object_array_from_listlike(values)
result = np.asarray(values, dtype=dtype)
if issubclass(result.dtype.type, str):
result = np.asarray(values, dtype=object)
if result.ndim == 2:
# Avoid building an array of arrays:
# TODO: verify whether any path hits this except #18819 (invalid)
values = [tuple(x) for x in values]
result = construct_1d_object_array_from_listlike(values)
return result
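# Illustrative sketch (comment only): a list of tuples stays a 1-d object
# array of tuples instead of collapsing into a 2-d integer array.
# >>> asarray_tuplesafe([(1, 2), (3, 4)]).shape
# (2,)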
def index_labels_to_array(labels, dtype=None):
"""
Transform label or iterable of labels to array, for use in Index.
Parameters
----------
dtype : dtype
If specified, use as dtype of the resulting array, otherwise infer.
Returns
-------
array
"""
if isinstance(labels, (str, tuple)):
labels = [labels]
if not isinstance(labels, (list, np.ndarray)):
try:
labels = list(labels)
except TypeError: # non-iterable
labels = [labels]
labels = asarray_tuplesafe(labels, dtype=dtype)
return labels
def maybe_make_list(obj):
if obj is not None and not isinstance(obj, (tuple, list)):
return [obj]
return obj
def maybe_iterable_to_list(obj: Union[Iterable, Any]) -> Union[list, Any]:
"""
If obj is Iterable but not list-like, consume into list.
"""
if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
return list(obj)
return obj
def is_null_slice(obj):
"""
We have a null slice.
"""
return (
isinstance(obj, slice)
and obj.start is None
and obj.stop is None
and obj.step is None
)
def is_true_slices(l):
"""
Find non-trivial slices in "l": return a list of booleans with same length.
"""
return [isinstance(k, slice) and not is_null_slice(k) for k in l]
# TODO: used only once in indexing; belongs elsewhere?
def is_full_slice(obj, l):
"""
We have a full length slice.
"""
return (
isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None
)
def get_callable_name(obj):
# typical case has name
if hasattr(obj, "__name__"):
return getattr(obj, "__name__")
# some objects don't; could recurse
if isinstance(obj, partial):
return get_callable_name(obj.func)
# fall back to class name
if hasattr(obj, "__call__"):
return obj.__class__.__name__
# everything failed (probably because the argument
# wasn't actually callable); we return None
# instead of the empty string in this case to allow
# distinguishing between no name and a name of ''
return None
def apply_if_callable(maybe_callable, obj, **kwargs):
"""
Evaluate possibly callable input using obj and kwargs if it is callable,
otherwise return as it is.
Parameters
----------
maybe_callable : possibly a callable
obj : NDFrame
**kwargs
"""
if callable(maybe_callable):
return maybe_callable(obj, **kwargs)
return maybe_callable
def dict_compat(d):
"""
Helper function to convert datetimelike-keyed dicts
to Timestamp-keyed dict.
Parameters
----------
d: dict like object
Returns
-------
dict
"""
return {maybe_box_datetimelike(key): value for key, value in d.items()}
def standardize_mapping(into):
"""
Helper function to standardize a supplied mapping.
.. versionadded:: 0.21.0
Parameters
----------
into : instance or subclass of collections.abc.Mapping
Must be a class, an initialized collections.defaultdict,
or an instance of a collections.abc.Mapping subclass.
Returns
-------
mapping : a collections.abc.Mapping subclass or other constructor
a callable object that can accept an iterator to create
the desired Mapping.
See Also
--------
DataFrame.to_dict
Series.to_dict
"""
if not inspect.isclass(into):
if isinstance(into, collections.defaultdict):
return partial(collections.defaultdict, into.default_factory)
into = type(into)
if not issubclass(into, abc.Mapping):
raise TypeError("unsupported type: {into}".format(into=into))
elif into == collections.defaultdict:
raise TypeError("to_dict() only accepts initialized defaultdicts")
return into
def random_state(state=None):
"""
Helper function for processing random_state arguments.
Parameters
----------
state : int, np.random.RandomState, None.
        If an int, it is passed to np.random.RandomState() as the seed.
        If an np.random.RandomState object, it is returned unchanged.
        If `None`, the np.random module is returned.
        If anything else, an informative ValueError is raised.
Default None.
Returns
-------
np.random.RandomState
"""
if is_integer(state):
return np.random.RandomState(state)
elif isinstance(state, np.random.RandomState):
return state
elif state is None:
return np.random
else:
raise ValueError(
"random_state must be an integer, a numpy " "RandomState, or None"
)
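# Illustrative sketch (comment only):
# >>> random_state(42)  # RandomState seeded with 42
# >>> random_state(None) is np.random
# True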
def _pipe(obj, func, *args, **kwargs):
"""
Apply a function ``func`` to object ``obj`` either by passing obj as the
first argument to the function or, in the case that the func is a tuple,
interpret the first element of the tuple as a function and pass the obj to
that function as a keyword argument whose key is the value of the second
element of the tuple.
Parameters
----------
func : callable or tuple of (callable, string)
Function to apply to this object or, alternatively, a
``(callable, data_keyword)`` tuple where ``data_keyword`` is a
        string indicating the keyword of ``callable`` that expects the
object.
args : iterable, optional
positional arguments passed into ``func``.
kwargs : dict, optional
a dictionary of keyword arguments passed into ``func``.
Returns
-------
object : the return type of ``func``.
"""
if isinstance(func, tuple):
func, target = func
if target in kwargs:
msg = "%s is both the pipe target and a keyword argument" % target
raise ValueError(msg)
kwargs[target] = obj
return func(*args, **kwargs)
else:
return func(obj, *args, **kwargs)
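# Illustrative sketch of the two call shapes (comment only; `func` and `df`
# are hypothetical names):
# _pipe(df, func, 1)             -> func(df, 1)
# _pipe(df, (func, "data"), x=1) -> func(x=1, data=df)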
def _get_rename_function(mapper):
"""
Returns a function that will map names/labels, dependent if mapper
is a dict, Series or just a function.
"""
if isinstance(mapper, (abc.Mapping, ABCSeries)):
def f(x):
if x in mapper:
return mapper[x]
else:
return x
else:
f = mapper
return f


@@ -0,0 +1,175 @@
"""Core eval alignment algorithms
"""
from functools import partial, wraps
import warnings
import numpy as np
from pandas.errors import PerformanceWarning
import pandas as pd
import pandas.core.common as com
from pandas.core.computation.common import _result_type_many
def _align_core_single_unary_op(term):
if isinstance(term.value, np.ndarray):
typ = partial(np.asanyarray, dtype=term.value.dtype)
else:
typ = type(term.value)
ret = (typ,)
if not hasattr(term.value, "axes"):
ret += (None,)
else:
ret += (_zip_axes_from_type(typ, term.value.axes),)
return ret
def _zip_axes_from_type(typ, new_axes):
axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()}
return axes
def _any_pandas_objects(terms):
"""Check a sequence of terms for instances of PandasObject."""
return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms)
def _filter_special_cases(f):
@wraps(f)
def wrapper(terms):
# single unary operand
if len(terms) == 1:
return _align_core_single_unary_op(terms[0])
term_values = (term.value for term in terms)
# we don't have any pandas objects
if not _any_pandas_objects(terms):
return _result_type_many(*term_values), None
return f(terms)
return wrapper
@_filter_special_cases
def _align_core(terms):
term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
term_dims = [terms[i].value.ndim for i in term_index]
ndims = pd.Series(dict(zip(term_index, term_dims)))
# initial axes are the axes of the largest-axis'd term
biggest = terms[ndims.idxmax()].value
typ = biggest._constructor
axes = biggest.axes
naxes = len(axes)
gt_than_one_axis = naxes > 1
for value in (terms[i].value for i in term_index):
is_series = isinstance(value, pd.Series)
is_series_and_gt_one_axis = is_series and gt_than_one_axis
for axis, items in enumerate(value.axes):
if is_series_and_gt_one_axis:
ax, itm = naxes - 1, value.index
else:
ax, itm = axis, items
if not axes[ax].is_(itm):
axes[ax] = axes[ax].join(itm, how="outer")
for i, ndim in ndims.items():
for axis, items in zip(range(ndim), axes):
ti = terms[i].value
if hasattr(ti, "reindex"):
transpose = isinstance(ti, pd.Series) and naxes > 1
reindexer = axes[naxes - 1] if transpose else items
term_axis_size = len(ti.axes[axis])
reindexer_size = len(reindexer)
ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
if ordm >= 1 and reindexer_size >= 10000:
w = (
"Alignment difference on axis {axis} is larger "
"than an order of magnitude on term {term!r}, by "
"more than {ordm:.4g}; performance may suffer"
).format(axis=axis, term=terms[i].name, ordm=ordm)
warnings.warn(w, category=PerformanceWarning, stacklevel=6)
f = partial(ti.reindex, reindexer, axis=axis, copy=False)
terms[i].update(f())
terms[i].update(terms[i].value.values)
return typ, _zip_axes_from_type(typ, axes)
def _align(terms):
"""Align a set of terms"""
try:
# flatten the parse tree (a nested list, really)
terms = list(com.flatten(terms))
except TypeError:
# can't iterate so it must just be a constant or single variable
if isinstance(terms.value, pd.core.generic.NDFrame):
typ = type(terms.value)
return typ, _zip_axes_from_type(typ, terms.value.axes)
return np.result_type(terms.type), None
# if all resolved variables are numeric scalars
if all(term.is_scalar for term in terms):
return _result_type_many(*(term.value for term in terms)).type, None
# perform the main alignment
typ, axes = _align_core(terms)
return typ, axes
def _reconstruct_object(typ, obj, axes, dtype):
"""Reconstruct an object given its type, raw value, and possibly empty
(None) axes.
Parameters
----------
typ : object
A type
obj : object
The value to use in the type constructor
axes : dict
The axes to use to construct the resulting pandas object
Returns
-------
ret : typ
An object of type ``typ`` with the value `obj` and possible axes
`axes`.
"""
try:
typ = typ.type
except AttributeError:
pass
res_t = np.result_type(obj.dtype, dtype)
if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject):
return typ(obj, dtype=res_t, **axes)
# special case for pathological things like ~True/~False
if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
ret_value = res_t.type(obj)
else:
ret_value = typ(obj).astype(res_t)
# The condition is to distinguish 0-dim array (returned in case of
# scalar) and 1 element array
# e.g. np.array(0) and np.array([0])
if len(obj.shape) == 1 and len(obj) == 1:
if not isinstance(ret_value, np.ndarray):
ret_value = np.array([ret_value]).astype(res_t)
return ret_value


@@ -0,0 +1,3 @@
# flake8: noqa
from pandas.core.computation.eval import eval


@@ -0,0 +1,10 @@
from pandas.compat._optional import import_optional_dependency
ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn")
_NUMEXPR_INSTALLED = ne is not None
if _NUMEXPR_INSTALLED:
_NUMEXPR_VERSION = ne.__version__
else:
_NUMEXPR_VERSION = None
__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"]


@@ -0,0 +1,38 @@
from functools import reduce
import numpy as np
import pandas as pd
# A token value Python's tokenizer probably will never use.
_BACKTICK_QUOTED_STRING = 100
def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, (np.bytes_, bytes)):
s = s.decode(pd.get_option("display.encoding"))
return s
def _result_type_many(*arrays_and_dtypes):
""" wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
argument limit """
try:
return np.result_type(*arrays_and_dtypes)
except ValueError:
# we have > NPY_MAXARGS terms in our expression
return reduce(np.result_type, arrays_and_dtypes)
def _remove_spaces_column_name(name):
"""Check if name contains any spaces, if it contains any spaces
the spaces will be removed and an underscore suffix is added."""
if not isinstance(name, str) or " " not in name:
return name
return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"
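# Illustrative sketch (comment only):
# >>> _remove_spaces_column_name("my col")
# 'my_col_BACKTICK_QUOTED_STRING'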
class NameResolutionError(NameError):
pass


@@ -0,0 +1,148 @@
"""
Engine classes for :func:`~pandas.eval`
"""
import abc
from pandas.core.computation.align import _align, _reconstruct_object
from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions
import pandas.io.formats.printing as printing
_ne_builtins = frozenset(_mathops + _reductions)
class NumExprClobberingError(NameError):
pass
def _check_ne_builtin_clash(expr):
"""Attempt to prevent foot-shooting in a helpful way.
Parameters
----------
    expr : Expr
        An expression whose variable names may clash with numexpr builtins.
"""
names = expr.names
overlap = names & _ne_builtins
if overlap:
s = ", ".join(map(repr, overlap))
raise NumExprClobberingError(
'Variables in expression "{expr}" '
"overlap with builtins: ({s})".format(expr=expr, s=s)
)
class AbstractEngine(metaclass=abc.ABCMeta):
"""Object serving as a base class for all engines."""
has_neg_frac = False
def __init__(self, expr):
self.expr = expr
self.aligned_axes = None
self.result_type = None
def convert(self):
"""Convert an expression for evaluation.
Defaults to return the expression as a string.
"""
return printing.pprint_thing(self.expr)
def evaluate(self):
"""Run the engine on the expression
This method performs alignment which is necessary no matter what engine
is being used, thus its implementation is in the base class.
Returns
-------
obj : object
The result of the passed expression.
"""
if not self._is_aligned:
self.result_type, self.aligned_axes = _align(self.expr.terms)
# make sure no names in resolvers and locals/globals clash
res = self._evaluate()
return _reconstruct_object(
self.result_type, res, self.aligned_axes, self.expr.terms.return_type
)
@property
def _is_aligned(self):
return self.aligned_axes is not None and self.result_type is not None
@abc.abstractmethod
def _evaluate(self):
"""Return an evaluated expression.
        The local and global environment in which to evaluate the
        expression is taken from ``self.expr.env``.
Notes
-----
Must be implemented by subclasses.
"""
pass
class NumExprEngine(AbstractEngine):
"""NumExpr engine class"""
has_neg_frac = True
def __init__(self, expr):
super().__init__(expr)
def convert(self):
return str(super().convert())
def _evaluate(self):
import numexpr as ne
# convert the expression to a valid numexpr expression
s = self.convert()
try:
env = self.expr.env
scope = env.full_scope
truediv = scope["truediv"]
_check_ne_builtin_clash(self.expr)
return ne.evaluate(s, local_dict=scope, truediv=truediv)
except KeyError as e:
# python 3 compat kludge
try:
msg = e.message
except AttributeError:
msg = str(e)
raise UndefinedVariableError(msg)
class PythonEngine(AbstractEngine):
"""Evaluate an expression in Python space.
Mostly for testing purposes.
"""
has_neg_frac = False
def __init__(self, expr):
super().__init__(expr)
def evaluate(self):
return self.expr()
def _evaluate(self):
pass
_engines = {"numexpr": NumExprEngine, "python": PythonEngine}
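# Usage sketch (illustrative comment only): pandas.eval looks up an engine
# class in this mapping and runs it, roughly _engines[engine](parsed_expr).evaluate();
# "numexpr" additionally requires the optional numexpr package.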


@@ -0,0 +1,380 @@
#!/usr/bin/env python
"""
Top level ``eval`` module.
"""
import tokenize
import warnings
from pandas.util._validators import validate_bool_kwarg
from pandas.core.computation.engines import _engines
from pandas.core.computation.scope import _ensure_scope
from pandas.io.formats.printing import pprint_thing
def _check_engine(engine):
"""
Make sure a valid engine is passed.
Parameters
----------
engine : str
Raises
------
KeyError
* If an invalid engine is passed
ImportError
* If numexpr was requested but doesn't exist
Returns
-------
string engine
"""
from pandas.core.computation.check import _NUMEXPR_INSTALLED
if engine is None:
if _NUMEXPR_INSTALLED:
engine = "numexpr"
else:
engine = "python"
if engine not in _engines:
valid = list(_engines.keys())
raise KeyError(
"Invalid engine {engine!r} passed, valid engines are"
" {valid}".format(engine=engine, valid=valid)
)
# TODO: validate this in a more general way (thinking of future engines
# that won't necessarily be import-able)
# Could potentially be done on engine instantiation
if engine == "numexpr":
if not _NUMEXPR_INSTALLED:
raise ImportError(
"'numexpr' is not installed or an "
"unsupported version. Cannot use "
"engine='numexpr' for query/eval "
"if 'numexpr' is not installed"
)
return engine
def _check_parser(parser):
"""
Make sure a valid parser is passed.
Parameters
----------
parser : str
Raises
------
KeyError
* If an invalid parser is passed
"""
from pandas.core.computation.expr import _parsers
if parser not in _parsers:
raise KeyError(
"Invalid parser {parser!r} passed, valid parsers are"
" {valid}".format(parser=parser, valid=_parsers.keys())
)
def _check_resolvers(resolvers):
if resolvers is not None:
for resolver in resolvers:
if not hasattr(resolver, "__getitem__"):
name = type(resolver).__name__
raise TypeError(
"Resolver of type {name!r} does not implement "
"the __getitem__ method".format(name=name)
)
def _check_expression(expr):
"""
Make sure an expression is not an empty string
Parameters
----------
expr : object
An object that can be converted to a string
Raises
------
ValueError
* If expr is an empty string
"""
if not expr:
raise ValueError("expr cannot be an empty string")
def _convert_expression(expr):
"""
Convert an object to an expression.
    This function converts an object to an expression (a unicode string) and
checks to make sure it isn't empty after conversion. This is used to
convert operators to their string representation for recursive calls to
:func:`~pandas.eval`.
Parameters
----------
expr : object
The object to be converted to a string.
Returns
-------
s : unicode
The string representation of an object.
Raises
------
ValueError
* If the expression is empty.
"""
s = pprint_thing(expr)
_check_expression(s)
return s
def _check_for_locals(expr, stack_level, parser):
from pandas.core.computation.expr import tokenize_string
at_top_of_stack = stack_level == 0
not_pandas_parser = parser != "pandas"
if not_pandas_parser:
msg = "The '@' prefix is only supported by the pandas parser"
elif at_top_of_stack:
msg = (
"The '@' prefix is not allowed in "
"top-level eval calls, \nplease refer to "
"your variables by name without the '@' "
"prefix"
)
if at_top_of_stack or not_pandas_parser:
for toknum, tokval in tokenize_string(expr):
if toknum == tokenize.OP and tokval == "@":
raise SyntaxError(msg)
def eval(
expr,
parser="pandas",
engine=None,
truediv=True,
local_dict=None,
global_dict=None,
resolvers=(),
level=0,
target=None,
inplace=False,
):
"""
Evaluate a Python expression as a string using various backends.
The following arithmetic operations are supported: ``+``, ``-``, ``*``,
``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
:keyword:`or`, and :keyword:`not` with the same semantics as the
corresponding bitwise operators. :class:`~pandas.Series` and
:class:`~pandas.DataFrame` objects are supported and behave as they would
with plain ol' Python evaluation.
Parameters
----------
expr : str or unicode
The expression to evaluate. This string cannot contain any Python
`statements
<https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
only Python `expressions
<https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
parser : string, default 'pandas', {'pandas', 'python'}
The parser to use to construct the syntax tree from the expression. The
    default of ``'pandas'`` parses code slightly differently than standard
Python. Alternatively, you can parse an expression using the
``'python'`` parser to retain strict Python semantics. See the
:ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
engine : string or None, default 'numexpr', {'python', 'numexpr'}
The engine used to evaluate the expression. Supported engines are
- None : tries to use ``numexpr``, falls back to ``python``
- ``'numexpr'``: This default engine evaluates pandas objects using
numexpr for large speed ups in complex expressions
with large frames.
- ``'python'``: Performs operations as if you had ``eval``'d in top
level python. This engine is generally not that useful.
More backends may be available in the future.
truediv : bool, optional
Whether to use true division, like in Python >= 3
local_dict : dict or None, optional
A dictionary of local variables, taken from locals() by default.
global_dict : dict or None, optional
A dictionary of global variables, taken from globals() by default.
resolvers : list of dict-like or None, optional
A list of objects implementing the ``__getitem__`` special method that
you can use to inject an additional collection of namespaces to use for
variable lookup. For example, this is used in the
:meth:`~DataFrame.query` method to inject the
``DataFrame.index`` and ``DataFrame.columns``
variables that refer to their respective :class:`~pandas.DataFrame`
instance attributes.
level : int, optional
The number of prior stack frames to traverse and add to the current
scope. Most users will **not** need to change this parameter.
target : object, optional, default None
This is the target object for assignment. It is used when there is
variable assignment in the expression. If so, then `target` must
support item assignment with string keys, and if a copy is being
returned, it must also support `.copy()`.
inplace : bool, default False
If `target` is provided, and the expression mutates `target`, whether
to modify `target` inplace. Otherwise, return a copy of `target` with
the mutation.
Returns
-------
ndarray, numeric scalar, DataFrame, Series
Raises
------
ValueError
There are many instances where such an error can be raised:
- `target=None`, but the expression is multiline.
    - The expression is multiline, but not all of them have item assignment.
An example of such an arrangement is this:
a = b + 1
a + 2
Here, there are expressions on different lines, making it multiline,
but the last line has no variable assigned to the output of `a + 2`.
- `inplace=True`, but the expression is missing item assignment.
- Item assignment is provided, but the `target` does not support
string item assignment.
- Item assignment is provided and `inplace=False`, but the `target`
does not support the `.copy()` method
See Also
--------
DataFrame.query
DataFrame.eval
Notes
-----
The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
recursively cast to ``float64``.
See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
more details.
"""
from pandas.core.computation.expr import Expr
inplace = validate_bool_kwarg(inplace, "inplace")
if isinstance(expr, str):
_check_expression(expr)
exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
else:
exprs = [expr]
multi_line = len(exprs) > 1
if multi_line and target is None:
raise ValueError(
"multi-line expressions are only valid in the "
"context of data, use DataFrame.eval"
)
ret = None
first_expr = True
target_modified = False
for expr in exprs:
expr = _convert_expression(expr)
engine = _check_engine(engine)
_check_parser(parser)
_check_resolvers(resolvers)
_check_for_locals(expr, level, parser)
# get our (possibly passed-in) scope
env = _ensure_scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv)
# construct the engine and evaluate the parsed expression
eng = _engines[engine]
eng_inst = eng(parsed_expr)
ret = eng_inst.evaluate()
if parsed_expr.assigner is None:
if multi_line:
raise ValueError(
"Multi-line expressions are only valid"
" if all expressions contain an assignment"
)
elif inplace:
raise ValueError("Cannot operate inplace " "if there is no assignment")
# assign if needed
assigner = parsed_expr.assigner
if env.target is not None and assigner is not None:
target_modified = True
# if returning a copy, copy only on the first assignment
if not inplace and first_expr:
try:
target = env.target.copy()
except AttributeError:
raise ValueError("Cannot return a copy of the target")
else:
target = env.target
# TypeError is most commonly raised (e.g. int, list), but you
# get IndexError if you try to do this assignment on np.ndarray.
# we will ignore numpy warnings here; e.g. if trying
# to use a non-numeric indexer
try:
with warnings.catch_warnings(record=True):
# TODO: Filter the warnings we actually care about here.
target[assigner] = ret
except (TypeError, IndexError):
raise ValueError("Cannot assign expression output to target")
if not resolvers:
resolvers = ({assigner: ret},)
else:
                # existing resolver needs to be updated to handle the
                # case of mutating an existing column in a copy
for resolver in resolvers:
if assigner in resolver:
resolver[assigner] = ret
break
else:
resolvers += ({assigner: ret},)
ret = None
first_expr = False
# We want to exclude `inplace=None` as being False.
if inplace is False:
return target if target_modified else ret
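# Minimal usage sketch (illustrative comment only, assuming pandas has been
# imported as pd):
# >>> x = 5
# >>> pd.eval("x + 6")
# 11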


@@ -0,0 +1,854 @@
""":func:`~pandas.eval` parsers
"""
import ast
from functools import partial, reduce
from io import StringIO
import itertools as it
import operator
import tokenize
from typing import Type
import numpy as np
import pandas as pd
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING,
_remove_spaces_column_name,
)
from pandas.core.computation.ops import (
_LOCAL_TAG,
BinOp,
Constant,
Div,
FuncNode,
Op,
Term,
UnaryOp,
UndefinedVariableError,
_arith_ops_syms,
_bool_ops_syms,
_cmp_ops_syms,
_mathops,
_reductions,
_unary_ops_syms,
is_term,
)
from pandas.core.computation.scope import Scope
import pandas.io.formats.printing as printing
def tokenize_string(source):
"""Tokenize a Python source code string.
Parameters
----------
source : str
A Python source code string
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)
# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == "`":
tokval = " ".join(
it.takewhile(
lambda tokval: tokval != "`",
map(operator.itemgetter(1), token_generator),
)
)
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval
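# Illustrative sketch (not part of the original file): everything between a
# pair of backticks is collapsed into one _BACKTICK_QUOTED_STRING token, so
# "`my col` > 5" yields a single token whose value is roughly "my col",
# followed by the ordinary tokens for "> 5".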
def _rewrite_assign(tok):
"""Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input token or the replacement values
"""
toknum, tokval = tok
return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok):
"""Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
precedence is changed to boolean precedence.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input token or the replacement values
"""
toknum, tokval = tok
if toknum == tokenize.OP:
if tokval == "&":
return tokenize.NAME, "and"
elif tokval == "|":
return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
def _replace_locals(tok):
"""Replace local variables with a syntactically valid name.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input token or the replacement values
Notes
-----
This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, _LOCAL_TAG
return toknum, tokval
def _clean_spaces_backtick_quoted_names(tok):
"""Clean up a column name if surrounded by backticks.
Backtick quoted strings are indicated by a certain tokval value. If a string
is a backtick quoted token it will be processed by
:func:`_remove_spaces_column_name` so that the parser can find this
string when the query is executed.
See also :meth:`NDFrame._get_space_character_free_column_resolver`.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input token or the replacement values
"""
toknum, tokval = tok
if toknum == _BACKTICK_QUOTED_STRING:
return tokenize.NAME, _remove_spaces_column_name(tokval)
return toknum, tokval
def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
def _compose(*funcs):
"""Compose 2 or more callables"""
assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
def _preparse(
source,
f=_compose(
_replace_locals,
_replace_booleans,
_rewrite_assign,
_clean_spaces_backtick_quoted_names,
),
):
"""Compose a collection of tokenization functions
Parameters
----------
source : str
A Python source code string
f : callable
This takes a tuple of (toknum, tokval) as its argument and returns a
tuple with the same structure but possibly different elements. Defaults
to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
``_replace_locals``.
Returns
-------
s : str
Valid Python source code
Notes
-----
The `f` parameter can be any callable that takes *and* returns input of the
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
assert callable(f), "f must be callable"
return tokenize.untokenize((f(x) for x in tokenize_string(source)))
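# Illustrative sketch (not part of the original file): with the default
# pipeline, "&"/"|" become "and"/"or", "@x" becomes "__pd_eval_local_x",
# a bare "=" becomes "==", and a backtick-quoted column name is replaced by
# the space-free identifier produced by _remove_spaces_column_name; exact
# whitespace in the output is up to tokenize.untokenize.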
def _is_type(t):
"""Factory for a type checking function of type ``t`` or tuple of types."""
return lambda x: isinstance(x.value, t)
_is_list = _is_type(list)
_is_str = _is_type(str)
# partition all AST nodes
_all_nodes = frozenset(
filter(
lambda x: isinstance(x, type) and issubclass(x, ast.AST),
(getattr(ast, node) for node in dir(ast)),
)
)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""Filter out AST nodes that are subclasses of ``superclass``."""
node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_slice_nodes = _filter_nodes(ast.slice)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)
# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
_unsupported_expr_nodes = frozenset(
[
"Yield",
"GeneratorExp",
"IfExp",
"DictComp",
"SetComp",
"Repr",
"Lambda",
"Set",
"AST",
"Is",
"IsNot",
]
)
# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
_stmt_nodes
| _mod_nodes
| _handler_nodes
| _arguments_nodes
| _keyword_nodes
| _alias_nodes
| _expr_context_nodes
| _unsupported_expr_nodes
) - _hacked_nodes
# we're rewriting assignment in some cases to be an equality comparison, and
# we don't want `stmt` and friends in there, so keep only the classes whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
_msg = "cannot both support and not support {intersection}".format(
intersection=_unsupported_nodes & _base_supported_nodes
)
assert not _unsupported_nodes & _base_supported_nodes, _msg
def _node_not_implemented(node_name, cls):
"""Return a function that raises a NotImplementedError with a passed node
name.
"""
def f(self, *args, **kwargs):
raise NotImplementedError(
"{name!r} nodes are not " "implemented".format(name=node_name)
)
return f
def disallow(nodes):
"""Decorator to disallow certain nodes from parsing. Raises a
NotImplementedError instead.
Returns
-------
disallowed : callable
"""
def disallowed(cls):
cls.unsupported_nodes = ()
for node in nodes:
new_method = _node_not_implemented(node, cls)
name = "visit_{node}".format(node=node)
cls.unsupported_nodes += (name,)
setattr(cls, name, new_method)
return cls
return disallowed
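# Illustrative sketch (not part of the original file): decorating a visitor
# with ``@disallow(["Yield"])`` attaches a ``visit_Yield`` method that raises
# NotImplementedError, so such nodes are rejected during the AST walk instead
# of being silently mis-handled.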
def _op_maker(op_class, op_symbol):
"""Return a function to create an op class with its symbol already passed.
Returns
-------
f : callable
"""
def f(self, node, *args, **kwargs):
"""Return a partial function with an Op subclass with an operator
already passed.
Returns
-------
f : callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
return f
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""Decorator to add default implementation of ops."""
def f(cls):
for op_attr_name, op_class in op_classes.items():
ops = getattr(cls, "{name}_ops".format(name=op_attr_name))
ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name))
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
setattr(cls, "visit_{node}".format(node=op_node), made_op)
return cls
return f
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
"""Custom ast walker. Parsers of other engines should subclass this class
if necessary.
Parameters
----------
env : Scope
engine : str
parser : str
preparser : callable
"""
const_type = Constant # type: Type[Term]
term_type = Term
binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
binary_op_nodes = (
"Gt",
"Lt",
"GtE",
"LtE",
"Eq",
"NotEq",
"In",
"NotIn",
"BitAnd",
"BitOr",
"And",
"Or",
"Add",
"Sub",
"Mult",
None,
"Pow",
"FloorDiv",
"Mod",
)
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = _unary_ops_syms
unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
ast.NotIn: ast.NotIn,
}
def __init__(self, env, engine, parser, preparser=_preparse):
self.env = env
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None
def visit(self, node, **kwargs):
if isinstance(node, str):
clean = self.preparser(node)
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
from keyword import iskeyword
if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier" " in numexpr query"
raise e
method = "visit_" + node.__class__.__name__
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
def visit_Expr(self, node, **kwargs):
return self.visit(node.value, **kwargs)
def _rewrite_membership_op(self, node, left, right):
# the kind of the operator (is actually an instance)
op_instance = node.op
op_type = type(op_instance)
# must be two terms and the comparison operator must be ==/!=/in/not in
if is_term(left) and is_term(right) and op_type in self.rewrite_map:
left_list, right_list = map(_is_list, (left, right))
left_str, right_str = map(_is_str, (left, right))
# if there are any strings or lists in the expression
if left_list or right_list or left_str or right_str:
op_instance = self.rewrite_map[op_type]()
# pop the string variable out of locals and replace it with a list
# of one string, kind of a hack
if right_str:
name = self.env.add_tmp([right.value])
right = self.term_type(name, self.env)
if left_str:
name = self.env.add_tmp([left.value])
left = self.term_type(name, self.env)
op = self.visit(op_instance)
return op, op_instance, left, right
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
left = self.visit(node.left, side="left")
if right is None:
right = self.visit(node.right, side="right")
op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
if (
left.is_scalar
and hasattr(left, "value")
and not right.is_scalar
and right.return_type == f32
):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
if (
right.is_scalar
and hasattr(right, "value")
and not left.is_scalar
and left.return_type == f32
):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
return left, right
def _maybe_eval(self, binop, eval_in_python):
# eval `in` and `not in` (for now) in "partial" python space
# things that can be evaluated in "eval" space will be turned into
# temporary variables. for example,
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
return binop.evaluate(
self.env, self.engine, self.parser, self.term_type, eval_in_python
)
def _maybe_evaluate_binop(
self,
op,
op_class,
lhs,
rhs,
eval_in_python=("in", "not in"),
maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
res = op(lhs, rhs)
if res.has_invalid_return_type:
raise TypeError(
"unsupported operand type(s) for {op}:"
" '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type)
)
if self.engine != "pytables":
if (
(res.op in _cmp_ops_syms and getattr(lhs, "is_datetime", False))
or getattr(rhs, "is_datetime", False)
):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
or getattr(rhs, "return_type", None) == object
):
# evaluate "==" and "!=" in python if either of our operands
# has an object return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
op, op_class, left, right = self._maybe_transform_eq_ne(node)
left, right = self._maybe_downcast_constants(left, right)
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_Div(self, node, **kwargs):
truediv = self.env.scope["truediv"]
return lambda lhs, rhs: Div(lhs, rhs, truediv)
def visit_UnaryOp(self, node, **kwargs):
op = self.visit(node.op)
operand = self.visit(node.operand)
return op(operand)
def visit_Name(self, node, **kwargs):
return self.term_type(node.id, self.env, **kwargs)
def visit_NameConstant(self, node, **kwargs):
return self.const_type(node.value, self.env)
def visit_Num(self, node, **kwargs):
return self.const_type(node.n, self.env)
def visit_Str(self, node, **kwargs):
name = self.env.add_tmp(node.s)
return self.term_type(name, self.env)
def visit_List(self, node, **kwargs):
name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
return self.term_type(name, self.env)
visit_Tuple = visit_List
def visit_Index(self, node, **kwargs):
""" df.index[4] """
return self.visit(node.value)
def visit_Subscript(self, node, **kwargs):
value = self.visit(node.value)
slobj = self.visit(node.slice)
result = pd.eval(
slobj, local_dict=self.env, engine=self.engine, parser=self.parser
)
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
lhs = pd.eval(
value, local_dict=self.env, engine=self.engine, parser=self.parser
)
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
def visit_Slice(self, node, **kwargs):
""" df.index[slice(4,6)] """
lower = node.lower
if lower is not None:
lower = self.visit(lower).value
upper = node.upper
if upper is not None:
upper = self.visit(upper).value
step = node.step
if step is not None:
step = self.visit(step).value
return slice(lower, upper, step)
def visit_Assign(self, node, **kwargs):
"""
support a single assignment node, like
c = a + b
set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers
"""
if len(node.targets) != 1:
raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError(
"left hand side of an assignment must be a " "single name"
)
if self.env.target is None:
raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
raise SyntaxError(
"left hand side of an assignment must be a " "single resolvable name"
)
return self.visit(node.value, **kwargs)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
resolved = self.visit(value).value
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
return self.term_type(name, self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__))
def visit_Call(self, node, side=None, **kwargs):
if isinstance(node.func, ast.Attribute):
res = self.visit_Attribute(node.func)
elif not isinstance(node.func, ast.Name):
raise TypeError("Only named functions are supported")
else:
try:
res = self.visit(node.func)
except UndefinedVariableError:
# Check if this is a supported function name
try:
res = FuncNode(node.func.id)
except ValueError:
# Raise original error
raise
if res is None:
raise ValueError("Invalid function call {func}".format(func=node.func.id))
if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
raise TypeError(
'Function "{name}" does not support keyword '
"arguments".format(name=res.name)
)
return res(*new_args, **kwargs)
else:
new_args = [self.visit(arg).value for arg in node.args]
for key in node.keywords:
if not isinstance(key, ast.keyword):
raise ValueError(
"keyword error in function call "
"'{func}'".format(func=node.func.id)
)
if key.arg:
kwargs[key.arg] = self.visit(key.value).value
return self.const_type(res(*new_args, **kwargs), self.env)
def translate_In(self, op):
return op
def visit_Compare(self, node, **kwargs):
ops = node.ops
comps = node.comparators
# base case: we have something like a CMP b
if len(comps) == 1:
op = self.translate_In(ops[0])
binop = ast.BinOp(op=op, left=node.left, right=comps[0])
return self.visit(binop)
# recursive case: we have a chained comparison, a CMP b CMP c, etc.
left = node.left
values = []
for op, comp in zip(ops, comps):
new_node = self.visit(
ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
)
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
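# Illustrative sketch (not part of the original file): a chained comparison
# such as ``a < b <= c`` is decomposed pairwise and recombined through
# visit_BoolOp, i.e. it is evaluated as ``(a < b) & (b <= c)``.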
def _try_visit_binop(self, bop):
if isinstance(bop, (Op, Term)):
return bop
return self.visit(bop)
def visit_BoolOp(self, node, **kwargs):
def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(_reductions + _mathops)
@disallow(
(_unsupported_nodes | _python_not_supported)
- (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
def __init__(
self,
env,
engine,
parser,
preparser=partial(
_preparse,
f=_compose(
_replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
),
),
):
super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
def __init__(self, env, engine, parser, preparser=lambda x: x):
super().__init__(env, engine, parser, preparser=preparser)
class Expr(StringMixin):
"""Object encapsulating an expression.
Parameters
----------
expr : str
engine : str, optional, default 'numexpr'
parser : str, optional, default 'pandas'
env : Scope, optional, default None
truediv : bool, optional, default True
level : int, optional, default 2
"""
def __init__(
self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0
):
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
self.env.scope["truediv"] = truediv
self._visitor = _parsers[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
def __str__(self):
return printing.pprint_thing(self.terms)
def __len__(self):
return len(self.expr)
def parse(self):
"""Parse an expression"""
return self._visitor.visit(self.expr)
@property
def names(self):
"""Get the names in an expression"""
if is_term(self.terms):
return frozenset([self.terms.name])
return frozenset(term.name for term in com.flatten(self.terms))
_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}
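# Illustrative sketch (not part of the original file): parsing a standalone
# expression and inspecting the names it references; the names resolve
# through the Scope captured from the calling frame.
#
#   x, y = 1, 2
#   parsed = Expr("x + y", engine="python", parser="python")
#   parsed.names   # frozenset({'x', 'y'})
#   parsed()       # 3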

View File

@@ -0,0 +1,263 @@
"""
Expressions
-----------
Offer fast expression evaluation through numexpr
"""
import warnings
import numpy as np
from pandas._config import get_option
from pandas._libs.lib import values_from_object
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.computation.check import _NUMEXPR_INSTALLED
if _NUMEXPR_INSTALLED:
import numexpr as ne
_TEST_MODE = None
_TEST_RESULT = None
_USE_NUMEXPR = _NUMEXPR_INSTALLED
_evaluate = None
_where = None
# the set of dtypes that we will allow pass to numexpr
_ALLOWED_DTYPES = {
"evaluate": {"int64", "int32", "float64", "float32", "bool"},
"where": {"int64", "float64", "bool"},
}
# the minimum number of elements (product of the shape) for which we will use numexpr
_MIN_ELEMENTS = 10000
def set_use_numexpr(v=True):
# set/unset to use numexpr
global _USE_NUMEXPR
if _NUMEXPR_INSTALLED:
_USE_NUMEXPR = v
# choose what we are going to do
global _evaluate, _where
if not _USE_NUMEXPR:
_evaluate = _evaluate_standard
_where = _where_standard
else:
_evaluate = _evaluate_numexpr
_where = _where_numexpr
def set_numexpr_threads(n=None):
# if we are using numexpr, set the threads to n
# otherwise reset
if _NUMEXPR_INSTALLED and _USE_NUMEXPR:
if n is None:
n = ne.detect_number_of_cores()
ne.set_num_threads(n)
def _evaluate_standard(op, op_str, a, b, **eval_kwargs):
""" standard evaluation """
if _TEST_MODE:
_store_test_result(False)
with np.errstate(all="ignore"):
return op(a, b)
def _can_use_numexpr(op, op_str, a, b, dtype_check):
""" return a boolean if we WILL be using numexpr """
if op_str is not None:
# required min elements (otherwise we are adding overhead)
if np.prod(a.shape) > _MIN_ELEMENTS:
# check for dtype compatibility
dtypes = set()
for o in [a, b]:
if hasattr(o, "dtypes"):
s = o.dtypes.value_counts()
if len(s) > 1:
return False
dtypes |= set(s.index.astype(str))
elif isinstance(o, np.ndarray):
dtypes |= {o.dtype.name}
# the allowed dtypes must be a superset of the dtypes actually present
if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
return True
return False
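# Illustrative sketch (not part of the original file): numexpr is attempted
# only for sufficiently large, homogeneously typed operands.
#
#   import operator
#   a = np.arange(10 ** 6, dtype="float64")
#   _can_use_numexpr(operator.add, "+", a, a, "evaluate")            # True
#   _can_use_numexpr(operator.add, "+", a[:10], a[:10], "evaluate")  # False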
def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs):
result = None
if _can_use_numexpr(op, op_str, a, b, "evaluate"):
try:
# we were originally called by a reversed op
# method
if reversed:
a, b = b, a
a_value = getattr(a, "values", a)
b_value = getattr(b, "values", b)
result = ne.evaluate(
"a_value {op} b_value".format(op=op_str),
local_dict={"a_value": a_value, "b_value": b_value},
casting="safe",
truediv=truediv,
**eval_kwargs
)
except ValueError as detail:
if "unknown type object" in str(detail):
pass
if _TEST_MODE:
_store_test_result(result is not None)
if result is None:
result = _evaluate_standard(op, op_str, a, b)
return result
def _where_standard(cond, a, b):
return np.where(
values_from_object(cond), values_from_object(a), values_from_object(b)
)
def _where_numexpr(cond, a, b):
result = None
if _can_use_numexpr(None, "where", a, b, "where"):
try:
cond_value = getattr(cond, "values", cond)
a_value = getattr(a, "values", a)
b_value = getattr(b, "values", b)
result = ne.evaluate(
"where(cond_value, a_value, b_value)",
local_dict={
"cond_value": cond_value,
"a_value": a_value,
"b_value": b_value,
},
casting="safe",
)
except ValueError as detail:
if "unknown type object" in str(detail):
pass
except Exception as detail:
raise TypeError(str(detail))
if result is None:
result = _where_standard(cond, a, b)
return result
# turn myself on
set_use_numexpr(get_option("compute.use_numexpr"))
def _has_bool_dtype(x):
try:
if isinstance(x, ABCDataFrame):
return "bool" in x.dtypes
else:
return x.dtype == bool
except AttributeError:
return isinstance(x, (bool, np.bool_))
def _bool_arith_check(
op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None
):
if unsupported is None:
unsupported = {"+": "|", "*": "&", "-": "^"}
if _has_bool_dtype(a) and _has_bool_dtype(b):
if op_str in unsupported:
warnings.warn(
"evaluating in Python space because the {op!r} "
"operator is not supported by numexpr for "
"the bool dtype, use {alt_op!r} instead".format(
op=op_str, alt_op=unsupported[op_str]
)
)
return False
if op_str in not_allowed:
raise NotImplementedError(
"operator {op!r} not implemented for " "bool dtypes".format(op=op_str)
)
return True
def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs):
""" evaluate and return the expression of the op on a and b
Parameters
----------
op : the actual operator (a callable)
op_str : the string version of the op
a : left operand
b : right operand
use_numexpr : whether to try to use numexpr (default True)
"""
use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
if use_numexpr:
return _evaluate(op, op_str, a, b, **eval_kwargs)
return _evaluate_standard(op, op_str, a, b)
def where(cond, a, b, use_numexpr=True):
""" evaluate the where condition cond on a and b
Parameters
----------
cond : a boolean array
a : return if cond is True
b : return if cond is False
use_numexpr : whether to try to use numexpr (default True)
"""
if use_numexpr:
return _where(cond, a, b)
return _where_standard(cond, a, b)
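# Illustrative sketch (not part of the original file): both entry points fall
# back to the standard numpy path when numexpr is unavailable, disabled, or
# the operands are too small or of unsupported dtypes.
#
#   import operator
#   a = np.arange(10.0)
#   evaluate(operator.add, "+", a, a)   # same result as a + a
#   where(a > 5, a, -a)                 # same result as np.where(a > 5, a, -a)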
def set_test_mode(v=True):
"""
Keeps track of whether numexpr was used. Stores an additional ``True``
for every successful use of evaluate with numexpr since the last
``get_test_result``
"""
global _TEST_MODE, _TEST_RESULT
_TEST_MODE = v
_TEST_RESULT = []
def _store_test_result(used_numexpr):
global _TEST_RESULT
if used_numexpr:
_TEST_RESULT.append(used_numexpr)
def get_test_result():
"""get test result and reset test_results"""
global _TEST_RESULT
res = _TEST_RESULT
_TEST_RESULT = []
return res
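# Illustrative sketch (not part of the original file): the test-mode hooks
# let the test suite check whether numexpr actually ran.
#
#   import operator
#   set_test_mode(True)
#   a = np.arange(10 ** 6, dtype="float64")
#   evaluate(operator.add, "+", a, a)
#   get_test_result()   # [True] if numexpr ran, [] if the standard path ran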

View File

@@ -0,0 +1,587 @@
"""Operator classes for eval.
"""
from datetime import datetime
from distutils.version import LooseVersion
from functools import partial
import operator as op
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.core.dtypes.common import is_list_like, is_scalar
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation.common import _ensure_decoded, _result_type_many
from pandas.core.computation.scope import _DEFAULT_GLOBALS
from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
_reductions = "sum", "prod"
_unary_math_ops = (
"sin",
"cos",
"exp",
"log",
"expm1",
"log1p",
"sqrt",
"sinh",
"cosh",
"tanh",
"arcsin",
"arccos",
"arctan",
"arccosh",
"arcsinh",
"arctanh",
"abs",
"log10",
"floor",
"ceil",
)
_binary_math_ops = ("arctan2",)
_mathops = _unary_math_ops + _binary_math_ops
_LOCAL_TAG = "__pd_eval_local_"
class UndefinedVariableError(NameError):
"""NameError subclass for local variables."""
def __init__(self, name, is_local):
if is_local:
msg = "local variable {0!r} is not defined"
else:
msg = "name {0!r} is not defined"
super().__init__(msg.format(name))
class Term(StringMixin):
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
supr_new = super(Term, klass).__new__
return supr_new(klass)
def __init__(self, name, env, side=None, encoding=None):
self._name = name
self.env = env
self.side = side
tname = str(name)
self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS
self._value = self._resolve_name()
self.encoding = encoding
@property
def local_name(self):
return self.name.replace(_LOCAL_TAG, "")
def __str__(self):
return pprint_thing(self.name)
def __call__(self, *args, **kwargs):
return self.value
def evaluate(self, *args, **kwargs):
return self
def _resolve_name(self):
res = self.env.resolve(self.local_name, is_local=self.is_local)
self.update(res)
if hasattr(res, "ndim") and res.ndim > 2:
raise NotImplementedError(
"N-dimensional objects, where N > 2," " are not supported with eval"
)
return res
def update(self, value):
"""
search order for local (i.e., @variable) variables:
scope, key_variable
[('locals', 'local_name'),
('globals', 'local_name'),
('locals', 'key'),
('globals', 'key')]
"""
key = self.name
# if it's a variable name (otherwise a constant)
if isinstance(key, str):
self.env.swapkey(self.local_name, key, new_value=value)
self.value = value
@property
def is_scalar(self):
return is_scalar(self._value)
@property
def type(self):
try:
# potentially very slow for large, mixed dtype frames
return self._value.values.dtype
except AttributeError:
try:
# ndarray
return self._value.dtype
except AttributeError:
# scalar
return type(self._value)
return_type = type
@property
def raw(self):
return pprint_thing(
"{0}(name={1!r}, type={2})"
"".format(self.__class__.__name__, self.name, self.type)
)
@property
def is_datetime(self):
try:
t = self.type.type
except AttributeError:
t = self.type
return issubclass(t, (datetime, np.datetime64))
@property
def value(self):
return self._value
@value.setter
def value(self, new_value):
self._value = new_value
@property
def name(self):
return self._name
@property
def ndim(self):
return self._value.ndim
class Constant(Term):
def __init__(self, value, env, side=None, encoding=None):
super().__init__(value, env, side=side, encoding=encoding)
def _resolve_name(self):
return self._name
@property
def name(self):
return self.value
def __str__(self):
# in python 2, str() of a float
# can be shorter than its repr()
return repr(self.name)
_bool_op_map = {"not": "~", "and": "&", "or": "|"}
class Op(StringMixin):
"""Hold an operator of arbitrary arity
"""
def __init__(self, op, operands, *args, **kwargs):
self.op = _bool_op_map.get(op, op)
self.operands = operands
self.encoding = kwargs.get("encoding", None)
def __iter__(self):
return iter(self.operands)
def __str__(self):
"""Print a generic n-ary operator and its operands using infix
notation"""
# recurse over the operands
parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands)
return pprint_thing(" {0} ".format(self.op).join(parened))
@property
def return_type(self):
# clobber types to bool if the op is a boolean operator
if self.op in (_cmp_ops_syms + _bool_ops_syms):
return np.bool_
return _result_type_many(*(term.type for term in com.flatten(self)))
@property
def has_invalid_return_type(self):
types = self.operand_types
obj_dtype_set = frozenset([np.dtype("object")])
return self.return_type == object and types - obj_dtype_set
@property
def operand_types(self):
return frozenset(term.type for term in com.flatten(self))
@property
def is_scalar(self):
return all(operand.is_scalar for operand in self.operands)
@property
def is_datetime(self):
try:
t = self.return_type.type
except AttributeError:
t = self.return_type
return issubclass(t, (datetime, np.datetime64))
def _in(x, y):
"""Compute the vectorized membership of ``x in y`` if possible, otherwise
use Python.
"""
try:
return x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return y.isin(x)
except AttributeError:
pass
return x in y
def _not_in(x, y):
"""Compute the vectorized membership of ``x not in y`` if possible,
otherwise use Python.
"""
try:
return ~x.isin(y)
except AttributeError:
if is_list_like(x):
try:
return ~y.isin(x)
except AttributeError:
pass
return x not in y
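# Illustrative sketch (not part of the original file, assumes pandas imported
# as pd): membership prefers the vectorized ``isin`` when available.
#
#   s = pd.Series([1, 2, 3])
#   _in(s, [1, 3])      # Series([True, False, True]) via s.isin([1, 3])
#   _in([1, 3], s)      # also vectorized, via s.isin([1, 3]) on the rhs
#   _not_in(s, [1, 3])  # the elementwise negation, ~s.isin([1, 3])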
_cmp_ops_syms = ">", "<", ">=", "<=", "==", "!=", "in", "not in"
_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in
_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))
_bool_ops_syms = "&", "|", "and", "or"
_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_
_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs))
_arith_ops_syms = "+", "-", "*", "/", "**", "//", "%"
_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, op.mod)
_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs))
_special_case_arith_ops_syms = "**", "//", "%"
_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod
_special_case_arith_ops_dict = dict(
zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs)
)
_binary_ops_dict = {}
for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
_binary_ops_dict.update(d)
def _cast_inplace(terms, acceptable_dtypes, dtype):
"""Cast an expression inplace.
Parameters
----------
terms : Op
The expression that should be cast.
acceptable_dtypes : list of acceptable numpy.dtype
Will not cast if the term's dtype is in this list.
.. versionadded:: 0.19.0
dtype : str or numpy.dtype
The dtype to cast to.
"""
dt = np.dtype(dtype)
for term in terms:
if term.type in acceptable_dtypes:
continue
try:
new_value = term.value.astype(dt)
except AttributeError:
new_value = dt.type(term.value)
term.update(new_value)
def is_term(obj):
return isinstance(obj, Term)
class BinOp(Op):
"""Hold a binary operator and its operands
Parameters
----------
op : str
left : Term or Op
right : Term or Op
"""
def __init__(self, op, lhs, rhs, **kwargs):
super().__init__(op, (lhs, rhs))
self.lhs = lhs
self.rhs = rhs
self._disallow_scalar_only_bool_ops()
self.convert_values()
try:
self.func = _binary_ops_dict[op]
except KeyError:
# has to be made a list for python3
keys = list(_binary_ops_dict.keys())
raise ValueError(
"Invalid binary operator {0!r}, valid"
" operators are {1}".format(op, keys)
)
def __call__(self, env):
"""Recursively evaluate an expression in Python space.
Parameters
----------
env : Scope
Returns
-------
object
The result of an evaluated expression.
"""
# handle truediv
if self.op == "/" and env.scope["truediv"]:
self.func = op.truediv
# recurse over the left/right nodes
left = self.lhs(env)
right = self.rhs(env)
return self.func(left, right)
def evaluate(self, env, engine, parser, term_type, eval_in_python):
"""Evaluate a binary operation *before* being passed to the engine.
Parameters
----------
env : Scope
engine : str
parser : str
term_type : type
eval_in_python : list
Returns
-------
term_type
The "pre-evaluated" expression as an instance of ``term_type``
"""
if engine == "python":
res = self(env)
else:
# recurse over the left/right nodes
left = self.lhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
right = self.rhs.evaluate(
env,
engine=engine,
parser=parser,
term_type=term_type,
eval_in_python=eval_in_python,
)
# base cases
if self.op in eval_in_python:
res = self.func(left.value, right.value)
else:
from pandas.core.computation.eval import eval
res = eval(self, local_dict=env, engine=engine, parser=parser)
name = env.add_tmp(res)
return term_type(name, env=env)
def convert_values(self):
"""Convert datetimes to a comparable value in an expression.
"""
def stringify(value):
if self.encoding is not None:
encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
lhs, rhs = self.lhs, self.rhs
if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
v = rhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(_ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.rhs.update(v)
if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
v = lhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = Timestamp(_ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert("UTC")
self.lhs.update(v)
def _disallow_scalar_only_bool_ops(self):
if (
(self.lhs.is_scalar or self.rhs.is_scalar)
and self.op in _bool_ops_dict
and (
not (
issubclass(self.rhs.return_type, (bool, np.bool_))
and issubclass(self.lhs.return_type, (bool, np.bool_))
)
)
):
raise NotImplementedError("cannot evaluate scalar only bool ops")
def isnumeric(dtype):
return issubclass(np.dtype(dtype).type, np.number)
class Div(BinOp):
"""Div operator to special case casting.
Parameters
----------
lhs, rhs : Term or Op
The Terms or Ops in the ``/`` expression.
truediv : bool
Whether or not to use true division. With Python 3 this happens
regardless of the value of ``truediv``.
"""
def __init__(self, lhs, rhs, truediv, *args, **kwargs):
super().__init__("/", lhs, rhs, *args, **kwargs)
if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
raise TypeError(
"unsupported operand type(s) for {0}:"
" '{1}' and '{2}'".format(self.op, lhs.return_type, rhs.return_type)
)
# do not upcast float32s to float64 unnecessarily
acceptable_dtypes = [np.float32, np.float_]
_cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
_unary_ops_syms = "+", "-", "~", "not"
_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert
_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs))
class UnaryOp(Op):
"""Hold a unary operator and its operands
Parameters
----------
op : str
The token used to represent the operator.
operand : Term or Op
The Term or Op operand to the operator.
Raises
------
ValueError
* If no function associated with the passed operator token is found.
"""
def __init__(self, op, operand):
super().__init__(op, (operand,))
self.operand = operand
try:
self.func = _unary_ops_dict[op]
except KeyError:
raise ValueError(
"Invalid unary operator {0!r}, valid operators "
"are {1}".format(op, _unary_ops_syms)
)
def __call__(self, env):
operand = self.operand(env)
return self.func(operand)
def __str__(self):
return pprint_thing("{0}({1})".format(self.op, self.operand))
@property
def return_type(self):
operand = self.operand
if operand.return_type == np.dtype("bool"):
return np.dtype("bool")
if isinstance(operand, Op) and (
operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
):
return np.dtype("bool")
return np.dtype("int")
class MathCall(Op):
def __init__(self, func, args):
super().__init__(func.name, args)
self.func = func
def __call__(self, env):
operands = [op(env) for op in self.operands]
with np.errstate(all="ignore"):
return self.func.func(*operands)
def __str__(self):
operands = map(str, self.operands)
return pprint_thing("{0}({1})".format(self.op, ",".join(operands)))
class FuncNode:
def __init__(self, name):
from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION
if name not in _mathops or (
_NUMEXPR_INSTALLED
and _NUMEXPR_VERSION < LooseVersion("2.6.9")
and name in ("floor", "ceil")
):
raise ValueError('"{0}" is not a supported function'.format(name))
self.name = name
self.func = getattr(np, name)
def __call__(self, *args):
return MathCall(self, args)

View File

@@ -0,0 +1,609 @@
""" manage PyTables query interface via Expressions """
import ast
from functools import partial
import numpy as np
from pandas._libs.tslibs import Timedelta, Timestamp
from pandas.compat.chainmap import DeepChainMap
from pandas.core.dtypes.common import is_list_like
import pandas as pd
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation import expr, ops
from pandas.core.computation.common import _ensure_decoded
from pandas.core.computation.expr import BaseExprVisitor
from pandas.core.computation.ops import UndefinedVariableError, is_term
from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
class Scope(expr.Scope):
__slots__ = ("queryables",)
def __init__(self, level, global_dict=None, local_dict=None, queryables=None):
super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
self.queryables = queryables or dict()
class Term(ops.Term):
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
supr_new = StringMixin.__new__
return supr_new(klass)
def __init__(self, name, env, side=None, encoding=None):
super().__init__(name, env, side=side, encoding=encoding)
def _resolve_name(self):
# the lhs must be a queryable (column) name
if self.side == "left":
if self.name not in self.env.queryables:
raise NameError("name {name!r} is not defined".format(name=self.name))
return self.name
# resolve the rhs (and allow it to be None)
try:
return self.env.resolve(self.name, is_local=False)
except UndefinedVariableError:
return self.name
# read-only property overwriting read/write property
@property # type: ignore
def value(self):
return self._value
class Constant(Term):
def __init__(self, value, env, side=None, encoding=None):
super().__init__(value, env, side=side, encoding=encoding)
def _resolve_name(self):
return self._name
class BinOp(ops.BinOp):
_max_selectors = 31
def __init__(self, op, lhs, rhs, queryables, encoding):
super().__init__(op, lhs, rhs)
self.queryables = queryables
self.encoding = encoding
self.filter = None
self.condition = None
def _disallow_scalar_only_bool_ops(self):
pass
def prune(self, klass):
def pr(left, right):
""" create and return a new specialized BinOp from myself """
if left is None:
return right
elif right is None:
return left
k = klass
if isinstance(left, ConditionBinOp):
if isinstance(right, ConditionBinOp):
k = JointConditionBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
elif isinstance(left, FilterBinOp):
if isinstance(right, FilterBinOp):
k = JointFilterBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
return k(
self.op, left, right, queryables=self.queryables, encoding=self.encoding
).evaluate()
left, right = self.lhs, self.rhs
if is_term(left) and is_term(right):
res = pr(left.value, right.value)
elif not is_term(left) and is_term(right):
res = pr(left.prune(klass), right.value)
elif is_term(left) and not is_term(right):
res = pr(left.value, right.prune(klass))
elif not (is_term(left) or is_term(right)):
res = pr(left.prune(klass), right.prune(klass))
return res
def conform(self, rhs):
""" inplace conform rhs """
if not is_list_like(rhs):
rhs = [rhs]
if isinstance(rhs, np.ndarray):
rhs = rhs.ravel()
return rhs
@property
def is_valid(self):
""" return True if this is a valid field """
return self.lhs in self.queryables
@property
def is_in_table(self):
""" return True if this is a valid column name for generation (e.g. an
actual column in the table) """
return self.queryables.get(self.lhs) is not None
@property
def kind(self):
""" the kind of my field """
return getattr(self.queryables.get(self.lhs), "kind", None)
@property
def meta(self):
""" the meta of my field """
return getattr(self.queryables.get(self.lhs), "meta", None)
@property
def metadata(self):
""" the metadata of my field """
return getattr(self.queryables.get(self.lhs), "metadata", None)
def generate(self, v):
""" create and return the op string for this TermValue """
val = v.tostring(self.encoding)
return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val)
def convert_value(self, v):
""" convert the expression that is in the term to something that is
accepted by pytables """
def stringify(value):
if self.encoding is not None:
encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
kind = _ensure_decoded(self.kind)
meta = _ensure_decoded(self.meta)
if kind == "datetime64" or kind == "datetime":
if isinstance(v, (int, float)):
v = stringify(v)
v = _ensure_decoded(v)
v = Timestamp(v)
if v.tz is not None:
v = v.tz_convert("UTC")
return TermValue(v, v.value, kind)
elif kind == "timedelta64" or kind == "timedelta":
v = Timedelta(v, unit="s").value
return TermValue(int(v), v, kind)
elif meta == "category":
metadata = com.values_from_object(self.metadata)
result = metadata.searchsorted(v, side="left")
# result returns 0 if v is first element or if v is not in metadata
# check that metadata contains v
if not result and v not in metadata:
result = -1
return TermValue(result, result, "integer")
elif kind == "integer":
v = int(float(v))
return TermValue(v, v, kind)
elif kind == "float":
v = float(v)
return TermValue(v, v, kind)
elif kind == "bool":
if isinstance(v, str):
v = v.strip().lower() not in [
"false",
"f",
"no",
"n",
"none",
"0",
"[]",
"{}",
"",
]
else:
v = bool(v)
return TermValue(v, v, kind)
elif isinstance(v, str):
# string quoting
return TermValue(v, stringify(v), "string")
else:
raise TypeError(
"Cannot compare {v} of type {typ} to {kind} column".format(
v=v, typ=type(v), kind=kind
)
)
def convert_values(self):
pass
class FilterBinOp(BinOp):
def __str__(self):
return pprint_thing(
"[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1])
)
def invert(self):
""" invert the filter """
if self.filter is not None:
f = list(self.filter)
f[1] = self.generate_filter_op(invert=True)
self.filter = tuple(f)
return self
def format(self):
""" return the actual filter format """
return [self.filter]
def evaluate(self):
if not self.is_valid:
raise ValueError("query term is not valid [{slf}]".format(slf=self))
rhs = self.conform(self.rhs)
values = [TermValue(v, v, self.kind).value for v in rhs]
if self.is_in_table:
# if too many values to create the expression, use a filter instead
if self.op in ["==", "!="] and len(values) > self._max_selectors:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, pd.Index(values))
return self
return None
# equality conditions
if self.op in ["==", "!="]:
filter_op = self.generate_filter_op()
self.filter = (self.lhs, filter_op, pd.Index(values))
else:
raise TypeError(
"passing a filterable condition to a non-table "
"indexer [{slf}]".format(slf=self)
)
return self
def generate_filter_op(self, invert=False):
if (self.op == "!=" and not invert) or (self.op == "==" and invert):
return lambda axis, vals: ~axis.isin(vals)
else:
return lambda axis, vals: axis.isin(vals)
class JointFilterBinOp(FilterBinOp):
def format(self):
raise NotImplementedError("unable to collapse Joint Filters")
def evaluate(self):
return self
class ConditionBinOp(BinOp):
def __str__(self):
return pprint_thing("[Condition : [{cond}]]".format(cond=self.condition))
def invert(self):
""" invert the condition """
# if self.condition is not None:
# self.condition = "~(%s)" % self.condition
# return self
raise NotImplementedError(
"cannot use an invert condition when " "passing to numexpr"
)
def format(self):
""" return the actual ne format """
return self.condition
def evaluate(self):
if not self.is_valid:
raise ValueError("query term is not valid [{slf}]".format(slf=self))
# convert values if we are in the table
if not self.is_in_table:
return None
rhs = self.conform(self.rhs)
values = [self.convert_value(v) for v in rhs]
# equality conditions
if self.op in ["==", "!="]:
# too many values to create the expression?
if len(values) <= self._max_selectors:
vs = [self.generate(v) for v in values]
self.condition = "({cond})".format(cond=" | ".join(vs))
# use a filter after reading
else:
return None
else:
self.condition = self.generate(values[0])
return self
class JointConditionBinOp(ConditionBinOp):
def evaluate(self):
self.condition = "({lhs} {op} {rhs})".format(
lhs=self.lhs.condition, op=self.op, rhs=self.rhs.condition
)
return self
class UnaryOp(ops.UnaryOp):
def prune(self, klass):
if self.op != "~":
raise NotImplementedError("UnaryOp only support invert type ops")
operand = self.operand
operand = operand.prune(klass)
if operand is not None:
if issubclass(klass, ConditionBinOp):
if operand.condition is not None:
return operand.invert()
elif issubclass(klass, FilterBinOp):
if operand.filter is not None:
return operand.invert()
return None
_op_classes = {"unary": UnaryOp}
class ExprVisitor(BaseExprVisitor):
const_type = Constant
term_type = Term
def __init__(self, env, engine, parser, **kwargs):
super().__init__(env, engine, parser)
for bin_op in self.binary_ops:
bin_node = self.binary_op_nodes_map[bin_op]
setattr(
self,
"visit_{node}".format(node=bin_node),
lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
)
def visit_UnaryOp(self, node, **kwargs):
if isinstance(node.op, (ast.Not, ast.Invert)):
return UnaryOp("~", self.visit(node.operand))
elif isinstance(node.op, ast.USub):
return self.const_type(-self.visit(node.operand).value, self.env)
elif isinstance(node.op, ast.UAdd):
raise NotImplementedError("Unary addition not supported")
def visit_Index(self, node, **kwargs):
return self.visit(node.value).value
def visit_Assign(self, node, **kwargs):
cmpr = ast.Compare(
ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
)
return self.visit(cmpr)
def visit_Subscript(self, node, **kwargs):
# only allow simple subscripts
value = self.visit(node.value)
slobj = self.visit(node.slice)
try:
value = value.value
except AttributeError:
pass
try:
return self.const_type(value[slobj], self.env)
except TypeError:
raise ValueError(
"cannot subscript {value!r} with "
"{slobj!r}".format(value=value, slobj=slobj)
)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx.__class__
if ctx == ast.Load:
# resolve the value
resolved = self.visit(value)
# try to get the value to see if we are another expression
try:
resolved = resolved.value
except (AttributeError):
pass
try:
return self.term_type(getattr(resolved, attr), self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__))
def translate_In(self, op):
return ast.Eq() if isinstance(op, ast.In) else op
def _rewrite_membership_op(self, node, left, right):
return self.visit(node.op), node.op, left, right
def _validate_where(w):
"""
Validate that the where statement is of the right type.
The type may either be String, Expr, or list-like of Exprs.
Parameters
----------
w : String term expression, Expr, or list-like of Exprs.
Returns
-------
where : The original where clause if the check was successful.
Raises
------
TypeError : An invalid data type was passed in for w (e.g. dict).
"""
if not (isinstance(w, (Expr, str)) or is_list_like(w)):
raise TypeError(
"where must be passed as a string, Expr, " "or list-like of Exprs"
)
return w
class Expr(expr.Expr):
""" hold a pytables like expression, comprised of possibly multiple 'terms'
Parameters
----------
where : string term expression, Expr, or list-like of Exprs
queryables : a "kinds" map (dict of column name -> kind), or None if column
is non-indexable
encoding : an encoding that will encode the query terms
Returns
-------
an Expr object
Examples
--------
'index>=date'
"columns=['A', 'D']"
'columns=A'
'columns==A'
"~(columns=['A','B'])"
'index>df.index[3] & string="bar"'
'(index>df.index[3] & index<=df.index[6]) | string="bar"'
"ts>=Timestamp('2012-02-01')"
"major_axis>=20130101"
"""
def __init__(self, where, queryables=None, encoding=None, scope_level=0):
where = _validate_where(where)
self.encoding = encoding
self.condition = None
self.filter = None
self.terms = None
self._visitor = None
# capture the environment if needed
local_dict = DeepChainMap()
if isinstance(where, Expr):
local_dict = where.env.scope
where = where.expr
elif isinstance(where, (list, tuple)):
for idx, w in enumerate(where):
if isinstance(w, Expr):
local_dict = w.env.scope
else:
w = _validate_where(w)
where[idx] = w
where = " & ".join(map("({})".format, com.flatten(where))) # noqa
self.expr = where
self.env = Scope(scope_level + 1, local_dict=local_dict)
if queryables is not None and isinstance(self.expr, str):
self.env.queryables.update(queryables)
self._visitor = ExprVisitor(
self.env,
queryables=queryables,
parser="pytables",
engine="pytables",
encoding=encoding,
)
self.terms = self.parse()
def __str__(self):
if self.terms is not None:
return pprint_thing(self.terms)
return pprint_thing(self.expr)
def evaluate(self):
""" create and return the numexpr condition and filter """
try:
self.condition = self.terms.prune(ConditionBinOp)
except AttributeError:
raise ValueError(
"cannot process expression [{expr}], [{slf}] "
"is not a valid condition".format(expr=self.expr, slf=self)
)
try:
self.filter = self.terms.prune(FilterBinOp)
except AttributeError:
raise ValueError(
"cannot process expression [{expr}], [{slf}] "
"is not a valid filter".format(expr=self.expr, slf=self)
)
return self.condition, self.filter
class TermValue:
""" hold a term value the we use to construct a condition/filter """
def __init__(self, value, converted, kind):
self.value = value
self.converted = converted
self.kind = kind
def tostring(self, encoding):
""" quote the string if not encoded
else encode and return """
if self.kind == "string":
if encoding is not None:
return self.converted
return '"{converted}"'.format(converted=self.converted)
elif self.kind == "float":
# python 2 str(float) is not always
# round-trippable so use repr()
return repr(self.converted)
return self.converted
def maybe_expression(s):
""" loose checking if s is a pytables-acceptable expression """
if not isinstance(s, str):
return False
ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",)
# make sure we have an op at least
return any(op in s for op in ops)
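# Illustrative sketch (not part of the original file):
#
#   maybe_expression("index >= 20130101")  # True  -- contains ">="
#   maybe_expression("foo bar")            # False -- no operator substring
#   maybe_expression(20130101)             # False -- not a str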

View File

@@ -0,0 +1,309 @@
"""
Module for scope operations
"""
import datetime
import inspect
from io import StringIO
import itertools
import pprint
import struct
import sys
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas.compat.chainmap import DeepChainMap
from pandas.core.base import StringMixin
import pandas.core.computation as compu
def _ensure_scope(
level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs
):
"""Ensure that we are grabbing the correct scope."""
return Scope(
level + 1,
global_dict=global_dict,
local_dict=local_dict,
resolvers=resolvers,
target=target,
)
def _replacer(x):
"""Replace a number with its hexadecimal representation. Used to tag
temporary variables with their calling scope's id.
"""
# get the hex repr of the binary char and remove 0x and pad by pad_size
# zeros
try:
hexin = ord(x)
except TypeError:
# bytes literals masquerade as ints when iterating in py3
hexin = x
return hex(hexin)
def _raw_hex_id(obj):
"""Return the padded hexadecimal id of ``obj``."""
# interpret as a pointer since that's really what id returns
packed = struct.pack("@P", id(obj))
return "".join(map(_replacer, packed))
_DEFAULT_GLOBALS = {
"Timestamp": Timestamp,
"datetime": datetime.datetime,
"True": True,
"False": False,
"list": list,
"tuple": tuple,
"inf": np.inf,
"Inf": np.inf,
}
def _get_pretty_string(obj):
"""Return a prettier version of obj
Parameters
----------
obj : object
Object to pretty print
Returns
-------
s : str
Pretty print object repr
"""
sio = StringIO()
pprint.pprint(obj, stream=sio)
return sio.getvalue()
class Scope(StringMixin):
"""Object to hold scope, with a few bells to deal with some custom syntax
and contexts added by pandas.
Parameters
----------
level : int
global_dict : dict or None, optional, default None
local_dict : dict or Scope or None, optional, default None
resolvers : list-like or None, optional, default None
target : object
Attributes
----------
level : int
scope : DeepChainMap
target : object
temps : dict
"""
__slots__ = "level", "scope", "target", "temps"
def __init__(
self, level, global_dict=None, local_dict=None, resolvers=(), target=None
):
self.level = level + 1
# shallow copy because we don't want to keep filling this up with what
# was there before if there are multiple calls to Scope/_ensure_scope
self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy())
self.target = target
if isinstance(local_dict, Scope):
self.scope.update(local_dict.scope)
if local_dict.target is not None:
self.target = local_dict.target
self.update(local_dict.level)
frame = sys._getframe(self.level)
try:
# shallow copy here because we don't want to replace what's in
# scope when we align terms (alignment accesses the underlying
# numpy array of pandas objects)
self.scope = self.scope.new_child((global_dict or frame.f_globals).copy())
if not isinstance(local_dict, Scope):
self.scope = self.scope.new_child((local_dict or frame.f_locals).copy())
finally:
del frame
# assumes that resolvers are going from outermost scope to inner
if isinstance(local_dict, Scope):
resolvers += tuple(local_dict.resolvers.maps)
self.resolvers = DeepChainMap(*resolvers)
self.temps = {}
def __str__(self):
scope_keys = _get_pretty_string(list(self.scope.keys()))
res_keys = _get_pretty_string(list(self.resolvers.keys()))
unicode_str = "{name}(scope={scope_keys}, resolvers={res_keys})"
return unicode_str.format(
name=type(self).__name__, scope_keys=scope_keys, res_keys=res_keys
)
@property
def has_resolvers(self):
"""Return whether we have any extra scope.
For example, DataFrames pass their columns as resolvers during calls to
``DataFrame.eval()`` and ``DataFrame.query()``.
Returns
-------
hr : bool
"""
return bool(len(self.resolvers))
def resolve(self, key, is_local):
"""Resolve a variable name in a possibly local context
Parameters
----------
key : str
A variable name
is_local : bool
Flag indicating whether the variable is local or not (prefixed with
the '@' symbol)
Returns
-------
value : object
The value of a particular variable
"""
try:
# only look for locals in outer scope
if is_local:
return self.scope[key]
# not a local variable so check in resolvers if we have them
if self.has_resolvers:
return self.resolvers[key]
# if we're here that means that we have no locals and we also have
# no resolvers
assert not is_local and not self.has_resolvers
return self.scope[key]
except KeyError:
try:
# last ditch effort we look in temporaries
# these are created when parsing indexing expressions
# e.g., df[df > 0]
return self.temps[key]
except KeyError:
raise compu.ops.UndefinedVariableError(key, is_local)
def swapkey(self, old_key, new_key, new_value=None):
"""Replace a variable name, with a potentially new value.
Parameters
----------
old_key : str
Current variable name to replace
new_key : str
New variable name to replace `old_key` with
new_value : object
Value to be replaced along with the possible renaming
"""
if self.has_resolvers:
maps = self.resolvers.maps + self.scope.maps
else:
maps = self.scope.maps
maps.append(self.temps)
for mapping in maps:
if old_key in mapping:
mapping[new_key] = new_value
return
def _get_vars(self, stack, scopes):
"""Get specifically scoped variables from a list of stack frames.
Parameters
----------
stack : list
A list of stack frames as returned by ``inspect.stack()``
scopes : sequence of strings
A sequence containing valid stack frame attribute names that
evaluate to a dictionary. For example, ('locals', 'globals')
"""
variables = itertools.product(scopes, stack)
for scope, (frame, _, _, _, _, _) in variables:
try:
d = getattr(frame, "f_" + scope)
self.scope = self.scope.new_child(d)
finally:
# won't remove it, but DECREF it
# in Py3 this probably isn't necessary since frame won't be
# in scope after the loop
del frame
def update(self, level):
"""Update the current scope by going back `level` levels.
Parameters
----------
level : int
"""
sl = level + 1
# add sl frames to the scope starting with the
# most distant and overwriting with more current
# makes sure that we can capture variable scope
stack = inspect.stack()
try:
self._get_vars(stack[:sl], scopes=["locals"])
finally:
del stack[:], stack
def add_tmp(self, value):
"""Add a temporary variable to the scope.
Parameters
----------
value : object
An arbitrary object to be assigned to a temporary variable.
Returns
-------
name : str
The name of the temporary variable created.
"""
name = "{name}_{num}_{hex_id}".format(
name=type(value).__name__, num=self.ntemps, hex_id=_raw_hex_id(self)
)
# add to inner most scope
assert name not in self.temps
self.temps[name] = value
assert name in self.temps
# only increment if the variable gets put in the scope
return name
@property
def ntemps(self):
"""The number of temporary variables in this scope"""
return len(self.temps)
@property
def full_scope(self):
"""Return the full scope for use with passing to engines transparently
as a mapping.
Returns
-------
vars : DeepChainMap
All variables in this scope.
"""
maps = [self.temps] + self.resolvers.maps + self.scope.maps
return DeepChainMap(*maps)
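# Usage sketch (illustration only, assuming the behavior of this module): a
# Scope built at level 0 sees the caller's locals, and add_tmp stores a
# temporary that is resolvable through full_scope.
#
#     >>> x = 10
#     >>> scope = Scope(level=0)
#     >>> scope.resolve("x", is_local=True)
#     10
#     >>> name = scope.add_tmp([1, 2, 3])
#     >>> scope.full_scope[name]
#     [1, 2, 3]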

View File

@@ -0,0 +1,648 @@
"""
This module is imported from the pandas package __init__.py file
in order to ensure that the core.config options registered here will
be available as soon as the user loads the package. If register_option
is invoked inside specific modules, they will not be registered until that
module is imported, which may or may not be a problem.
If you need to make sure options are available even before a certain
module is imported, register them here rather than in the module.
"""
import importlib
import pandas._config.config as cf
from pandas._config.config import (
is_bool,
is_callable,
is_instance_factory,
is_int,
is_one_of_factory,
is_text,
)
# compute
use_bottleneck_doc = """
: bool
Use the bottleneck library to accelerate computation if it is installed;
the default is True
Valid values: False,True
"""
def use_bottleneck_cb(key):
from pandas.core import nanops
nanops.set_use_bottleneck(cf.get_option(key))
use_numexpr_doc = """
: bool
Use the numexpr library to accelerate computation if it is installed,
the default is True
Valid values: False,True
"""
def use_numexpr_cb(key):
from pandas.core.computation import expressions
expressions.set_use_numexpr(cf.get_option(key))
with cf.config_prefix("compute"):
cf.register_option(
"use_bottleneck",
True,
use_bottleneck_doc,
validator=is_bool,
cb=use_bottleneck_cb,
)
cf.register_option(
"use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
)
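# Usage sketch (not part of this module): once registered, these options are
# reachable through the public pandas API.
#
#     >>> import pandas as pd
#     >>> pd.get_option("compute.use_bottleneck")
#     True
#     >>> pd.set_option("compute.use_numexpr", False)  # triggers use_numexpr_cb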
#
# options from the "display" namespace
pc_precision_doc = """
: int
Floating point output precision (number of significant digits). This is
only a suggestion
"""
pc_colspace_doc = """
: int
Default space for DataFrame columns.
"""
pc_max_rows_doc = """
: int
If max_rows is exceeded, switch to truncate view. Depending on
`large_repr`, objects are either centrally truncated or printed as
a summary view. 'None' value means unlimited.
In case python/IPython is running in a terminal and `large_repr`
equals 'truncate' this can be set to 0 and pandas will auto-detect
the height of the terminal and print a truncated object which fits
the screen height. The IPython notebook, IPython qtconsole, or
IDLE do not run in a terminal and hence it is not possible to do
correct auto-detection.
"""
pc_min_rows_doc = """
: int
The numbers of rows to show in a truncated view (when `max_rows` is
exceeded). Ignored when `max_rows` is set to None or 0. When set to
None, follows the value of `max_rows`.
"""
pc_max_cols_doc = """
: int
If max_cols is exceeded, switch to truncate view. Depending on
`large_repr`, objects are either centrally truncated or printed as
a summary view. 'None' value means unlimited.
In case python/IPython is running in a terminal and `large_repr`
equals 'truncate' this can be set to 0 and pandas will auto-detect
the width of the terminal and print a truncated object which fits
the screen width. The IPython notebook, IPython qtconsole, or IDLE
do not run in a terminal and hence it is not possible to do
correct auto-detection.
"""
pc_max_categories_doc = """
: int
This sets the maximum number of categories pandas should output when
printing out a `Categorical` or a Series of dtype "category".
"""
pc_max_info_cols_doc = """
: int
max_info_columns is used in DataFrame.info method to decide if
per column information will be printed.
"""
pc_nb_repr_h_doc = """
: boolean
When True, IPython notebook will use html representation for
pandas objects (if it is available).
"""
pc_pprint_nest_depth = """
: int
Controls the number of nested levels to process when pretty-printing
"""
pc_multi_sparse_doc = """
: boolean
"sparsify" MultiIndex display (don't display repeated
elements in outer levels within groups)
"""
float_format_doc = """
: callable
The callable should accept a floating point number and return
a string with the desired format of the number. This is used
in some places like SeriesFormatter.
See formats.format.EngFormatter for an example.
"""
max_colwidth_doc = """
: int
The maximum width in characters of a column in the repr of
a pandas data structure. When the column overflows, a "..."
placeholder is embedded in the output.
"""
colheader_justify_doc = """
: 'left'/'right'
Controls the justification of column headers. Used by DataFrameFormatter.
"""
pc_expand_repr_doc = """
: boolean
Whether to print out the full DataFrame repr for wide DataFrames across
multiple lines, `max_columns` is still respected, but the output will
wrap-around across multiple "pages" if its width exceeds `display.width`.
"""
pc_show_dimensions_doc = """
: boolean or 'truncate'
Whether to print out dimensions at the end of DataFrame repr.
If 'truncate' is specified, only print out the dimensions if the
frame is truncated (e.g. not display all rows and/or columns)
"""
pc_east_asian_width_doc = """
: boolean
Whether to use the Unicode East Asian Width to calculate the display text
width.
Enabling this may affect performance (default: False)
"""
pc_ambiguous_as_wide_doc = """
: boolean
Whether to handle Unicode characters belonging to Ambiguous as Wide (width=2)
(default: False)
"""
pc_latex_repr_doc = """
: boolean
Whether to produce a latex DataFrame representation for jupyter
environments that support it.
(default: False)
"""
pc_table_schema_doc = """
: boolean
Whether to publish a Table Schema representation for frontends
that support it.
(default: False)
"""
pc_html_border_doc = """
: int
A ``border=value`` attribute is inserted in the ``<table>`` tag
for the DataFrame HTML repr.
"""
pc_html_use_mathjax_doc = """\
: boolean
When True, Jupyter notebook will process table contents using MathJax,
rendering mathematical expressions enclosed by the dollar symbol.
(default: True)
"""
pc_width_doc = """
: int
Width of the display in characters. In case python/IPython is running in
a terminal this can be set to None and pandas will correctly auto-detect
the width.
Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
terminal and hence it is not possible to correctly detect the width.
"""
pc_chop_threshold_doc = """
: float or None
if set to a float value, all float values smaller than the given threshold
will be displayed as exactly 0 by repr and friends.
"""
pc_max_seq_items = """
: int or None
when pretty-printing a long sequence, no more than `max_seq_items`
will be printed. If items are omitted, they will be denoted by the
addition of "..." to the resulting string.
If set to None, the number of items to be printed is unlimited.
"""
pc_max_info_rows_doc = """
: int or None
df.info() will usually show null-counts for each column.
For large frames this can be quite slow. max_info_rows and max_info_cols
limit this null check only to frames with smaller dimensions than
specified.
"""
pc_large_repr_doc = """
: 'truncate'/'info'
For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
show a truncated table (the default from 0.13), or switch to the view from
df.info() (the behaviour in earlier versions of pandas).
"""
pc_memory_usage_doc = """
: bool, string or None
This specifies if the memory usage of a DataFrame should be displayed when
df.info() is called. Valid values: True, False, 'deep'
"""
pc_latex_escape = """
: bool
This specifies if the to_latex method of a DataFrame escapes special
characters.
Valid values: False,True
"""
pc_latex_longtable = """
:bool
This specifies if the to_latex method of a DataFrame uses the longtable
format.
Valid values: False,True
"""
pc_latex_multicolumn = """
: bool
This specifies if the to_latex method of a DataFrame uses multicolumns
to pretty-print MultiIndex columns.
Valid values: False,True
"""
pc_latex_multicolumn_format = """
: string
This specifies the format for multicolumn headers.
Can be surrounded with '|'.
Valid values: 'l', 'c', 'r', 'p{<width>}'
"""
pc_latex_multirow = """
: bool
This specifies if the to_latex method of a DataFrame uses multirows
to pretty-print MultiIndex rows.
Valid values: False,True
"""
def table_schema_cb(key):
from pandas.io.formats.printing import _enable_data_resource_formatter
_enable_data_resource_formatter(cf.get_option(key))
def is_terminal():
"""
Detect if Python is running in a terminal.
Returns True if Python is running in a terminal or False if not.
"""
try:
ip = get_ipython()
except NameError: # assume standard Python interpreter in a terminal
return True
else:
if hasattr(ip, "kernel"): # IPython as a Jupyter kernel
return False
else: # IPython in a terminal
return True
with cf.config_prefix("display"):
cf.register_option("precision", 6, pc_precision_doc, validator=is_int)
cf.register_option(
"float_format",
None,
float_format_doc,
validator=is_one_of_factory([None, is_callable]),
)
cf.register_option("column_space", 12, validator=is_int)
cf.register_option(
"max_info_rows",
1690785,
pc_max_info_rows_doc,
validator=is_instance_factory((int, type(None))),
)
cf.register_option(
"max_rows",
60,
pc_max_rows_doc,
validator=is_instance_factory([type(None), int]),
)
cf.register_option(
"min_rows",
10,
pc_min_rows_doc,
validator=is_instance_factory([type(None), int]),
)
cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
cf.register_option("max_colwidth", 50, max_colwidth_doc, validator=is_int)
if is_terminal():
max_cols = 0 # automatically determine optimal number of columns
else:
max_cols = 20 # cannot determine optimal number of columns
cf.register_option(
"max_columns",
max_cols,
pc_max_cols_doc,
validator=is_instance_factory([type(None), int]),
)
cf.register_option(
"large_repr",
"truncate",
pc_large_repr_doc,
validator=is_one_of_factory(["truncate", "info"]),
)
cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
cf.register_option(
"colheader_justify", "right", colheader_justify_doc, validator=is_text
)
cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
cf.register_option(
"show_dimensions",
"truncate",
pc_show_dimensions_doc,
validator=is_one_of_factory([True, False, "truncate"]),
)
cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
cf.register_option("max_seq_items", 100, pc_max_seq_items)
cf.register_option(
"width", 80, pc_width_doc, validator=is_instance_factory([type(None), int])
)
cf.register_option(
"memory_usage",
True,
pc_memory_usage_doc,
validator=is_one_of_factory([None, True, False, "deep"]),
)
cf.register_option(
"unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
)
cf.register_option(
"unicode.ambiguous_as_wide", False, pc_ambiguous_as_wide_doc, validator=is_bool
)
cf.register_option("latex.repr", False, pc_latex_repr_doc, validator=is_bool)
cf.register_option("latex.escape", True, pc_latex_escape, validator=is_bool)
cf.register_option("latex.longtable", False, pc_latex_longtable, validator=is_bool)
cf.register_option(
"latex.multicolumn", True, pc_latex_multicolumn, validator=is_bool
)
cf.register_option(
"latex.multicolumn_format", "l", pc_latex_multicolumn_format, validator=is_text
)
cf.register_option("latex.multirow", False, pc_latex_multirow, validator=is_bool)
cf.register_option(
"html.table_schema",
False,
pc_table_schema_doc,
validator=is_bool,
cb=table_schema_cb,
)
cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
cf.register_option(
"html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
)
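# Usage sketch (illustration only): display options registered above can be
# scoped temporarily with option_context.
#
#     >>> import pandas as pd
#     >>> pd.get_option("display.max_rows")
#     60
#     >>> with pd.option_context("display.max_rows", 5):
#     ...     pd.get_option("display.max_rows")
#     5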
tc_sim_interactive_doc = """
: boolean
Whether to simulate interactive mode for purposes of testing
"""
with cf.config_prefix("mode"):
cf.register_option("sim_interactive", False, tc_sim_interactive_doc)
use_inf_as_null_doc = """
: boolean
use_inf_as_null has been deprecated and will be removed in a future
version. Use `use_inf_as_na` instead.
"""
use_inf_as_na_doc = """
: boolean
True means treat None, NaN, INF, -INF as NA (old way),
False means None and NaN are null, but INF, -INF are not NA
(new way).
"""
# We don't want to start importing everything at the global context level
# or we'll hit circular deps.
def use_inf_as_na_cb(key):
from pandas.core.dtypes.missing import _use_inf_as_na
_use_inf_as_na(key)
with cf.config_prefix("mode"):
cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb)
cf.register_option(
"use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb
)
cf.deprecate_option(
"mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na"
)
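# Behavior sketch (illustration only): toggling mode.use_inf_as_na changes
# what pandas treats as missing.
#
#     >>> import numpy as np
#     >>> import pandas as pd
#     >>> pd.isna(np.inf)
#     False
#     >>> with pd.option_context("mode.use_inf_as_na", True):
#     ...     pd.isna(np.inf)
#     True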
# user warnings
chained_assignment = """
: string
Raise an exception, warn, or take no action if trying to use chained
assignment. The default is warn
"""
with cf.config_prefix("mode"):
cf.register_option(
"chained_assignment",
"warn",
chained_assignment,
validator=is_one_of_factory([None, "warn", "raise"]),
)
# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
The default Excel reader engine for '{ext}' files. Available options:
auto, {others}.
"""
_xls_options = ["xlrd"]
_xlsm_options = ["xlrd", "openpyxl"]
_xlsx_options = ["xlrd", "openpyxl"]
_ods_options = ["odf"]
with cf.config_prefix("io.excel.xls"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
validator=str,
)
with cf.config_prefix("io.excel.xlsm"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
validator=str,
)
with cf.config_prefix("io.excel.xlsx"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
validator=str,
)
with cf.config_prefix("io.excel.ods"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
validator=str,
)
# Set up the io.excel specific writer configuration.
writer_engine_doc = """
: string
The default Excel writer engine for '{ext}' files. Available options:
auto, {others}.
"""
_xls_options = ["xlwt"]
_xlsm_options = ["openpyxl"]
_xlsx_options = ["openpyxl", "xlsxwriter"]
with cf.config_prefix("io.excel.xls"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
validator=str,
)
with cf.config_prefix("io.excel.xlsm"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
validator=str,
)
with cf.config_prefix("io.excel.xlsx"):
cf.register_option(
"writer",
"auto",
writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
validator=str,
)
# Set up the io.parquet specific configuration.
parquet_engine_doc = """
: string
The default parquet reader/writer engine. Available options:
'auto', 'pyarrow', 'fastparquet', the default is 'auto'
"""
with cf.config_prefix("io.parquet"):
cf.register_option(
"engine",
"auto",
parquet_engine_doc,
validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
)
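# Usage sketch (illustration only): pin the parquet engine instead of
# letting "auto" pick the first importable one.
#
#     >>> import pandas as pd
#     >>> pd.set_option("io.parquet.engine", "pyarrow")
#     >>> pd.get_option("io.parquet.engine")
#     'pyarrow'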
# --------
# Plotting
# ---------
plotting_backend_doc = """
: str
The plotting backend to use. The default value is "matplotlib", the
backend provided with pandas. Other backends can be specified by
providing the name of the module that implements the backend.
"""
def register_plotting_backend_cb(key):
backend_str = cf.get_option(key)
if backend_str == "matplotlib":
try:
import pandas.plotting._matplotlib # noqa
except ImportError:
raise ImportError(
"matplotlib is required for plotting when the "
'default backend "matplotlib" is selected.'
)
else:
return
try:
importlib.import_module(backend_str)
except ImportError:
raise ValueError(
'"{}" does not seem to be an installed module. '
"A pandas plotting backend must be a module that "
"can be imported".format(backend_str)
)
with cf.config_prefix("plotting"):
cf.register_option(
"backend",
defval="matplotlib",
doc=plotting_backend_doc,
validator=str,
cb=register_plotting_backend_cb,
)
register_converter_doc = """
: bool
Whether to register converters with matplotlib's units registry for
dates, times, datetimes, and Periods. Toggling to False will remove
the converters, restoring any converters that pandas overwrote.
"""
def register_converter_cb(key):
from pandas.plotting import register_matplotlib_converters
from pandas.plotting import deregister_matplotlib_converters
if cf.get_option(key):
register_matplotlib_converters()
else:
deregister_matplotlib_converters()
with cf.config_prefix("plotting.matplotlib"):
cf.register_option(
"register_converters",
True,
register_converter_doc,
validator=bool,
cb=register_converter_cb,
)
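# Usage sketch (illustration only): turning the converters off restores any
# matplotlib unit converters pandas overwrote at registration time.
#
#     >>> import pandas as pd
#     >>> pd.set_option("plotting.matplotlib.register_converters", False)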

View File

@@ -0,0 +1,47 @@
# flake8: noqa
from .common import (
is_array_like,
is_bool,
is_bool_dtype,
is_categorical,
is_categorical_dtype,
is_complex,
is_complex_dtype,
is_datetime64_any_dtype,
is_datetime64_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_datetimetz,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_extension_type,
is_file_like,
is_float,
is_float_dtype,
is_hashable,
is_int64_dtype,
is_integer,
is_integer_dtype,
is_interval,
is_interval_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_number,
is_numeric_dtype,
is_object_dtype,
is_period,
is_period_dtype,
is_re,
is_re_compilable,
is_scalar,
is_signed_integer_dtype,
is_sparse,
is_string_dtype,
is_timedelta64_dtype,
is_timedelta64_ns_dtype,
is_unsigned_integer_dtype,
pandas_dtype,
)

View File

@@ -0,0 +1,298 @@
"""Extend pandas with custom array types"""
from typing import List, Optional, Tuple, Type
import numpy as np
from pandas.errors import AbstractMethodError
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
class ExtensionDtype:
"""
A custom data type, to be paired with an ExtensionArray.
.. versionadded:: 0.23.0
See Also
--------
extensions.register_extension_dtype
extensions.ExtensionArray
Notes
-----
The interface includes the following abstract methods that must
be implemented by subclasses:
* type
* name
* construct_from_string
The following attributes influence the behavior of the dtype in
pandas operations
* _is_numeric
* _is_boolean
Optionally one can override construct_array_type for construction
with the name of this dtype via the Registry. See
:meth:`extensions.register_extension_dtype`.
* construct_array_type
The `na_value` class attribute can be used to set the default NA value
for this type. :attr:`numpy.nan` is used by default.
ExtensionDtypes are required to be hashable. The base class provides
a default implementation, which relies on the ``_metadata`` class
attribute. ``_metadata`` should be a tuple containing the strings
that define your data type. For example, with ``PeriodDtype`` that's
the ``freq`` attribute.
**If you have a parametrized dtype you should set the ``_metadata``
class property**.
Ideally, the attributes in ``_metadata`` will match the
parameters to your ``ExtensionDtype.__init__`` (if any). If any of
the attributes in ``_metadata`` don't implement the standard
``__eq__`` or ``__hash__``, the default implementations here will not
work.
.. versionchanged:: 0.24.0
Added ``_metadata``, ``__hash__``, and changed the default definition
of ``__eq__``.
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
provided for registering virtual subclasses.
"""
_metadata = () # type: Tuple[str, ...]
def __str__(self):
return self.name
def __eq__(self, other):
"""Check whether 'other' is equal to self.
By default, 'other' is considered equal if either
* it's a string matching 'self.name'.
* it's an instance of this type and all of the
attributes in ``self._metadata`` are equal between
`self` and `other`.
Parameters
----------
other : Any
Returns
-------
bool
"""
if isinstance(other, str):
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
return all(
getattr(self, attr) == getattr(other, attr) for attr in self._metadata
)
return False
def __hash__(self):
return hash(tuple(getattr(self, attr) for attr in self._metadata))
def __ne__(self, other):
return not self.__eq__(other)
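# Equality sketch (illustration only, using a concrete pandas dtype): a
# dtype compares equal to its own name given as a string.
#
#     >>> from pandas import CategoricalDtype
#     >>> CategoricalDtype() == "category"
#     True
#     >>> CategoricalDtype() == "not-a-dtype"
#     False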
@property
def na_value(self):
"""
Default NA value to use for this type.
This is used in e.g. ExtensionArray.take. This should be the
user-facing "boxed" version of the NA value, not the physical NA value
for storage. e.g. for JSONArray, this is an empty dictionary.
"""
return np.nan
@property
def type(self) -> Type:
"""
The scalar type for the array, e.g. ``int``
It's expected ``ExtensionArray[item]`` returns an instance
of ``ExtensionDtype.type`` for scalar ``item``, assuming
that value is valid (not NA). NA values do not need to be
instances of `type`.
"""
raise AbstractMethodError(self)
@property
def kind(self) -> str:
"""
A character code (one of 'biufcmMOSUV'), default 'O'
This should match the NumPy dtype used when the array is
converted to an ndarray, which is probably 'O' for object if
the extension type cannot be represented as a built-in NumPy
type.
See Also
--------
numpy.dtype.kind
"""
return "O"
@property
def name(self) -> str:
"""
A string identifying the data type.
Will be used for display in, e.g., ``Series.dtype``.
"""
raise AbstractMethodError(self)
@property
def names(self) -> Optional[List[str]]:
"""Ordered list of field names, or None if there are no fields.
This is for compatibility with NumPy arrays, and may be removed in the
future.
"""
return None
@classmethod
def construct_array_type(cls):
"""
Return the array type associated with this dtype
Returns
-------
type
"""
raise NotImplementedError
@classmethod
def construct_from_string(cls, string: str):
r"""
Construct this type from a string.
This is useful mainly for data types that accept parameters.
For example, a period dtype accepts a frequency parameter that
can be set as ``period[H]`` (where H means hourly frequency).
By default, in the abstract class, just the name of the type is
expected. But subclasses can override this method to accept
parameters.
Parameters
----------
string : str
The name of the type, for example ``category``.
Returns
-------
ExtensionDtype
Instance of the dtype.
Raises
------
TypeError
If a class cannot be constructed from this 'string'.
Examples
--------
For extension dtypes with arguments the following may be an
adequate implementation.
>>> @classmethod
... def construct_from_string(cls, string):
... pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
... match = pattern.match(string)
... if match:
... return cls(**match.groupdict())
... else:
... raise TypeError("Cannot construct a '{}' from "
... "'{}'".format(cls.__name__, string))
"""
if not isinstance(string, str):
raise TypeError("Expects a string, got {}".format(type(string)))
if string != cls.name:
raise TypeError(
"Cannot construct a '{}' from '{}'".format(cls.__name__, string)
)
return cls()
@classmethod
def is_dtype(cls, dtype) -> bool:
"""Check if we match 'dtype'.
Parameters
----------
dtype : object
The object to check.
Returns
-------
is_dtype : bool
Notes
-----
The default implementation is True if
1. ``cls.construct_from_string(dtype)`` is an instance
of ``cls``.
2. ``dtype`` is an object and is an instance of ``cls``
3. ``dtype`` has a ``dtype`` attribute, and any of the above
conditions is true for ``dtype.dtype``.
"""
dtype = getattr(dtype, "dtype", dtype)
if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)):
# https://github.com/pandas-dev/pandas/issues/22960
# avoid passing data to `construct_from_string`. This could
# cause a FutureWarning from numpy about failing elementwise
# comparison from, e.g., comparing DataFrame == 'category'.
return False
elif dtype is None:
return False
elif isinstance(dtype, cls):
return True
try:
return cls.construct_from_string(dtype) is not None
except TypeError:
return False
@property
def _is_numeric(self) -> bool:
"""
Whether columns with this dtype should be considered numeric.
By default ExtensionDtypes are assumed to be non-numeric.
They'll be excluded from operations that exclude non-numeric
columns, like (groupby) reductions, plotting, etc.
"""
return False
@property
def _is_boolean(self) -> bool:
"""
Whether this dtype should be considered boolean.
By default, ExtensionDtypes are assumed to be non-numeric.
Setting this to True will affect the behavior of several places,
e.g.
* is_bool
* boolean indexing
Returns
-------
bool
"""
return False
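# Minimal subclass sketch (hypothetical, not part of pandas): the smallest
# concrete dtype satisfying the abstract interface above. "DecimalDtype" is
# an illustrative name only.
#
#     import decimal
#
#     class DecimalDtype(ExtensionDtype):
#         type = decimal.Decimal
#         name = "decimal"
#
#         @classmethod
#         def construct_array_type(cls):
#             # would return the paired ExtensionArray subclass
#             raise NotImplementedError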

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,595 @@
"""
Utility functions related to concat
"""
import numpy as np
from pandas._libs import tslib, tslibs
from pandas.core.dtypes.common import (
_NS_DTYPE,
_TD_DTYPE,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_object_dtype,
is_sparse,
is_timedelta64_dtype,
)
from pandas.core.dtypes.generic import (
ABCDatetimeArray,
ABCDatetimeIndex,
ABCIndexClass,
ABCPeriodIndex,
ABCRangeIndex,
ABCSparseDataFrame,
ABCTimedeltaIndex,
)
def get_dtype_kinds(l):
"""
Parameters
----------
l : list of arrays
Returns
-------
a set of kinds that exist in this list of arrays
"""
typs = set()
for arr in l:
dtype = arr.dtype
if is_categorical_dtype(dtype):
typ = "category"
elif is_sparse(arr):
typ = "sparse"
elif isinstance(arr, ABCRangeIndex):
typ = "range"
elif is_datetime64tz_dtype(arr):
# if to_concat contains different tz,
# the result must be object dtype
typ = str(arr.dtype)
elif is_datetime64_dtype(dtype):
typ = "datetime"
elif is_timedelta64_dtype(dtype):
typ = "timedelta"
elif is_object_dtype(dtype):
typ = "object"
elif is_bool_dtype(dtype):
typ = "bool"
elif is_extension_array_dtype(dtype):
typ = str(arr.dtype)
else:
typ = dtype.kind
typs.add(typ)
return typs
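# Example (illustration only; set ordering may vary):
#
#     >>> import numpy as np
#     >>> get_dtype_kinds([np.array([1, 2]), np.array(["a"], dtype=object)])
#     {'i', 'object'}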
def _get_series_result_type(result, objs=None):
"""
return appropriate class of Series concat
input is either dict or array-like
"""
from pandas import SparseSeries, SparseDataFrame, DataFrame
# concat Series with axis 1
if isinstance(result, dict):
if all(isinstance(c, (SparseSeries, SparseDataFrame)) for c in result.values()):
return SparseDataFrame
else:
return DataFrame
# otherwise it is a SingleBlockManager (axis = 0)
return objs[0]._constructor
def _get_frame_result_type(result, objs):
"""
return appropriate class of DataFrame-like concat
if all blocks are sparse, return SparseDataFrame
otherwise, return 1st obj
"""
if result.blocks and (any(isinstance(obj, ABCSparseDataFrame) for obj in objs)):
from pandas.core.sparse.api import SparseDataFrame
return SparseDataFrame
else:
return next(obj for obj in objs if not isinstance(obj, ABCSparseDataFrame))
def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtype (in that, for example, if it's object, then it is a
non-datetimelike) and provide a combined dtype for the resulting array
that preserves the overall dtype if possible
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""
# filter empty arrays
# 1-d dtypes are always included here
def is_nonempty(x):
try:
return x.shape[axis] > 0
except Exception:
return True
# If all arrays are empty, there's nothing to convert, just short-cut to
# the concatenation, #3121.
#
# Creating an empty array directly is tempting, but the gains would be
# marginal, since it would still require shape & dtype calculation, and
# np.concatenate, which implements both, is compiled.
typs = get_dtype_kinds(to_concat)
_contains_datetime = any(typ.startswith("datetime") for typ in typs)
_contains_period = any(typ.startswith("period") for typ in typs)
if "category" in typs:
# this must be prior to _concat_datetime,
# to support Categorical + datetime-like
return _concat_categorical(to_concat, axis=axis)
elif _contains_datetime or "timedelta" in typs or _contains_period:
return _concat_datetime(to_concat, axis=axis, typs=typs)
# these are mandated to handle empties as well
elif "sparse" in typs:
return _concat_sparse(to_concat, axis=axis, typs=typs)
all_empty = all(not is_nonempty(x) for x in to_concat)
if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
if all_empty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
typs = get_dtype_kinds(to_concat)
if len(typs) != 1:
if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}):
# let numpy coerce
pass
else:
# coerce to object
to_concat = [x.astype("object") for x in to_concat]
return np.concatenate(to_concat, axis=axis)
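# Example (illustration only): mixed integer/object inputs fall through to
# np.concatenate, which upcasts to object.
#
#     >>> import numpy as np
#     >>> _concat_compat([np.array([1, 2]), np.array(["a"], dtype=object)])
#     array([1, 2, 'a'], dtype=object)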
def _concat_categorical(to_concat, axis=0):
"""Concatenate an object/categorical array of arrays, each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : int
Axis to provide concatenation in the current implementation this is
always 0, e.g. we only have 1D categoricals
Returns
-------
Categorical
A single array, preserving the combined dtypes
"""
# we could have object blocks and categoricals here
# if we only have a single categorical dtype then combine everything,
# else it's a non-compat categorical
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
# validate the categories
if len(categoricals) != len(to_concat):
pass
else:
# when all categories are identical
first = to_concat[0]
if all(first.is_dtype_equal(other) for other in to_concat[1:]):
return union_categoricals(categoricals)
# extract the categoricals & coerce to object if needed
to_concat = [
x._internal_get_values()
if is_categorical_dtype(x.dtype)
else np.asarray(x).ravel()
if not is_datetime64tz_dtype(x)
else np.asarray(x.astype(object))
for x in to_concat
]
result = _concat_compat(to_concat)
if axis == 1:
result = result.reshape(1, len(result))
return result
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
"""
Combine list-like of Categorical-like, unioning categories. All
categories must have the same dtype.
.. versionadded:: 0.19.0
Parameters
----------
to_union : list-like of Categorical, CategoricalIndex,
or Series with dtype='category'
sort_categories : boolean, default False
If true, resulting categories will be lexsorted, otherwise
they will be ordered as they appear in the data.
ignore_order : boolean, default False
If true, the ordered attribute of the Categoricals will be ignored.
Results in an unordered categorical.
.. versionadded:: 0.20.0
Returns
-------
result : Categorical
Raises
------
TypeError
- all inputs do not have the same dtype
- all inputs do not have the same ordered property
- all inputs are ordered and their categories are not identical
- sort_categories=True and Categoricals are ordered
ValueError
Empty list of categoricals passed
Notes
-----
To learn more about categories, see `link
<http://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__
Examples
--------
>>> from pandas.api.types import union_categoricals
If you want to combine categoricals that do not necessarily have
the same categories, `union_categoricals` will combine a list-like
of categoricals. The new categories will be the union of the
categories being combined.
>>> a = pd.Categorical(["b", "c"])
>>> b = pd.Categorical(["a", "b"])
>>> union_categoricals([a, b])
[b, c, a, b]
Categories (3, object): [b, c, a]
By default, the resulting categories will be ordered as they appear
in the `categories` of the data. If you want the categories to be
lexsorted, use `sort_categories=True` argument.
>>> union_categoricals([a, b], sort_categories=True)
[b, c, a, b]
Categories (3, object): [a, b, c]
`union_categoricals` also works with the case of combining two
categoricals of the same categories and order information (e.g. what
you could also `append` for).
>>> a = pd.Categorical(["a", "b"], ordered=True)
>>> b = pd.Categorical(["a", "b", "a"], ordered=True)
>>> union_categoricals([a, b])
[a, b, a, b, a]
Categories (2, object): [a < b]
Raises `TypeError` because the categories are ordered and not identical.
>>> a = pd.Categorical(["a", "b"], ordered=True)
>>> b = pd.Categorical(["a", "b", "c"], ordered=True)
>>> union_categoricals([a, b])
TypeError: to union ordered Categoricals, all categories must be the same
New in version 0.20.0
Ordered categoricals with different categories or orderings can be
combined by using the `ignore_order=True` argument.
>>> a = pd.Categorical(["a", "b", "c"], ordered=True)
>>> b = pd.Categorical(["c", "b", "a"], ordered=True)
>>> union_categoricals([a, b], ignore_order=True)
[a, b, c, c, b, a]
Categories (3, object): [a, b, c]
`union_categoricals` also works with a `CategoricalIndex`, or `Series`
containing categorical data, but note that the resulting array will
always be a plain `Categorical`
>>> a = pd.Series(["b", "c"], dtype='category')
>>> b = pd.Series(["a", "b"], dtype='category')
>>> union_categoricals([a, b])
[b, c, a, b]
Categories (3, object): [b, c, a]
"""
from pandas import Index, Categorical, CategoricalIndex, Series
from pandas.core.arrays.categorical import _recode_for_categories
if len(to_union) == 0:
raise ValueError("No Categoricals to union")
def _maybe_unwrap(x):
if isinstance(x, (CategoricalIndex, Series)):
return x.values
elif isinstance(x, Categorical):
return x
else:
raise TypeError("all components to combine must be Categorical")
to_union = [_maybe_unwrap(x) for x in to_union]
first = to_union[0]
if not all(
is_dtype_equal(other.categories.dtype, first.categories.dtype)
for other in to_union[1:]
):
raise TypeError("dtype of categories must be the same")
ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
codes = [first.codes] + [
_recode_for_categories(other.codes, other.categories, first.categories)
for other in to_union[1:]
]
new_codes = np.concatenate(codes)
if sort_categories and not ignore_order and ordered:
raise TypeError(
"Cannot use sort_categories=True with ordered Categoricals"
)
if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)
from pandas.core.algorithms import take_1d
new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
categories = Index(cats.unique())
if sort_categories:
categories = categories.sort_values()
new_codes = [
_recode_for_categories(c.codes, c.categories, categories) for c in to_union
]
new_codes = np.concatenate(new_codes)
else:
# ordered - to show a proper error message
if all(c.ordered for c in to_union):
msg = "to union ordered Categoricals, " "all categories must be the same"
raise TypeError(msg)
else:
raise TypeError("Categorical.ordered must be the same")
if ignore_order:
ordered = False
return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def _concatenate_2d(to_concat, axis):
# coerce to 2d if needed & concatenate
if axis == 1:
to_concat = [np.atleast_2d(x) for x in to_concat]
return np.concatenate(to_concat, axis=axis)
def _concat_datetime(to_concat, axis=0, typs=None):
"""
provide concatenation of a datetimelike array of arrays each of which is a
single M8[ns], datetime64[ns, tz] or m8[ns] dtype
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
typs : set of to_concat dtypes
Returns
-------
a single array, preserving the combined dtypes
"""
if typs is None:
typs = get_dtype_kinds(to_concat)
# multiple types, need to coerce to object
if len(typs) != 1:
return _concatenate_2d(
[_convert_datetimelike_to_object(x) for x in to_concat], axis=axis
)
# must be single dtype
if any(typ.startswith("datetime") for typ in typs):
if "datetime" in typs:
to_concat = [x.astype(np.int64, copy=False) for x in to_concat]
return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)
else:
# when to_concat has different tz, len(typs) > 1.
# thus no need to care
return _concat_datetimetz(to_concat)
elif "timedelta" in typs:
return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view(
_TD_DTYPE
)
elif any(typ.startswith("period") for typ in typs):
assert len(typs) == 1
cls = to_concat[0]
new_values = cls._concat_same_type(to_concat)
return new_values
def _convert_datetimelike_to_object(x):
# coerce datetimelike array to object dtype
# if dtype is of datetimetz or timezone
if x.dtype.kind == _NS_DTYPE.kind:
if getattr(x, "tz", None) is not None:
x = np.asarray(x.astype(object))
else:
shape = x.shape
x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp")
x = x.reshape(shape)
elif x.dtype == _TD_DTYPE:
shape = x.shape
x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
x = x.reshape(shape)
return x
def _concat_datetimetz(to_concat, name=None):
"""
concat DatetimeIndex with the same tz
all inputs must be DatetimeIndex
it is used in DatetimeIndex.append also
"""
# Right now, internals will pass a List[DatetimeArray] here
# for reductions like quantile. I would like to disentangle
# all this before we get here.
sample = to_concat[0]
if isinstance(sample, ABCIndexClass):
return sample._concat_same_dtype(to_concat, name=name)
elif isinstance(sample, ABCDatetimeArray):
return sample._concat_same_type(to_concat)
def _concat_index_same_dtype(indexes, klass=None):
klass = klass if klass is not None else indexes[0].__class__
return klass(np.concatenate([x._values for x in indexes]))
def _concat_index_asobject(to_concat, name=None):
"""
concat all inputs as object. DatetimeIndex, TimedeltaIndex and
PeriodIndex are converted to object dtype before concatenation
"""
from pandas import Index
from pandas.core.arrays import ExtensionArray
klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray)
to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat]
self = to_concat[0]
attribs = self._get_attributes_dict()
attribs["name"] = name
to_concat = [x._values if isinstance(x, Index) else x for x in to_concat]
return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)
def _concat_sparse(to_concat, axis=0, typs=None):
"""
provide concatenation of a sparse/dense array of arrays each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
typs : set of to_concat dtypes
Returns
-------
a single array, preserving the combined dtypes
"""
from pandas.core.arrays import SparseArray
fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
fill_value = fill_values[0]
# TODO: Fix join unit generation so we aren't passed this.
to_concat = [
x
if isinstance(x, SparseArray)
else SparseArray(x.squeeze(), fill_value=fill_value)
for x in to_concat
]
return SparseArray._concat_same_type(to_concat)
def _concat_rangeindex_same_dtype(indexes):
"""
Concatenates multiple RangeIndex instances. All members of "indexes" must
be of type RangeIndex; result will be RangeIndex if possible, Int64Index
otherwise. E.g.:
indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6)
indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5])
"""
from pandas import Int64Index, RangeIndex
start = step = next_ = None
# Filter the empty indexes
non_empty_indexes = [obj for obj in indexes if len(obj)]
for obj in non_empty_indexes:
rng = obj._range # type: range
if start is None:
# This is set by the first non-empty index
start = rng.start
if step is None and len(rng) > 1:
step = rng.step
elif step is None:
# First non-empty index had only one element
if rng.start == start:
return _concat_index_same_dtype(indexes, klass=Int64Index)
step = rng.start - start
non_consecutive = (step != rng.step and len(rng) > 1) or (
next_ is not None and rng.start != next_
)
if non_consecutive:
return _concat_index_same_dtype(indexes, klass=Int64Index)
if step is not None:
next_ = rng[-1] + step
if non_empty_indexes:
# Get the stop value from "next" or alternatively
# from the last non-empty index
stop = non_empty_indexes[-1].stop if next_ is None else next_
return RangeIndex(start, stop, step)
# Here all "indexes" had 0 length, i.e. were empty.
# In this case return an empty range index.
return RangeIndex(0, 0)

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff