8th day of python challenges 111-117

This commit is contained in:
abd.shallal
2019-08-04 15:26:35 +03:00
parent b04c1b055f
commit 627802c383
3215 changed files with 760227 additions and 491 deletions


@@ -0,0 +1,21 @@
"""
Data IO api
"""
# flake8: noqa
from pandas.io.clipboards import read_clipboard
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.parquet import read_parquet
from pandas.io.parsers import read_csv, read_fwf, read_table
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.pytables import HDFStore, read_hdf
from pandas.io.sas import read_sas
from pandas.io.spss import read_spss
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
from pandas.io.stata import read_stata
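
These re-exports are what surface as the top-level pd.read_* entry points. A minimal usage sketch, assuming pandas is importable and a small CSV exists at the hypothetical path example.csv:

import pandas as pd

df = pd.read_csv("example.csv")        # pandas.io.parsers.read_csv re-exported above
df.to_pickle("example.pkl")            # counterpart of read_pickle
same = pd.read_pickle("example.pkl")   # pandas.io.pickle.read_pickle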


@@ -0,0 +1,126 @@
"""
Pyperclip
A cross-platform clipboard module for Python. (only handles plain text for now)
By Al Sweigart al@inventwithpython.com
BSD License
Usage:
  import pyperclip
  pyperclip.copy('The text to be copied to the clipboard.')
  spam = pyperclip.paste()
  if not pyperclip.copy:
      print("Copy functionality unavailable!")
On Windows, no additional modules are needed.
On Mac, the module uses pbcopy and pbpaste, which should come with the OS.
On Linux, install xclip or xsel via package manager. For example, in Debian:
sudo apt-get install xclip
Otherwise on Linux, you will need the qtpy or PyQt modules installed.
qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2
This module does not work with PyGObject yet.
"""
__version__ = "1.5.27"
import os
import platform
import subprocess
from .clipboards import (
init_klipper_clipboard,
init_no_clipboard,
init_osx_clipboard,
init_qt_clipboard,
init_xclip_clipboard,
init_xsel_clipboard,
)
from .windows import init_windows_clipboard
# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load qtpy if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY", False)
CHECK_CMD = "where" if platform.system() == "Windows" else "which"
def _executable_exists(name):
return (
subprocess.call(
[CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
== 0
)
def determine_clipboard():
# Determine the OS/platform and set
# the copy() and paste() functions accordingly.
if "cygwin" in platform.system().lower():
# FIXME: pyperclip currently does not support Cygwin,
# see https://github.com/asweigart/pyperclip/issues/55
pass
elif os.name == "nt" or platform.system() == "Windows":
return init_windows_clipboard()
if os.name == "mac" or platform.system() == "Darwin":
return init_osx_clipboard()
if HAS_DISPLAY:
# Determine which command/module is installed, if any.
try:
# qtpy is a small abstraction layer that lets you write
# applications using a single api call to either PyQt or PySide
# https://pypi.org/project/QtPy
import qtpy # noqa
except ImportError:
            # If qtpy isn't installed, fall back on importing PyQt5, then PyQt4
try:
import PyQt5 # noqa
except ImportError:
try:
import PyQt4 # noqa
except ImportError:
pass # fail fast for all non-ImportError exceptions.
else:
return init_qt_clipboard()
else:
return init_qt_clipboard()
pass
else:
return init_qt_clipboard()
if _executable_exists("xclip"):
return init_xclip_clipboard()
if _executable_exists("xsel"):
return init_xsel_clipboard()
if _executable_exists("klipper") and _executable_exists("qdbus"):
return init_klipper_clipboard()
return init_no_clipboard()
def set_clipboard(clipboard):
global copy, paste
clipboard_types = {
"osx": init_osx_clipboard,
"qt": init_qt_clipboard,
"xclip": init_xclip_clipboard,
"xsel": init_xsel_clipboard,
"klipper": init_klipper_clipboard,
"windows": init_windows_clipboard,
"no": init_no_clipboard,
}
copy, paste = clipboard_types[clipboard]()
copy, paste = determine_clipboard()
__all__ = ["copy", "paste"]
# pandas aliases
clipboard_get = paste
clipboard_set = copy
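
determine_clipboard() picks a backend once at import time, and set_clipboard() rebinds the module-level copy/paste pair. A minimal sketch, assuming a Linux session with a display and xclip installed:

import pandas.io.clipboard as clip

clip.set_clipboard("xclip")        # force the xclip backend instead of auto-detection
clip.copy("hello from pandas")     # copy/paste were rebound by set_clipboard
print(clip.paste())                # -> "hello from pandas"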


@@ -0,0 +1,129 @@
import subprocess
from .exceptions import PyperclipException
EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit https://pyperclip.readthedocs.org """
def init_osx_clipboard():
def copy_osx(text):
p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True)
p.communicate(input=text.encode("utf-8"))
def paste_osx():
p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
return stdout.decode("utf-8")
return copy_osx, paste_osx
def init_qt_clipboard():
# $DISPLAY should exist
# Try to import from qtpy, but if that fails try PyQt5 then PyQt4
try:
from qtpy.QtWidgets import QApplication
except ImportError:
try:
from PyQt5.QtWidgets import QApplication
except ImportError:
from PyQt4.QtGui import QApplication
app = QApplication.instance()
if app is None:
app = QApplication([])
def copy_qt(text):
cb = app.clipboard()
cb.setText(text)
def paste_qt():
cb = app.clipboard()
return str(cb.text())
return copy_qt, paste_qt
def init_xclip_clipboard():
def copy_xclip(text):
p = subprocess.Popen(
["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True
)
p.communicate(input=text.encode("utf-8"))
def paste_xclip():
p = subprocess.Popen(
["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True
)
stdout, stderr = p.communicate()
return stdout.decode("utf-8")
return copy_xclip, paste_xclip
def init_xsel_clipboard():
def copy_xsel(text):
p = subprocess.Popen(
["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True
)
p.communicate(input=text.encode("utf-8"))
def paste_xsel():
p = subprocess.Popen(
["xsel", "-b", "-o"], stdout=subprocess.PIPE, close_fds=True
)
stdout, stderr = p.communicate()
return stdout.decode("utf-8")
return copy_xsel, paste_xsel
def init_klipper_clipboard():
def copy_klipper(text):
p = subprocess.Popen(
[
"qdbus",
"org.kde.klipper",
"/klipper",
"setClipboardContents",
text.encode("utf-8"),
],
stdin=subprocess.PIPE,
close_fds=True,
)
p.communicate(input=None)
def paste_klipper():
p = subprocess.Popen(
["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
stdout=subprocess.PIPE,
close_fds=True,
)
stdout, stderr = p.communicate()
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
# TODO: https://github.com/asweigart/pyperclip/issues/43
clipboardContents = stdout.decode("utf-8")
# even if blank, Klipper will append a newline at the end
assert len(clipboardContents) > 0
# make sure that newline is there
assert clipboardContents.endswith("\n")
if clipboardContents.endswith("\n"):
clipboardContents = clipboardContents[:-1]
return clipboardContents
return copy_klipper, paste_klipper
def init_no_clipboard():
class ClipboardUnavailable:
def __call__(self, *args, **kwargs):
raise PyperclipException(EXCEPT_MSG)
def __bool__(self):
return False
return ClipboardUnavailable(), ClipboardUnavailable()
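
Each init_* function above returns a (copy, paste) pair of closures around a subprocess call. A minimal sketch, assuming xsel is installed:

from pandas.io.clipboard.clipboards import init_xsel_clipboard

copy, paste = init_xsel_clipboard()   # backend factories all return (copy, paste)
copy("42")
print(paste())                        # -> "42"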


@@ -0,0 +1,11 @@
import ctypes
class PyperclipException(RuntimeError):
pass
class PyperclipWindowsException(PyperclipException):
def __init__(self, message):
message += " ({err})".format(err=ctypes.WinError())
super().__init__(message)
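
PyperclipException is what the ClipboardUnavailable fallback raises when copy or paste is called without a working backend. A minimal sketch:

from pandas.io.clipboard import copy
from pandas.io.clipboard.exceptions import PyperclipException

try:
    copy("text")
except PyperclipException as exc:
    # raised by init_no_clipboard's ClipboardUnavailable objects
    print(exc)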


@@ -0,0 +1,184 @@
"""
This module implements clipboard handling on Windows using ctypes.
"""
import contextlib
import ctypes
from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
import time
from .exceptions import PyperclipWindowsException
class CheckedCall:
def __init__(self, f):
super().__setattr__("f", f)
def __call__(self, *args):
ret = self.f(*args)
if not ret and get_errno():
raise PyperclipWindowsException("Error calling " + self.f.__name__)
return ret
def __setattr__(self, key, value):
setattr(self.f, key, value)
def init_windows_clipboard():
from ctypes.wintypes import (
HGLOBAL,
LPVOID,
DWORD,
LPCSTR,
INT,
HWND,
HINSTANCE,
HMENU,
BOOL,
UINT,
HANDLE,
)
windll = ctypes.windll
msvcrt = ctypes.CDLL("msvcrt")
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
safeCreateWindowExA.argtypes = [
DWORD,
LPCSTR,
LPCSTR,
DWORD,
INT,
INT,
INT,
INT,
HWND,
HMENU,
HINSTANCE,
LPVOID,
]
safeCreateWindowExA.restype = HWND
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
safeDestroyWindow.argtypes = [HWND]
safeDestroyWindow.restype = BOOL
OpenClipboard = windll.user32.OpenClipboard
OpenClipboard.argtypes = [HWND]
OpenClipboard.restype = BOOL
safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
safeCloseClipboard.argtypes = []
safeCloseClipboard.restype = BOOL
safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
safeEmptyClipboard.argtypes = []
safeEmptyClipboard.restype = BOOL
safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
safeGetClipboardData.argtypes = [UINT]
safeGetClipboardData.restype = HANDLE
safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
safeSetClipboardData.argtypes = [UINT, HANDLE]
safeSetClipboardData.restype = HANDLE
safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
safeGlobalAlloc.argtypes = [UINT, c_size_t]
safeGlobalAlloc.restype = HGLOBAL
safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
safeGlobalLock.argtypes = [HGLOBAL]
safeGlobalLock.restype = LPVOID
safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
safeGlobalUnlock.argtypes = [HGLOBAL]
safeGlobalUnlock.restype = BOOL
wcslen = CheckedCall(msvcrt.wcslen)
wcslen.argtypes = [c_wchar_p]
wcslen.restype = UINT
GMEM_MOVEABLE = 0x0002
CF_UNICODETEXT = 13
@contextlib.contextmanager
def window():
"""
Context that provides a valid Windows hwnd.
"""
# we really just need the hwnd, so setting "STATIC"
# as predefined lpClass is just fine.
hwnd = safeCreateWindowExA(
0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
)
try:
yield hwnd
finally:
safeDestroyWindow(hwnd)
@contextlib.contextmanager
def clipboard(hwnd):
"""
Context manager that opens the clipboard and prevents
other applications from modifying the clipboard content.
"""
# We may not get the clipboard handle immediately because
# some other application is accessing it (?)
# We try for at least 500ms to get the clipboard.
t = time.time() + 0.5
success = False
while time.time() < t:
success = OpenClipboard(hwnd)
if success:
break
time.sleep(0.01)
if not success:
raise PyperclipWindowsException("Error calling OpenClipboard")
try:
yield
finally:
safeCloseClipboard()
def copy_windows(text):
# This function is heavily based on
# http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
with window() as hwnd:
# http://msdn.com/ms649048
# If an application calls OpenClipboard with hwnd set to NULL,
# EmptyClipboard sets the clipboard owner to NULL;
# this causes SetClipboardData to fail.
# => We need a valid hwnd to copy something.
with clipboard(hwnd):
safeEmptyClipboard()
if text:
# http://msdn.com/ms649051
# If the hMem parameter identifies a memory object,
# the object must have been allocated using the
# function with the GMEM_MOVEABLE flag.
count = wcslen(text) + 1
handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
locked_handle = safeGlobalLock(handle)
ctypes.memmove(
c_wchar_p(locked_handle),
c_wchar_p(text),
count * sizeof(c_wchar),
)
safeGlobalUnlock(handle)
safeSetClipboardData(CF_UNICODETEXT, handle)
def paste_windows():
with clipboard(None):
handle = safeGetClipboardData(CF_UNICODETEXT)
if not handle:
# GetClipboardData may return NULL with errno == NO_ERROR
# if the clipboard is empty.
# (Also, it may return a handle to an empty buffer,
# but technically that's not empty)
return ""
return c_wchar_p(handle).value
return copy_windows, paste_windows
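
init_windows_clipboard() follows the same (copy, paste) contract using only ctypes, so no extra packages are needed. A Windows-only sketch:

from pandas.io.clipboard.windows import init_windows_clipboard

copy_windows, paste_windows = init_windows_clipboard()
copy_windows("unicode text \u00e9")   # stored as CF_UNICODETEXT
print(paste_windows())                # -> "unicode text é"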


@@ -0,0 +1,135 @@
""" io on the clipboard """
from io import StringIO
import warnings
from pandas.core.dtypes.generic import ABCDataFrame
from pandas import get_option, option_context
def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover
r"""
Read text from clipboard and pass to read_csv. See read_csv for the
full argument list
Parameters
----------
sep : str, default '\s+'
A string or regex delimiter. The default of '\s+' denotes
one or more whitespace characters.
Returns
-------
parsed : DataFrame
"""
encoding = kwargs.pop("encoding", "utf-8")
# only utf-8 is valid for passed value because that's what clipboard
# supports
if encoding is not None and encoding.lower().replace("-", "") != "utf8":
raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_csv
text = clipboard_get()
# Try to decode (if needed, as "text" might already be a string here).
try:
text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
except AttributeError:
pass
# Excel copies into clipboard with \t separation
    # inspect no more than the first 10 lines; if they
# all contain an equal number (>0) of tabs, infer
# that this came from excel and set 'sep' accordingly
lines = text[:10000].split("\n")[:-1][:10]
# Need to remove leading white space, since read_csv
# accepts:
# a b
# 0 1 2
# 1 3 4
counts = {x.lstrip().count("\t") for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
sep = "\t"
# Edge case where sep is specified to be None, return to default
if sep is None and kwargs.get("delim_whitespace") is None:
sep = r"\s+"
# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
if len(sep) > 1 and kwargs.get("engine") is None:
kwargs["engine"] = "python"
elif len(sep) > 1 and kwargs.get("engine") == "c":
warnings.warn(
"read_clipboard with regex separator does not work"
" properly with c engine"
)
return read_csv(StringIO(text), sep=sep, **kwargs)
def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover
"""
    Attempt to write a text representation of an object to the system clipboard.
    The clipboard can then be pasted into Excel, for example.
Parameters
----------
obj : the object to write to the clipboard
excel : boolean, defaults to True
if True, use the provided separator, writing in a csv
format for allowing easy pasting into excel.
if False, write a string representation of the object
to the clipboard
sep : optional, defaults to tab
other keywords are passed to to_csv
Notes
-----
Requirements for your platform
- Linux: xclip, or xsel (with PyQt4 modules)
- Windows:
- OS X:
"""
encoding = kwargs.pop("encoding", "utf-8")
# testing if an invalid encoding is passed to clipboard
if encoding is not None and encoding.lower().replace("-", "") != "utf8":
raise ValueError("clipboard only supports utf-8 encoding")
from pandas.io.clipboard import clipboard_set
if excel is None:
excel = True
if excel:
try:
if sep is None:
sep = "\t"
buf = StringIO()
# clipboard_set (pyperclip) expects unicode
obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
text = buf.getvalue()
clipboard_set(text)
return
except TypeError:
warnings.warn(
"to_clipboard in excel mode requires a single " "character separator."
)
elif sep is not None:
warnings.warn("to_clipboard with excel=False ignores the sep argument")
if isinstance(obj, ABCDataFrame):
# str(df) has various unhelpful defaults, like truncation
with option_context("display.max_colwidth", 999999):
objstr = obj.to_string(**kwargs)
else:
objstr = str(obj)
clipboard_set(objstr)
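
Together, to_clipboard and read_clipboard give a DataFrame round trip through the system clipboard. A minimal sketch, assuming a working clipboard backend:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.to_clipboard(excel=True)          # tab-separated text, pasteable into Excel
round_tripped = pd.read_clipboard()  # the tab separator is inferred from the copied text
print(round_tripped)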


@@ -0,0 +1,515 @@
"""Common IO api utilities"""
import bz2
import codecs
import csv
import gzip
from http.client import HTTPException # noqa
from io import BytesIO
import lzma
import mmap
import os
import pathlib
from urllib.error import URLError # noqa
from urllib.parse import ( # noqa
urlencode,
urljoin,
urlparse as parse_url,
uses_netloc,
uses_params,
uses_relative,
)
from urllib.request import pathname2url, urlopen
import zipfile
from pandas.errors import ( # noqa
AbstractMethodError,
DtypeWarning,
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas.core.dtypes.common import is_file_like
# gh-12665: Alias for now and remove later.
CParserError = ParserError
# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
_NA_VALUES = {
"-1.#IND",
"1.#QNAN",
"1.#IND",
"-1.#QNAN",
"#N/A N/A",
"#N/A",
"N/A",
"n/a",
"NA",
"#NA",
"NULL",
"null",
"NaN",
"-NaN",
"nan",
"-nan",
"",
}
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
class BaseIterator:
"""Subclass this and provide a "__next__()" method to obtain an iterator.
Useful only when the object being iterated is non-reusable (e.g. OK for a
parser, not for an in-memory table, yes for its iterator)."""
def __iter__(self):
return self
def __next__(self):
raise AbstractMethodError(self)
def _is_url(url):
"""Check to see if a URL has a valid protocol.
Parameters
----------
url : str or unicode
Returns
-------
isurl : bool
If `url` has a valid protocol return True otherwise False.
"""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _expand_user(filepath_or_buffer):
"""Return the argument with an initial component of ~ or ~user
replaced by that user's home directory.
Parameters
----------
filepath_or_buffer : object to be converted if possible
Returns
-------
expanded_filepath_or_buffer : an expanded filepath or the
input if not expandable
"""
if isinstance(filepath_or_buffer, str):
return os.path.expanduser(filepath_or_buffer)
return filepath_or_buffer
def _validate_header_arg(header):
if isinstance(header, bool):
raise TypeError(
"Passing a bool to header is invalid. "
"Use header=None for no header or "
"header=int or list-like of ints to specify "
"the row(s) making up the column names"
)
def _stringify_path(filepath_or_buffer):
"""Attempt to convert a path-like object to a string.
Parameters
----------
filepath_or_buffer : object to be converted
Returns
-------
str_filepath_or_buffer : maybe a string version of the object
Notes
-----
Objects supporting the fspath protocol (python 3.6+) are coerced
    according to their __fspath__ method.
For backwards compatibility with older pythons, pathlib.Path and
py.path objects are specially coerced.
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
if hasattr(filepath_or_buffer, "__fspath__"):
return filepath_or_buffer.__fspath__()
elif isinstance(filepath_or_buffer, pathlib.Path):
return str(filepath_or_buffer)
return _expand_user(filepath_or_buffer)
def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
return parse_url(url).scheme in ["s3", "s3n", "s3a"]
except Exception:
return False
def is_gcs_url(url):
"""Check for a gcs url"""
try:
return parse_url(url).scheme in ["gcs", "gs"]
except Exception:
return False
def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
Parameters
----------
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
or buffer
compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
encoding : the encoding to use to decode bytes, default is 'utf-8'
mode : str, optional
Returns
-------
tuple of ({a filepath_ or buffer or S3File instance},
encoding, str,
compression, str,
should_close, bool)
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if _is_url(filepath_or_buffer):
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
# Override compression based on Content-Encoding header
compression = "gzip"
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
if is_s3_url(filepath_or_buffer):
from pandas.io import s3
return s3.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs
return gcs.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
if not is_file_like(filepath_or_buffer):
msg = "Invalid file path or buffer object type: {_type}"
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
return filepath_or_buffer, None, compression, False
def file_path_to_url(path):
"""
converts an absolute native path to a FILE URL.
Parameters
----------
path : a path in native format
Returns
-------
a valid FILE URL
"""
return urljoin("file:", pathname2url(path))
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
def _infer_compression(filepath_or_buffer, compression):
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
compression method is returned unchanged, unless it's invalid, in which
case an error is raised.
Parameters
----------
filepath_or_buffer :
a path (str) or buffer
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
Returns
-------
string or None :
compression method
Raises
------
ValueError on invalid compression specified
"""
# No compression has been explicitly specified
if compression is None:
return None
# Infer compression
if compression == "infer":
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
# Cannot infer compression of a buffer, assume no compression
return None
# Infer compression from the filename/URL extension
for compression, extension in _compression_to_extension.items():
if filepath_or_buffer.endswith(extension):
return compression
return None
# Compression has been specified. Check that it's valid
if compression in _compression_to_extension:
return compression
msg = "Unrecognized compression type: {}".format(compression)
valid = ["infer", None] + sorted(_compression_to_extension)
msg += "\nValid compression types are {}".format(valid)
raise ValueError(msg)
def _get_handle(
path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
):
"""
Get file handle for given path/buffer and mode.
Parameters
----------
path_or_buf :
a path (str) or buffer
mode : str
mode to open path_or_buf with
encoding : str or None
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
whether file/buffer is in text format (csv, json, etc.), or in binary
mode (pickle, etc.)
Returns
-------
f : file-like
A file-like object
handles : list of file-like objects
A list of file-like object that were opened in this function.
"""
try:
from s3fs import S3File
need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
handles = list()
f = path_or_buf
# Convert pathlib.Path/py.path.local or string
path_or_buf = _stringify_path(path_or_buf)
is_path = isinstance(path_or_buf, str)
if is_path:
compression = _infer_compression(path_or_buf, compression)
if compression:
# GZ Compression
if compression == "gzip":
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)
# BZ Compression
elif compression == "bz2":
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
else:
f = bz2.BZ2File(path_or_buf)
# ZIP Compression
elif compression == "zip":
zf = BytesZipFile(path_or_buf, mode)
# Ensure the container is closed as well.
handles.append(zf)
if zf.mode == "w":
f = zf
elif zf.mode == "r":
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError(
"Zero files found in ZIP file {}".format(path_or_buf)
)
else:
raise ValueError(
"Multiple files found in ZIP file."
" Only one file per ZIP: {}".format(zip_names)
)
# XZ Compression
elif compression == "xz":
f = lzma.LZMAFile(path_or_buf, mode)
# Unrecognized Compression
else:
msg = "Unrecognized compression type: {}".format(compression)
raise ValueError(msg)
handles.append(f)
elif is_path:
if encoding:
# Encoding
f = open(path_or_buf, mode, encoding=encoding, newline="")
elif is_text:
# No explicit encoding
f = open(path_or_buf, mode, errors="replace", newline="")
else:
# Binary mode
f = open(path_or_buf, mode)
handles.append(f)
# Convert BytesIO or file objects passed with an encoding
if is_text and (compression or isinstance(f, need_text_wrapping)):
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding, newline="")
handles.append(f)
if memory_map and hasattr(f, "fileno"):
try:
g = MMapWrapper(f)
f.close()
f = g
except Exception:
# we catch any errors that may have occurred
# because that is consistent with the lower-level
# functionality of the C engine (pd.read_csv), so
# leave the file handler as is then
pass
return f, handles
class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore
"""
    Wrapper for the standard library ZipFile class that allows the returned
    file-like handle to accept byte strings via its `write` method.
BytesIO provides attributes of file-like object and ZipFile.writestr writes
bytes strings into a member of the archive.
"""
# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
if mode in ["wb", "rb"]:
mode = mode.replace("b", "")
super().__init__(file, mode, compression, **kwargs)
def write(self, data):
super().writestr(self.filename, data)
@property
def closed(self):
return self.fp is None
class MMapWrapper(BaseIterator):
"""
    Wrapper for Python's mmap class so that it can be properly read in
by Python's csv.reader class.
Parameters
----------
f : file object
File object to be mapped onto memory. Must support the 'fileno'
method or have an equivalent attribute
"""
def __init__(self, f):
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
def __getattr__(self, name):
return getattr(self.mmap, name)
def __iter__(self):
return self
def __next__(self):
newline = self.mmap.readline()
# readline returns bytes, not str, but Python's CSV reader
# expects str, so convert the output to str before continuing
newline = newline.decode("utf-8")
# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
if newline == "":
raise StopIteration
return newline
class UTF8Recoder(BaseIterator):
"""
Iterator that reads an encoded stream and re-encodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def read(self, bytes=-1):
return self.reader.read(bytes).encode("utf-8")
def readline(self):
return self.reader.readline().encode("utf-8")
def next(self):
return next(self.reader).encode("utf-8")
# Keeping these wrappers for now because they provide a necessary convenience
# for "dropping" the "encoding" argument from our I/O arguments when
# creating a Unicode I/O object.
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.reader(f, dialect=dialect, **kwds)
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
return csv.writer(f, dialect=dialect, **kwds)
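
A minimal sketch of the compression helpers above, assuming a gzip-compressed CSV exists at the hypothetical path data.csv.gz:

from pandas.io.common import _get_handle, _infer_compression

# Extension-based inference: "infer" maps ".gz" to "gzip"; plain paths get None.
assert _infer_compression("data.csv.gz", "infer") == "gzip"
assert _infer_compression("data.csv", "infer") is None

# _get_handle returns the (text-wrapped) file object plus every handle it opened.
f, handles = _get_handle("data.csv.gz", "r", encoding="utf-8", compression="infer")
try:
    print(f.readline())
finally:
    for h in reversed(handles):   # close the TextIOWrapper before the gzip handle
        h.close()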


@@ -0,0 +1,64 @@
"""This module is designed for community supported date conversion functions"""
import numpy as np
from pandas._libs.tslibs import parsing
def parse_date_time(date_col, time_col):
date_col = _maybe_cast(date_col)
time_col = _maybe_cast(time_col)
return parsing.try_parse_date_and_time(date_col, time_col)
def parse_date_fields(year_col, month_col, day_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
return parsing.try_parse_year_month_day(year_col, month_col, day_col)
def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
hour_col = _maybe_cast(hour_col)
minute_col = _maybe_cast(minute_col)
second_col = _maybe_cast(second_col)
return parsing.try_parse_datetime_components(
year_col, month_col, day_col, hour_col, minute_col, second_col
)
def generic_parser(parse_func, *cols):
N = _check_columns(cols)
results = np.empty(N, dtype=object)
for i in range(N):
args = [c[i] for c in cols]
results[i] = parse_func(*args)
return results
def _maybe_cast(arr):
if not arr.dtype.type == np.object_:
arr = np.array(arr, dtype=object)
return arr
def _check_columns(cols):
if not len(cols):
raise AssertionError("There must be at least 1 column")
head, tail = cols[0], cols[1:]
N = len(head)
for i, n in enumerate(map(len, tail)):
if n != N:
raise AssertionError(
"All columns must have the same length: {0}; "
"column {1} has length {2}".format(N, i, n)
)
return N
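
These converters expect per-column arrays, the shape read_csv hands to date_parser. A minimal sketch with plain object arrays:

import numpy as np
from pandas.io.date_converters import generic_parser, parse_date_fields

years = np.array(["2019", "2019"], dtype=object)
months = np.array(["08", "08"], dtype=object)
days = np.array(["04", "05"], dtype=object)

# Combine the three columns into datetime objects.
print(parse_date_fields(years, months, days))

# generic_parser applies any row-wise callable across the columns.
print(generic_parser(lambda y, m, d: "-".join([y, m, d]), years, months, days))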


@@ -0,0 +1,16 @@
from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel
from pandas.io.excel._openpyxl import _OpenpyxlWriter
from pandas.io.excel._util import register_writer
from pandas.io.excel._xlsxwriter import _XlsxWriter
from pandas.io.excel._xlwt import _XlwtWriter
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
register_writer(_OpenpyxlWriter)
register_writer(_XlwtWriter)
register_writer(_XlsxWriter)
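
The register_writer calls above back ExcelWriter's engine lookup. A minimal sketch, assuming openpyxl is installed and out.xlsx is a hypothetical output path:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
# "openpyxl" is one of the writers registered above; xlsx is its native extension.
with pd.ExcelWriter("out.xlsx", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="Sheet1")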


@@ -0,0 +1,903 @@
import abc
from collections import OrderedDict
from datetime import date, datetime, timedelta
from io import BytesIO
import os
from textwrap import fill
from urllib.request import urlopen
from pandas._config import config
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_kwarg
from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
from pandas.core.frame import DataFrame
from pandas.io.common import (
_NA_VALUES,
_is_url,
_stringify_path,
_validate_header_arg,
get_filepath_or_buffer,
)
from pandas.io.excel._util import (
_fill_mi_header,
_get_default_writer,
_maybe_convert_usecols,
_pop_header_name,
get_writer,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser
_read_excel_doc = (
"""
Read an Excel file into a pandas DataFrame.
Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
Support an option to read a single sheet or a list of sheets.
Parameters
----------
io : str, ExcelFile, xlrd.Book, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
If you want to pass in a path object, pandas accepts any ``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function)
or ``StringIO``.
sheet_name : str, int, list, or None, default 0
Strings are used for sheet names. Integers are used in zero-indexed
sheet positions. Lists of strings/integers are used to request
multiple sheets. Specify None to get all sheets.
Available cases:
* Defaults to ``0``: 1st sheet as a `DataFrame`
* ``1``: 2nd sheet as a `DataFrame`
* ``"Sheet1"``: Load sheet with name "Sheet1"
* ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
as a dict of `DataFrame`
* None: All sheets.
header : int, list of int, default 0
Row (0-indexed) to use for the column labels of the parsed
DataFrame. If a list of integers is passed those row positions will
be combined into a ``MultiIndex``. Use None if there is no header.
names : array-like, default None
List of column names to use. If file contains no header row,
then you should explicitly pass header=None.
index_col : int, list of int, default None
Column (0-indexed) to use as the row labels of the DataFrame.
Pass None if there is no such column. If a list is passed,
those columns will be combined into a ``MultiIndex``. If a
subset of data is selected with ``usecols``, index_col
is based on the subset.
usecols : int, str, list-like, or callable default None
Return a subset of the columns.
* If None, then parse all columns.
* If int, then indicates last column to be parsed.
.. deprecated:: 0.24.0
Pass in a list of int instead from 0 to `usecols` inclusive.
* If str, then indicates comma separated list of Excel column letters
and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
both sides.
* If list of int, then indicates list of column numbers to be parsed.
* If list of string, then indicates list of column names to be parsed.
.. versionadded:: 0.24.0
* If callable, then evaluate each column name against it and parse the
column if the callable returns ``True``.
.. versionadded:: 0.24.0
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `object` to preserve data as stored in Excel and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.
.. versionadded:: 0.20.0
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or xlrd.
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the Excel cell content, and return the transformed
content.
true_values : list, default None
Values to consider as True.
.. versionadded:: 0.19.0
false_values : list, default None
Values to consider as False.
.. versionadded:: 0.19.0
skiprows : list-like
Rows to skip at the beginning (0-indexed).
nrows : int, default None
Number of rows to parse.
.. versionadded:: 0.23.0
na_values : scalar, str, list-like, or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted
as NaN: '"""
+ fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ")
+ """'.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they're appended to.
verbose : bool, default False
Indicate number of NA values placed in non-numeric columns.
parse_dates : bool, list-like, or dict, default False
The behavior is as follows:
* bool. If True -> try parsing the index.
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
each as a separate date column.
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
* dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
result 'foo'
If a column or index contains an unparseable date, the entire column or
index will be returned unaltered as an object data type. For non-standard
datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
Note: A fast-path exists for iso8601-formatted dates.
date_parser : function, optional
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
conversion. Pandas will try to call `date_parser` in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays
(as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
string values from the columns defined by `parse_dates` into a single array
and pass that; and 3) call `date_parser` once for each row using one or
more strings (corresponding to the columns defined by `parse_dates`) as
arguments.
thousands : str, default None
Thousands separator for parsing string columns to numeric. Note that
this parameter is only necessary for columns stored as TEXT in Excel,
any numeric columns will automatically be parsed, regardless of display
format.
comment : str, default None
Comments out remainder of line. Pass a character or characters to this
argument to indicate comments in the input file. Any data between the
comment string and the end of the current line is ignored.
skip_footer : int, default 0
Alias of `skipfooter`.
.. deprecated:: 0.23.0
Use `skipfooter` instead.
skipfooter : int, default 0
Rows at the end to skip (0-indexed).
convert_float : bool, default True
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
data will be read in as floats: Excel stores all numbers as floats
internally.
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
**kwds : optional
Optional keyword arguments can be passed to ``TextFileReader``.
Returns
-------
DataFrame or dict of DataFrames
DataFrame from the passed in Excel file. See notes in sheet_name
argument for more information on when a dict of DataFrames is returned.
See Also
--------
to_excel : Write DataFrame to an Excel file.
to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
Examples
--------
The file can be read using the file name as string or an open file object:
>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
Name Value
0 string1 1
1 string2 2
2 #Comment 3
>>> pd.read_excel(open('tmp.xlsx', 'rb'),
... sheet_name='Sheet3') # doctest: +SKIP
Unnamed: 0 Name Value
0 0 string1 1
1 1 string2 2
2 2 #Comment 3
Index and header can be specified via the `index_col` and `header` arguments
>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
0 1 2
0 NaN Name Value
1 0.0 string1 1
2 1.0 string2 2
3 2.0 #Comment 3
Column types are inferred but can be explicitly specified
>>> pd.read_excel('tmp.xlsx', index_col=0,
... dtype={'Name': str, 'Value': float}) # doctest: +SKIP
Name Value
0 string1 1.0
1 string2 2.0
2 #Comment 3.0
True, False, and NA values, and thousands separators have defaults,
but can be explicitly specified, too. Supply the values you would like
as strings or lists of strings!
>>> pd.read_excel('tmp.xlsx', index_col=0,
... na_values=['string1', 'string2']) # doctest: +SKIP
Name Value
0 NaN 1
1 NaN 2
2 #Comment 3
Comment lines in the excel input file can be skipped using the `comment` kwarg
>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
Name Value
0 string1 1.0
1 string2 2.0
2 None NaN
"""
)
@Appender(_read_excel_doc)
@deprecate_kwarg("skip_footer", "skipfooter")
def read_excel(
io,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
dtype=None,
engine=None,
converters=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
keep_default_na=True,
verbose=False,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skip_footer=0,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
**kwds
):
for arg in ("sheet", "sheetname", "parse_cols"):
if arg in kwds:
raise TypeError(
"read_excel() got an unexpected keyword argument " "`{}`".format(arg)
)
if not isinstance(io, ExcelFile):
io = ExcelFile(io, engine=engine)
elif engine and engine != io.engine:
raise ValueError(
"Engine should not be specified when passing "
"an ExcelFile - ExcelFile already has the engine set"
)
return io.parse(
sheet_name=sheet_name,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
squeeze=squeeze,
dtype=dtype,
converters=converters,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
keep_default_na=keep_default_na,
verbose=verbose,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds
)
class _BaseExcelReader(metaclass=abc.ABCMeta):
def __init__(self, filepath_or_buffer):
# If filepath_or_buffer is a url, load the data into a BytesIO
if _is_url(filepath_or_buffer):
filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
if isinstance(filepath_or_buffer, self._workbook_class):
self.book = filepath_or_buffer
elif hasattr(filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
filepath_or_buffer.seek(0)
self.book = self.load_workbook(filepath_or_buffer)
elif isinstance(filepath_or_buffer, str):
self.book = self.load_workbook(filepath_or_buffer)
else:
raise ValueError(
"Must explicitly set engine if not passing in" " buffer or path for io."
)
@property
@abc.abstractmethod
def _workbook_class(self):
pass
@abc.abstractmethod
def load_workbook(self, filepath_or_buffer):
pass
@property
@abc.abstractmethod
def sheet_names(self):
pass
@abc.abstractmethod
def get_sheet_by_name(self, name):
pass
@abc.abstractmethod
def get_sheet_by_index(self, index):
pass
@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float):
pass
def parse(
self,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
dtype=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
verbose=False,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
**kwds
):
_validate_header_arg(header)
ret_dict = False
# Keep sheetname to maintain backwards compatibility.
if isinstance(sheet_name, list):
sheets = sheet_name
ret_dict = True
elif sheet_name is None:
sheets = self.sheet_names
ret_dict = True
else:
sheets = [sheet_name]
# handle same-type duplicates.
sheets = list(OrderedDict.fromkeys(sheets).keys())
output = OrderedDict()
for asheetname in sheets:
if verbose:
print("Reading sheet {sheet}".format(sheet=asheetname))
if isinstance(asheetname, str):
sheet = self.get_sheet_by_name(asheetname)
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)
data = self.get_sheet_data(sheet, convert_float)
usecols = _maybe_convert_usecols(usecols)
if not data:
output[asheetname] = DataFrame()
continue
if is_list_like(header) and len(header) == 1:
header = header[0]
# forward fill and pull out names for MultiIndex column
header_names = None
if header is not None and is_list_like(header):
header_names = []
control_row = [True] * len(data[0])
for row in header:
if is_integer(skiprows):
row += skiprows
data[row], control_row = _fill_mi_header(data[row], control_row)
if index_col is not None:
header_name, _ = _pop_header_name(data[row], index_col)
header_names.append(header_name)
if is_list_like(index_col):
# Forward fill values for MultiIndex index.
if not is_list_like(header):
offset = 1 + header
else:
offset = 1 + max(header)
# Check if we have an empty dataset
# before trying to collect data.
if offset < len(data):
for col in index_col:
last = data[offset][col]
for row in range(offset + 1, len(data)):
if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
else:
last = data[row][col]
has_index_names = is_list_like(header) and len(header) > 1
# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
header=header,
index_col=index_col,
has_index_names=has_index_names,
squeeze=squeeze,
dtype=dtype,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
mangle_dupe_cols=mangle_dupe_cols,
**kwds
)
output[asheetname] = parser.read(nrows=nrows)
if not squeeze or isinstance(output[asheetname], DataFrame):
if header_names:
output[asheetname].columns = output[
asheetname
].columns.set_names(header_names)
except EmptyDataError:
# No Data, return an empty DataFrame
output[asheetname] = DataFrame()
if ret_dict:
return output
else:
return output[asheetname]
class ExcelWriter(metaclass=abc.ABCMeta):
"""
    Class for writing DataFrame objects into Excel sheets; by default it uses
    xlwt for xls and openpyxl for xlsx. See DataFrame.to_excel for typical usage.
Parameters
----------
path : string
Path to xls or xlsx file.
engine : string (optional)
Engine to use for writing. If None, defaults to
``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
argument.
date_format : string, default None
Format string for dates written into Excel files (e.g. 'YYYY-MM-DD')
datetime_format : string, default None
Format string for datetime objects written into Excel files
(e.g. 'YYYY-MM-DD HH:MM:SS')
mode : {'w', 'a'}, default 'w'
File mode to use (write or append).
.. versionadded:: 0.24.0
Attributes
----------
None
Methods
-------
None
Notes
-----
None of the methods and properties are considered public.
For compatibility with CSV writers, ExcelWriter serializes lists
and dicts to strings before writing.
Examples
--------
Default usage:
>>> with ExcelWriter('path_to_file.xlsx') as writer:
... df.to_excel(writer)
To write to separate sheets in a single file:
>>> with ExcelWriter('path_to_file.xlsx') as writer:
... df1.to_excel(writer, sheet_name='Sheet1')
... df2.to_excel(writer, sheet_name='Sheet2')
You can set the date format or datetime format:
>>> with ExcelWriter('path_to_file.xlsx',
date_format='YYYY-MM-DD',
datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
... df.to_excel(writer)
You can also append to an existing Excel file:
>>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
... df.to_excel(writer, sheet_name='Sheet3')
"""
# Defining an ExcelWriter implementation (see abstract methods for more...)
# - Mandatory
# - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
# --> called to write additional DataFrames to disk
# - ``supported_extensions`` (tuple of supported extensions), used to
# check that engine supports the given extension.
# - ``engine`` - string that gives the engine name. Necessary to
# instantiate class directly and bypass ``ExcelWriterMeta`` engine
# lookup.
# - ``save(self)`` --> called to save file to disk
# - Mostly mandatory (i.e. should at least exist)
# - book, cur_sheet, path
# - Optional:
# - ``__init__(self, path, engine=None, **kwargs)`` --> always called
# with path as first argument.
# You also need to register the class with ``register_writer()``.
# Technically, ExcelWriter implementations don't need to subclass
# ExcelWriter.
def __new__(cls, path, engine=None, **kwargs):
# only switch class if generic(ExcelWriter)
if cls is ExcelWriter:
if engine is None or (isinstance(engine, str) and engine == "auto"):
if isinstance(path, str):
ext = os.path.splitext(path)[-1][1:]
else:
ext = "xlsx"
try:
engine = config.get_option("io.excel.{ext}.writer".format(ext=ext))
if engine == "auto":
engine = _get_default_writer(ext)
except KeyError:
raise ValueError("No engine for filetype: '{ext}'".format(ext=ext))
cls = get_writer(engine)
return object.__new__(cls)
# declare external properties you can count on
book = None
curr_sheet = None
path = None
@property
@abc.abstractmethod
def supported_extensions(self):
"""Extensions that writer engine supports."""
pass
@property
@abc.abstractmethod
def engine(self):
"""Name of engine."""
pass
@abc.abstractmethod
def write_cells(
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
):
"""
        Write given formatted cells into an Excel sheet
Parameters
----------
cells : generator
cell of formatted data to save to Excel sheet
sheet_name : string, default None
Name of Excel sheet, if None, then use self.cur_sheet
startrow : upper left cell row to dump data frame
startcol : upper left cell column to dump data frame
freeze_panes: integer tuple of length 2
contains the bottom-most row and right-most column to freeze
"""
pass
@abc.abstractmethod
def save(self):
"""
Save workbook to disk.
"""
pass
def __init__(
self,
path,
engine=None,
date_format=None,
datetime_format=None,
mode="w",
**engine_kwargs
):
# validate that this engine can handle the extension
if isinstance(path, str):
ext = os.path.splitext(path)[-1]
else:
ext = "xls" if engine == "xlwt" else "xlsx"
self.check_extension(ext)
self.path = path
self.sheets = {}
self.cur_sheet = None
if date_format is None:
self.date_format = "YYYY-MM-DD"
else:
self.date_format = date_format
if datetime_format is None:
self.datetime_format = "YYYY-MM-DD HH:MM:SS"
else:
self.datetime_format = datetime_format
self.mode = mode
def __fspath__(self):
return _stringify_path(self.path)
def _get_sheet_name(self, sheet_name):
if sheet_name is None:
sheet_name = self.cur_sheet
if sheet_name is None: # pragma: no cover
raise ValueError(
"Must pass explicit sheet_name or set " "cur_sheet property"
)
return sheet_name
def _value_with_fmt(self, val):
"""Convert numpy types to Python types for the Excel writers.
Parameters
----------
val : object
Value to be written into cells
Returns
-------
Tuple with the first element being the converted value and the second
being an optional format
"""
fmt = None
if is_integer(val):
val = int(val)
elif is_float(val):
val = float(val)
elif is_bool(val):
val = bool(val)
elif isinstance(val, datetime):
fmt = self.datetime_format
elif isinstance(val, date):
fmt = self.date_format
elif isinstance(val, timedelta):
val = val.total_seconds() / float(86400)
fmt = "0"
else:
val = str(val)
return val, fmt
@classmethod
def check_extension(cls, ext):
"""checks that path's extension against the Writer's supported
extensions. If it isn't supported, raises UnsupportedFiletypeError."""
if ext.startswith("."):
ext = ext[1:]
if not any(ext in extension for extension in cls.supported_extensions):
msg = "Invalid extension for engine '{engine}': '{ext}'".format(
engine=pprint_thing(cls.engine), ext=pprint_thing(ext)
)
raise ValueError(msg)
else:
return True
# Allow use as a contextmanager
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def close(self):
"""synonym for save, to make it more file-like"""
return self.save()
class ExcelFile:
"""
Class for parsing tabular excel sheets into DataFrame objects.
Uses xlrd. See read_excel for more documentation
Parameters
----------
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object or xlrd workbook
If a string or path object, expected to be a path to xls or xlsx file.
engine : string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or ``xlrd``.
"""
from pandas.io.excel._odfreader import _ODFReader
from pandas.io.excel._openpyxl import _OpenpyxlReader
from pandas.io.excel._xlrd import _XlrdReader
_engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader}
def __init__(self, io, engine=None):
if engine is None:
engine = "xlrd"
if engine not in self._engines:
raise ValueError("Unknown engine: {engine}".format(engine=engine))
self.engine = engine
# could be a str, ExcelFile, Book, etc.
self.io = io
# Always a string
self._io = _stringify_path(io)
self._reader = self._engines[engine](self._io)
def __fspath__(self):
return self._io
def parse(
self,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
converters=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
**kwds
):
"""
Parse specified sheet(s) into a DataFrame
Equivalent to read_excel(ExcelFile, ...) See the read_excel
docstring for more info on accepted parameters
Returns
-------
DataFrame or dict of DataFrames
DataFrame from the passed in Excel file.
"""
if "chunksize" in kwds:
raise NotImplementedError(
"chunksize keyword of read_excel " "is not implemented"
)
return self._reader.parse(
sheet_name=sheet_name,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
squeeze=squeeze,
converters=converters,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds
)
@property
def book(self):
return self._reader.book
@property
def sheet_names(self):
return self._reader.sheet_names
def close(self):
"""close io if necessary"""
if hasattr(self.io, "close"):
self.io.close()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
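
ExcelFile keeps a single open workbook and parses sheets from it on demand. A minimal sketch, assuming xlrd is installed and report.xlsx is a hypothetical multi-sheet workbook:

import pandas as pd

with pd.ExcelFile("report.xlsx") as xls:      # xlrd is the default engine
    print(xls.sheet_names)                    # delegates to the underlying reader
    first = xls.parse(sheet_name=0, index_col=0)
    everything = xls.parse(sheet_name=None)   # dict of DataFrames keyed by sheet name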


@@ -0,0 +1,180 @@
from typing import List
from pandas.compat._optional import import_optional_dependency
import pandas as pd
from pandas._typing import FilePathOrBuffer, Scalar
from pandas.io.excel._base import _BaseExcelReader
class _ODFReader(_BaseExcelReader):
"""Read tables out of OpenDocument formatted files
Parameters
----------
filepath_or_buffer: string, path to be parsed or
an open readable stream.
"""
def __init__(self, filepath_or_buffer: FilePathOrBuffer):
import_optional_dependency("odf")
super().__init__(filepath_or_buffer)
@property
def _workbook_class(self):
from odf.opendocument import OpenDocument
return OpenDocument
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
from odf.opendocument import load
return load(filepath_or_buffer)
@property
def empty_value(self) -> str:
"""Property for compat with other readers."""
return ""
@property
def sheet_names(self) -> List[str]:
"""Return a list of sheet names present in the document"""
from odf.table import Table
tables = self.book.getElementsByType(Table)
return [t.getAttribute("name") for t in tables]
def get_sheet_by_index(self, index: int):
from odf.table import Table
tables = self.book.getElementsByType(Table)
return tables[index]
def get_sheet_by_name(self, name: str):
from odf.table import Table
tables = self.book.getElementsByType(Table)
for table in tables:
if table.getAttribute("name") == name:
return table
raise ValueError("sheet {name} not found".format(name))
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
"""Parse an ODF Table into a list of lists
"""
from odf.table import CoveredTableCell, TableCell, TableRow
covered_cell_name = CoveredTableCell().qname
table_cell_name = TableCell().qname
cell_names = {covered_cell_name, table_cell_name}
sheet_rows = sheet.getElementsByType(TableRow)
empty_rows = 0
max_row_len = 0
table = [] # type: List[List[Scalar]]
for i, sheet_row in enumerate(sheet_rows):
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row = [] # type: List[Scalar]
for j, sheet_cell in enumerate(sheet_cells):
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell, convert_float)
else:
value = self.empty_value
column_repeat = self._get_column_repeat(sheet_cell)
# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
if max_row_len < len(table_row):
max_row_len = len(table_row)
row_repeat = self._get_row_repeat(sheet_row)
if self._is_empty_row(sheet_row):
empty_rows += row_repeat
else:
# add blank rows to our table
table.extend([[self.empty_value]] * empty_rows)
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
# Make our table square
for row in table:
if len(row) < max_row_len:
row.extend([self.empty_value] * (max_row_len - len(row)))
return table
def _get_row_repeat(self, row) -> int:
"""Return number of times this row was repeated
Repeating an empty row appeared to be a common way
of representing sparse rows in the table.
"""
from odf.namespaces import TABLENS
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
def _get_column_repeat(self, cell) -> int:
from odf.namespaces import TABLENS
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
def _is_empty_row(self, row) -> bool:
"""Helper function to find empty rows
"""
for column in row.childNodes:
if len(column.childNodes) > 0:
return False
return True
def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
from odf.namespaces import OFFICENS
cell_type = cell.attributes.get((OFFICENS, "value-type"))
if cell_type == "boolean":
if str(cell) == "TRUE":
return True
return False
if cell_type is None:
return self.empty_value
elif cell_type == "float":
# GH5394
cell_value = float(cell.attributes.get((OFFICENS, "value")))
if cell_value == 0.0 and str(cell) != cell_value: # NA handling
return str(cell)
if convert_float:
val = int(cell_value)
if val == cell_value:
return val
return cell_value
elif cell_type == "percentage":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "string":
return str(cell)
elif cell_type == "currency":
cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
elif cell_type == "date":
cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.to_datetime(cell_value)
elif cell_type == "time":
return pd.to_datetime(str(cell)).time()
else:
raise ValueError("Unrecognized type {}".format(cell_type))

View File

@@ -0,0 +1,522 @@
from typing import List
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas._typing import FilePathOrBuffer, Scalar
from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
from pandas.io.excel._util import _validate_freeze_panes
class _OpenpyxlWriter(ExcelWriter):
engine = "openpyxl"
supported_extensions = (".xlsx", ".xlsm")
def __init__(self, path, engine=None, mode="w", **engine_kwargs):
# Use the openpyxl module as the Excel writer.
from openpyxl.workbook import Workbook
super().__init__(path, mode=mode, **engine_kwargs)
if self.mode == "a": # Load from existing workbook
from openpyxl import load_workbook
book = load_workbook(self.path)
self.book = book
else:
# Create workbook object with default optimized_write=True.
self.book = Workbook()
if self.book.worksheets:
try:
self.book.remove(self.book.worksheets[0])
except AttributeError:
# compat - for openpyxl <= 2.4
self.book.remove_sheet(self.book.worksheets[0])
def save(self):
"""
Save workbook to disk.
"""
return self.book.save(self.path)
@classmethod
def _convert_to_style(cls, style_dict):
"""
converts a style_dict to an openpyxl style object
Parameters
----------
style_dict : style dictionary to convert
"""
from openpyxl.style import Style
xls_style = Style()
for key, value in style_dict.items():
for nk, nv in value.items():
if key == "borders":
(
xls_style.borders.__getattribute__(nk).__setattr__(
"border_style", nv
)
)
else:
xls_style.__getattribute__(key).__setattr__(nk, nv)
return xls_style
@classmethod
def _convert_to_style_kwargs(cls, style_dict):
"""
Convert a style_dict to a set of kwargs suitable for initializing
or updating-on-copy an openpyxl v2 style object
Parameters
----------
style_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'font'
'fill'
'border' ('borders')
'alignment'
'number_format'
'protection'
Returns
-------
style_kwargs : dict
A dict with the same, normalized keys as ``style_dict`` but each
value has been replaced with a native openpyxl style object of the
appropriate class.
"""
_style_key_map = {"borders": "border"}
style_kwargs = {}
for k, v in style_dict.items():
if k in _style_key_map:
k = _style_key_map[k]
_conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None)
new_v = _conv_to_x(v)
if new_v:
style_kwargs[k] = new_v
return style_kwargs
@classmethod
def _convert_to_color(cls, color_spec):
"""
Convert ``color_spec`` to an openpyxl v2 Color object
Parameters
----------
color_spec : str, dict
A 32-bit ARGB hex string, or a dict with zero or more of the
following keys.
'rgb'
'indexed'
'auto'
'theme'
'tint'
'index'
'type'
Returns
-------
color : openpyxl.styles.Color
"""
from openpyxl.styles import Color
if isinstance(color_spec, str):
return Color(color_spec)
else:
return Color(**color_spec)
@classmethod
def _convert_to_font(cls, font_dict):
"""
Convert ``font_dict`` to an openpyxl v2 Font object
Parameters
----------
font_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'name'
'size' ('sz')
'bold' ('b')
'italic' ('i')
'underline' ('u')
'strikethrough' ('strike')
'color'
'vertAlign' ('vertalign')
'charset'
'scheme'
'family'
'outline'
'shadow'
'condense'
Returns
-------
font : openpyxl.styles.Font
"""
from openpyxl.styles import Font
_font_key_map = {
"sz": "size",
"b": "bold",
"i": "italic",
"u": "underline",
"strike": "strikethrough",
"vertalign": "vertAlign",
}
font_kwargs = {}
for k, v in font_dict.items():
if k in _font_key_map:
k = _font_key_map[k]
if k == "color":
v = cls._convert_to_color(v)
font_kwargs[k] = v
return Font(**font_kwargs)
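# A sketch of how a nested style_dict flows through the converters above
# (input values assumed for illustration):
#
#   kwargs = _OpenpyxlWriter._convert_to_style_kwargs(
#       {"font": {"bold": True, "color": "FF0000"}}
#   )
#   # kwargs == {"font": <openpyxl Font with bold=True and a red Color>}
#
# The "font" key is dispatched to ``_convert_to_font`` by the
# ``_convert_to_{k}`` lookup, and the nested "color" value goes through
# ``_convert_to_color``.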
@classmethod
def _convert_to_stop(cls, stop_seq):
"""
Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
suitable for initializing the ``GradientFill`` ``stop`` parameter.
Parameters
----------
stop_seq : iterable
An iterable that yields objects suitable for consumption by
``_convert_to_color``.
Returns
-------
stop : list of openpyxl.styles.Color
"""
return map(cls._convert_to_color, stop_seq)
@classmethod
def _convert_to_fill(cls, fill_dict):
"""
Convert ``fill_dict`` to an openpyxl v2 Fill object
Parameters
----------
fill_dict : dict
A dict with one or more of the following keys (or their synonyms),
'fill_type' ('patternType', 'patterntype')
'start_color' ('fgColor', 'fgcolor')
'end_color' ('bgColor', 'bgcolor')
or one or more of the following keys (or their synonyms).
'type' ('fill_type')
'degree'
'left'
'right'
'top'
'bottom'
'stop'
Returns
-------
fill : openpyxl.styles.Fill
"""
from openpyxl.styles import PatternFill, GradientFill
_pattern_fill_key_map = {
"patternType": "fill_type",
"patterntype": "fill_type",
"fgColor": "start_color",
"fgcolor": "start_color",
"bgColor": "end_color",
"bgcolor": "end_color",
}
_gradient_fill_key_map = {"fill_type": "type"}
pfill_kwargs = {}
gfill_kwargs = {}
for k, v in fill_dict.items():
pk = gk = None
if k in _pattern_fill_key_map:
pk = _pattern_fill_key_map[k]
if k in _gradient_fill_key_map:
gk = _gradient_fill_key_map[k]
if pk in ["start_color", "end_color"]:
v = cls._convert_to_color(v)
if gk == "stop":
v = cls._convert_to_stop(v)
if pk:
pfill_kwargs[pk] = v
elif gk:
gfill_kwargs[gk] = v
else:
pfill_kwargs[k] = v
gfill_kwargs[k] = v
try:
return PatternFill(**pfill_kwargs)
except TypeError:
return GradientFill(**gfill_kwargs)
@classmethod
def _convert_to_side(cls, side_spec):
"""
Convert ``side_spec`` to an openpyxl v2 Side object
Parameters
----------
side_spec : str, dict
A string specifying the border style, or a dict with zero or more
of the following keys (or their synonyms).
'style' ('border_style')
'color'
Returns
-------
side : openpyxl.styles.Side
"""
from openpyxl.styles import Side
_side_key_map = {"border_style": "style"}
if isinstance(side_spec, str):
return Side(style=side_spec)
side_kwargs = {}
for k, v in side_spec.items():
if k in _side_key_map:
k = _side_key_map[k]
if k == "color":
v = cls._convert_to_color(v)
side_kwargs[k] = v
return Side(**side_kwargs)
@classmethod
def _convert_to_border(cls, border_dict):
"""
Convert ``border_dict`` to an openpyxl v2 Border object
Parameters
----------
border_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'left'
'right'
'top'
'bottom'
'diagonal'
'diagonal_direction'
'vertical'
'horizontal'
'diagonalUp' ('diagonalup')
'diagonalDown' ('diagonaldown')
'outline'
Returns
-------
border : openpyxl.styles.Border
"""
from openpyxl.styles import Border
_border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
border_kwargs = {}
for k, v in border_dict.items():
if k in _border_key_map:
k = _border_key_map[k]
if k == "color":
v = cls._convert_to_color(v)
if k in ["left", "right", "top", "bottom", "diagonal"]:
v = cls._convert_to_side(v)
border_kwargs[k] = v
return Border(**border_kwargs)
@classmethod
def _convert_to_alignment(cls, alignment_dict):
"""
Convert ``alignment_dict`` to an openpyxl v2 Alignment object
Parameters
----------
alignment_dict : dict
A dict with zero or more of the following keys (or their synonyms).
'horizontal'
'vertical'
'text_rotation'
'wrap_text'
'shrink_to_fit'
'indent'
Returns
-------
alignment : openpyxl.styles.Alignment
"""
from openpyxl.styles import Alignment
return Alignment(**alignment_dict)
@classmethod
def _convert_to_number_format(cls, number_format_dict):
"""
Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
initializer.
Parameters
----------
number_format_dict : dict
A dict with zero or more of the following keys.
'format_code' : str
Returns
-------
number_format : str
"""
return number_format_dict["format_code"]
@classmethod
def _convert_to_protection(cls, protection_dict):
"""
Convert ``protection_dict`` to an openpyxl v2 Protection object.
Parameters
----------
protection_dict : dict
A dict with zero or more of the following keys.
'locked'
'hidden'
Returns
-------
protection : openpyxl.styles.Protection
"""
from openpyxl.styles import Protection
return Protection(**protection_dict)
def write_cells(
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
):
# Write the frame cells using openpyxl.
sheet_name = self._get_sheet_name(sheet_name)
_style_cache = {}
if sheet_name in self.sheets:
wks = self.sheets[sheet_name]
else:
wks = self.book.create_sheet()
wks.title = sheet_name
self.sheets[sheet_name] = wks
if _validate_freeze_panes(freeze_panes):
wks.freeze_panes = wks.cell(
row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
)
for cell in cells:
xcell = wks.cell(
row=startrow + cell.row + 1, column=startcol + cell.col + 1
)
xcell.value, fmt = self._value_with_fmt(cell.val)
if fmt:
xcell.number_format = fmt
style_kwargs = {}
if cell.style:
key = str(cell.style)
style_kwargs = _style_cache.get(key)
if style_kwargs is None:
style_kwargs = self._convert_to_style_kwargs(cell.style)
_style_cache[key] = style_kwargs
if style_kwargs:
for k, v in style_kwargs.items():
setattr(xcell, k, v)
if cell.mergestart is not None and cell.mergeend is not None:
wks.merge_cells(
start_row=startrow + cell.row + 1,
start_column=startcol + cell.col + 1,
end_column=startcol + cell.mergeend + 1,
end_row=startrow + cell.mergestart + 1,
)
# When cells are merged only the top-left cell is preserved
# The behaviour of the other cells in a merged range is
# undefined
if style_kwargs:
first_row = startrow + cell.row + 1
last_row = startrow + cell.mergestart + 1
first_col = startcol + cell.col + 1
last_col = startcol + cell.mergeend + 1
for row in range(first_row, last_row + 1):
for col in range(first_col, last_col + 1):
if row == first_row and col == first_col:
# Ignore first cell. It is already handled.
continue
xcell = wks.cell(column=col, row=row)
for k, v in style_kwargs.items():
setattr(xcell, k, v)
class _OpenpyxlReader(_BaseExcelReader):
def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
"""Reader using openpyxl engine.
Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
import_optional_dependency("openpyxl")
super().__init__(filepath_or_buffer)
@property
def _workbook_class(self):
from openpyxl import Workbook
return Workbook
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
from openpyxl import load_workbook
return load_workbook(
filepath_or_buffer, read_only=True, data_only=True, keep_links=False
)
@property
def sheet_names(self) -> List[str]:
return self.book.sheetnames
def get_sheet_by_name(self, name: str):
return self.book[name]
def get_sheet_by_index(self, index: int):
return self.book.worksheets[index]
def _convert_cell(self, cell, convert_float: bool) -> Scalar:
# TODO: replace with openpyxl constants
if cell.is_date:
return cell.value
elif cell.data_type == "e":
return np.nan
elif cell.data_type == "b":
return bool(cell.value)
elif cell.value is None:
return "" # compat with xlrd
elif cell.data_type == "n":
# GH5394
if convert_float:
val = int(cell.value)
if val == cell.value:
return val
else:
return float(cell.value)
return cell.value
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
data = [] # type: List[List[Scalar]]
for row in sheet.rows:
data.append([self._convert_cell(cell, convert_float) for cell in row])
return data
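# Worked examples of the conversion rules in ``_convert_cell`` above
# (cell values assumed for illustration):
#
#   error cell   (data_type "e")   -> np.nan
#   boolean      (data_type "b")   -> True / False
#   empty cell   (value is None)   -> ""        (compat with xlrd)
#   numeric cell (data_type "n")   -> 1.0 becomes 1 when convert_float is
#                                     True; 1.5 stays 1.5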

View File

@@ -0,0 +1,236 @@
import warnings
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.common import is_integer, is_list_like
_writers = {}
def register_writer(klass):
"""
Add engine to the excel writer registry ``io.excel``.
You must use this method to integrate with ``to_excel``.
Parameters
----------
klass : ExcelWriter
"""
if not callable(klass):
raise ValueError("Can only register callables as engines")
engine_name = klass.engine
_writers[engine_name] = klass
def _get_default_writer(ext):
"""
Return the default writer for the given extension.
Parameters
----------
ext : str
The excel file extension for which to get the default engine.
Returns
-------
str
The default engine for the extension.
"""
_default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"}
xlsxwriter = import_optional_dependency(
"xlsxwriter", raise_on_missing=False, on_version="warn"
)
if xlsxwriter:
_default_writers["xlsx"] = "xlsxwriter"
return _default_writers[ext]
def get_writer(engine_name):
try:
return _writers[engine_name]
except KeyError:
raise ValueError("No Excel writer '{engine}'".format(engine=engine_name))
def _excel2num(x):
"""
Convert Excel column name like 'AB' to 0-based column index.
Parameters
----------
x : str
The Excel column name to convert to a 0-based column index.
Returns
-------
num : int
The column index corresponding to the name.
Raises
------
ValueError
Part of the Excel column name was invalid.
"""
index = 0
for c in x.upper().strip():
cp = ord(c)
if cp < ord("A") or cp > ord("Z"):
raise ValueError("Invalid column name: {x}".format(x=x))
index = index * 26 + cp - ord("A") + 1
return index - 1
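# Worked values for the base-26 conversion above:
#
#   >>> _excel2num("A")
#   0
#   >>> _excel2num("AB")
#   27
#   >>> _excel2num("A1")    # digits are rejected
#   Traceback (most recent call last):
#       ...
#   ValueError: Invalid column name: A1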
def _range2cols(areas):
"""
Convert comma separated list of column names and ranges to indices.
Parameters
----------
areas : str
A string containing a sequence of column ranges (or areas).
Returns
-------
cols : list
A list of 0-based column indices.
Examples
--------
>>> _range2cols('A:E')
[0, 1, 2, 3, 4]
>>> _range2cols('A,C,Z:AB')
[0, 2, 25, 26, 27]
"""
cols = []
for rng in areas.split(","):
if ":" in rng:
rng = rng.split(":")
cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1))
else:
cols.append(_excel2num(rng))
return cols
def _maybe_convert_usecols(usecols):
"""
Convert `usecols` into a compatible format for parsing in `parsers.py`.
Parameters
----------
usecols : object
The use-columns object to potentially convert.
Returns
-------
converted : object
The compatible format of `usecols`.
"""
if usecols is None:
return usecols
if is_integer(usecols):
warnings.warn(
(
"Passing in an integer for `usecols` has been "
"deprecated. Please pass in a list of int from "
"0 to `usecols` inclusive instead."
),
FutureWarning,
stacklevel=2,
)
return list(range(usecols + 1))
if isinstance(usecols, str):
return _range2cols(usecols)
return usecols
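# Worked examples for the conversion above (inputs assumed for illustration):
#
#   >>> _maybe_convert_usecols("A,C:D")
#   [0, 2, 3]
#   >>> _maybe_convert_usecols(2)    # deprecated integer form, warns
#   [0, 1, 2]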
def _validate_freeze_panes(freeze_panes):
if freeze_panes is not None:
if len(freeze_panes) == 2 and all(
isinstance(item, int) for item in freeze_panes
):
return True
raise ValueError(
"freeze_panes must be of form (row, column)"
" where row and column are integers"
)
# freeze_panes wasn't specified, return False so it won't be applied
# to output sheet
return False
def _trim_excel_header(row):
# trim header row so auto-index inference works
# xlrd uses '' , openpyxl None
while len(row) > 0 and (row[0] == "" or row[0] is None):
row = row[1:]
return row
def _fill_mi_header(row, control_row):
"""Forward fill blank entries in row but only inside the same parent index.
Used for creating headers in Multiindex.
Parameters
----------
row : list
List of items in a single row.
control_row : list of bool
Helps to determine if particular column is in same parent index as the
previous value. Used to stop propagation of empty cells between
different indexes.
Returns
-------
Returns changed row and control_row
"""
last = row[0]
for i in range(1, len(row)):
if not control_row[i]:
last = row[i]
if row[i] == "" or row[i] is None:
row[i] = last
else:
control_row[i] = False
last = row[i]
return row, control_row
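# A worked example of the forward fill above (inputs assumed for
# illustration):
#
#   >>> _fill_mi_header(["a", "", "b", ""], [True, True, True, True])
#   (['a', 'a', 'b', 'b'], [True, True, False, True])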
def _pop_header_name(row, index_col):
"""
Pop the header name for MultiIndex parsing.
Parameters
----------
row : list
The data row to parse for the header name.
index_col : int, list
The index columns for our data. Assumed to be non-null.
Returns
-------
header_name : str
The extracted header name.
trimmed_row : list
The original data row with the header name removed.
"""
# Pop out header name and fill w/blank.
i = index_col if not is_list_like(index_col) else max(index_col)
header_name = row[i]
header_name = None if header_name == "" else header_name
return header_name, row[:i] + [""] + row[i + 1 :]
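# A worked example of popping the header name (inputs assumed for
# illustration):
#
#   >>> _pop_header_name(["foo", "a", "b"], index_col=0)
#   ('foo', ['', 'a', 'b'])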

View File

@@ -0,0 +1,106 @@
from datetime import time
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.io.excel._base import _BaseExcelReader
class _XlrdReader(_BaseExcelReader):
def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.
Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
err_msg = "Install xlrd >= 1.0.0 for Excel support"
import_optional_dependency("xlrd", extra=err_msg)
super().__init__(filepath_or_buffer)
@property
def _workbook_class(self):
from xlrd import Book
return Book
def load_workbook(self, filepath_or_buffer):
from xlrd import open_workbook
if hasattr(filepath_or_buffer, "read"):
data = filepath_or_buffer.read()
return open_workbook(file_contents=data)
else:
return open_workbook(filepath_or_buffer)
@property
def sheet_names(self):
return self.book.sheet_names()
def get_sheet_by_name(self, name):
return self.book.sheet_by_name(name)
def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)
def get_sheet_data(self, sheet, convert_float):
from xlrd import (
xldate,
XL_CELL_DATE,
XL_CELL_ERROR,
XL_CELL_BOOLEAN,
XL_CELL_NUMBER,
)
epoch1904 = self.book.datemode
def _parse_cell(cell_contents, cell_typ):
"""converts the contents of the cell into a pandas
appropriate object"""
if cell_typ == XL_CELL_DATE:
# Use the newer xlrd datetime handling.
try:
cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
except OverflowError:
return cell_contents
# Excel doesn't distinguish between dates and time,
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (cell_contents.timetuple())[0:3]
if (not epoch1904 and year == (1899, 12, 31)) or (
epoch1904 and year == (1904, 1, 1)
):
cell_contents = time(
cell_contents.hour,
cell_contents.minute,
cell_contents.second,
cell_contents.microsecond,
)
elif cell_typ == XL_CELL_ERROR:
cell_contents = np.nan
elif cell_typ == XL_CELL_BOOLEAN:
cell_contents = bool(cell_contents)
elif convert_float and cell_typ == XL_CELL_NUMBER:
# GH5394 - Excel 'numbers' are always floats
# it's a minimal perf hit and less surprising
val = int(cell_contents)
if val == cell_contents:
cell_contents = val
return cell_contents
data = []
for i in range(sheet.nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)
return data
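# Worked examples of the GH5394 rule in ``_parse_cell`` above, for assumed
# number cells with convert_float enabled:
#
#   1.0 -> 1       (integral floats are downcast to int)
#   1.5 -> 1.5     (non-integral floats are left alone)
#
# Date cells whose value falls on the workbook epoch (1899-12-31 or
# 1904-01-01) are returned as datetime.time rather than datetime.datetime.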

View File

@@ -0,0 +1,237 @@
import pandas._libs.json as json
from pandas.io.excel._base import ExcelWriter
from pandas.io.excel._util import _validate_freeze_panes
class _XlsxStyler:
# Map from openpyxl-oriented styles to flatter xlsxwriter representation
# Ordering necessary for both determinism and because some are keyed by
# prefixes of others.
STYLE_MAPPING = {
"font": [
(("name",), "font_name"),
(("sz",), "font_size"),
(("size",), "font_size"),
(("color", "rgb"), "font_color"),
(("color",), "font_color"),
(("b",), "bold"),
(("bold",), "bold"),
(("i",), "italic"),
(("italic",), "italic"),
(("u",), "underline"),
(("underline",), "underline"),
(("strike",), "font_strikeout"),
(("vertAlign",), "font_script"),
(("vertalign",), "font_script"),
],
"number_format": [(("format_code",), "num_format"), ((), "num_format")],
"protection": [(("locked",), "locked"), (("hidden",), "hidden")],
"alignment": [
(("horizontal",), "align"),
(("vertical",), "valign"),
(("text_rotation",), "rotation"),
(("wrap_text",), "text_wrap"),
(("indent",), "indent"),
(("shrink_to_fit",), "shrink"),
],
"fill": [
(("patternType",), "pattern"),
(("patterntype",), "pattern"),
(("fill_type",), "pattern"),
(("start_color", "rgb"), "fg_color"),
(("fgColor", "rgb"), "fg_color"),
(("fgcolor", "rgb"), "fg_color"),
(("start_color",), "fg_color"),
(("fgColor",), "fg_color"),
(("fgcolor",), "fg_color"),
(("end_color", "rgb"), "bg_color"),
(("bgColor", "rgb"), "bg_color"),
(("bgcolor", "rgb"), "bg_color"),
(("end_color",), "bg_color"),
(("bgColor",), "bg_color"),
(("bgcolor",), "bg_color"),
],
"border": [
(("color", "rgb"), "border_color"),
(("color",), "border_color"),
(("style",), "border"),
(("top", "color", "rgb"), "top_color"),
(("top", "color"), "top_color"),
(("top", "style"), "top"),
(("top",), "top"),
(("right", "color", "rgb"), "right_color"),
(("right", "color"), "right_color"),
(("right", "style"), "right"),
(("right",), "right"),
(("bottom", "color", "rgb"), "bottom_color"),
(("bottom", "color"), "bottom_color"),
(("bottom", "style"), "bottom"),
(("bottom",), "bottom"),
(("left", "color", "rgb"), "left_color"),
(("left", "color"), "left_color"),
(("left", "style"), "left"),
(("left",), "left"),
],
}
@classmethod
def convert(cls, style_dict, num_format_str=None):
"""
converts a style_dict to an xlsxwriter format dict
Parameters
----------
style_dict : style dictionary to convert
num_format_str : optional number format string
"""
# Create a XlsxWriter format object.
props = {}
if num_format_str is not None:
props["num_format"] = num_format_str
if style_dict is None:
return props
if "borders" in style_dict:
style_dict = style_dict.copy()
style_dict["border"] = style_dict.pop("borders")
for style_group_key, style_group in style_dict.items():
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
# src is a sequence of keys into a nested dict
# dst is a flat key
if dst in props:
continue
v = style_group
for k in src:
try:
v = v[k]
except (KeyError, TypeError):
break
else:
props[dst] = v
if isinstance(props.get("pattern"), str):
# TODO: support other fill patterns
props["pattern"] = 0 if props["pattern"] == "none" else 1
for k in ["border", "top", "right", "bottom", "left"]:
if isinstance(props.get(k), str):
try:
props[k] = [
"none",
"thin",
"medium",
"dashed",
"dotted",
"thick",
"double",
"hair",
"mediumDashed",
"dashDot",
"mediumDashDot",
"dashDotDot",
"mediumDashDotDot",
"slantDashDot",
].index(props[k])
except ValueError:
props[k] = 2
if isinstance(props.get("font_script"), str):
props["font_script"] = ["baseline", "superscript", "subscript"].index(
props["font_script"]
)
if isinstance(props.get("underline"), str):
props["underline"] = {
"none": 0,
"single": 1,
"double": 2,
"singleAccounting": 33,
"doubleAccounting": 34,
}[props["underline"]]
return props
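# A worked example of the flattening performed by ``convert`` (input assumed
# for illustration):
#
#   >>> _XlsxStyler.convert({"font": {"bold": True, "color": "FF0000"}})
#   {'font_color': 'FF0000', 'bold': True}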
class _XlsxWriter(ExcelWriter):
engine = "xlsxwriter"
supported_extensions = (".xlsx",)
def __init__(
self,
path,
engine=None,
date_format=None,
datetime_format=None,
mode="w",
**engine_kwargs
):
# Use the xlsxwriter module as the Excel writer.
import xlsxwriter
if mode == "a":
raise ValueError("Append mode is not supported with xlsxwriter!")
super().__init__(
path,
engine=engine,
date_format=date_format,
datetime_format=datetime_format,
mode=mode,
**engine_kwargs
)
self.book = xlsxwriter.Workbook(path, **engine_kwargs)
def save(self):
"""
Save workbook to disk.
"""
return self.book.close()
def write_cells(
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
):
# Write the frame cells using xlsxwriter.
sheet_name = self._get_sheet_name(sheet_name)
if sheet_name in self.sheets:
wks = self.sheets[sheet_name]
else:
wks = self.book.add_worksheet(sheet_name)
self.sheets[sheet_name] = wks
style_dict = {"null": None}
if _validate_freeze_panes(freeze_panes):
wks.freeze_panes(*(freeze_panes))
for cell in cells:
val, fmt = self._value_with_fmt(cell.val)
stylekey = json.dumps(cell.style)
if fmt:
stylekey += fmt
if stylekey in style_dict:
style = style_dict[stylekey]
else:
style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
style_dict[stylekey] = style
if cell.mergestart is not None and cell.mergeend is not None:
wks.merge_range(
startrow + cell.row,
startcol + cell.col,
startrow + cell.mergestart,
startcol + cell.mergeend,
val,
style,
)
else:
wks.write(startrow + cell.row, startcol + cell.col, val, style)

View File

@@ -0,0 +1,135 @@
import pandas._libs.json as json
from pandas.io.excel._base import ExcelWriter
from pandas.io.excel._util import _validate_freeze_panes
class _XlwtWriter(ExcelWriter):
engine = "xlwt"
supported_extensions = (".xls",)
def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs):
# Use the xlwt module as the Excel writer.
import xlwt
engine_kwargs["engine"] = engine
if mode == "a":
raise ValueError("Append mode is not supported with xlwt!")
super().__init__(path, mode=mode, **engine_kwargs)
if encoding is None:
encoding = "ascii"
self.book = xlwt.Workbook(encoding=encoding)
self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format)
self.fm_date = xlwt.easyxf(num_format_str=self.date_format)
def save(self):
"""
Save workbook to disk.
"""
return self.book.save(self.path)
def write_cells(
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
):
# Write the frame cells using xlwt.
sheet_name = self._get_sheet_name(sheet_name)
if sheet_name in self.sheets:
wks = self.sheets[sheet_name]
else:
wks = self.book.add_sheet(sheet_name)
self.sheets[sheet_name] = wks
if _validate_freeze_panes(freeze_panes):
wks.set_panes_frozen(True)
wks.set_horz_split_pos(freeze_panes[0])
wks.set_vert_split_pos(freeze_panes[1])
style_dict = {}
for cell in cells:
val, fmt = self._value_with_fmt(cell.val)
stylekey = json.dumps(cell.style)
if fmt:
stylekey += fmt
if stylekey in style_dict:
style = style_dict[stylekey]
else:
style = self._convert_to_style(cell.style, fmt)
style_dict[stylekey] = style
if cell.mergestart is not None and cell.mergeend is not None:
wks.write_merge(
startrow + cell.row,
startrow + cell.mergestart,
startcol + cell.col,
startcol + cell.mergeend,
val,
style,
)
else:
wks.write(startrow + cell.row, startcol + cell.col, val, style)
@classmethod
def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"):
"""helper which recursively generate an xlwt easy style string
for example:
hstyle = {"font": {"bold": True},
"border": {"top": "thin",
"right": "thin",
"bottom": "thin",
"left": "thin"},
"align": {"horiz": "center"}}
will be converted to
font: bold on; \
border: top thin, right thin, bottom thin, left thin; \
align: horiz center;
"""
if hasattr(item, "items"):
if firstlevel:
it = [
"{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False))
for key, value in item.items()
]
out = "{sep} ".format(sep=(line_sep).join(it))
return out
else:
it = [
"{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False))
for key, value in item.items()
]
out = "{sep} ".format(sep=(field_sep).join(it))
return out
else:
item = "{item}".format(item=item)
item = item.replace("True", "on")
item = item.replace("False", "off")
return item
@classmethod
def _convert_to_style(cls, style_dict, num_format_str=None):
"""
converts a style_dict to an xlwt style object
Parameters
----------
style_dict : style dictionary to convert
num_format_str : optional number format string
"""
import xlwt
if style_dict:
xlwt_stylestr = cls._style_to_xlwt(style_dict)
style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";")
else:
style = xlwt.XFStyle()
if num_format_str is not None:
style.num_format_str = num_format_str
return style

View File

@@ -0,0 +1,119 @@
""" feather-format compat """
from distutils.version import LooseVersion
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import deprecate_kwarg
from pandas import DataFrame, Int64Index, RangeIndex
from pandas.io.common import _stringify_path
def to_feather(df, path):
"""
Write a DataFrame to the feather-format
Parameters
----------
df : DataFrame
path : string file path, or file-like object
"""
import_optional_dependency("pyarrow")
from pyarrow import feather
path = _stringify_path(path)
if not isinstance(df, DataFrame):
raise ValueError("feather only support IO with DataFrames")
valid_types = {"string", "unicode"}
# validate index
# --------------
# validate that we have only a default index
# raise on anything else as we don't serialize the index
if not isinstance(df.index, Int64Index):
raise ValueError(
"feather does not support serializing {} "
"for the index; you can .reset_index()"
"to make the index into column(s)".format(type(df.index))
)
if not df.index.equals(RangeIndex.from_range(range(len(df)))):
raise ValueError(
"feather does not support serializing a "
"non-default index for the index; you "
"can .reset_index() to make the index "
"into column(s)"
)
if df.index.name is not None:
raise ValueError(
"feather does not serialize index meta-data on a " "default index"
)
# validate columns
# ----------------
# must have value column names (strings only)
if df.columns.inferred_type not in valid_types:
raise ValueError("feather must have string column names")
feather.write_feather(df, path)
@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads")
def read_feather(path, columns=None, use_threads=True):
"""
Load a feather-format object from the file path.
.. versionadded:: 0.20.0
Parameters
----------
path : str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.feather``.
If you want to pass in a path object, pandas accepts any
``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function)
or ``StringIO``.
columns : sequence, default None
If not provided, all columns are read.
.. versionadded:: 0.24.0
nthreads : int, default 1
Number of CPU threads to use when reading to pandas.DataFrame.
.. versionadded:: 0.21.0
.. deprecated:: 0.24.0
use_threads : bool, default True
Whether to parallelize reading using multiple threads.
.. versionadded:: 0.24.0
Returns
-------
type of object stored in file
"""
pyarrow = import_optional_dependency("pyarrow")
from pyarrow import feather
path = _stringify_path(path)
if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"):
int_use_threads = int(use_threads)
if int_use_threads < 1:
int_use_threads = 1
return feather.read_feather(path, columns=columns, nthreads=int_use_threads)
return feather.read_feather(path, columns=columns, use_threads=bool(use_threads))
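# A round-trip sketch (the file name is hypothetical and pyarrow is assumed
# to be installed):
#
#   >>> df = DataFrame({"a": [1, 2, 3]})
#   >>> to_feather(df, "data.feather")      # requires a default RangeIndex
#   >>> read_feather("data.feather")
#
# Frames with a custom index need ``df.reset_index()`` first, as enforced by
# the validation in ``to_feather``.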

View File

@@ -0,0 +1,83 @@
"""
Internal module for console introspection
"""
from shutil import get_terminal_size
def get_console_size():
"""Return console size as tuple = (width, height).
Returns (None,None) in non-interactive session.
"""
from pandas import get_option
display_width = get_option("display.width")
# deprecated.
display_height = get_option("display.max_rows")
# Consider three cases:
#   interactive shell terminal - can detect terminal size
#   interactive non-shell terminal (ipnb/ipqtconsole) - cannot detect
#     terminal size
#   non-interactive script - should disregard terminal size
# In addition, width/height have default values, but setting them to 'None'
# signals that auto-detection should be used, but only in an interactive
# shell terminal.
# Simple. yeah.
if in_interactive_session():
if in_ipython_frontend():
# sane defaults for interactive non-shell terminal
# match default for width,height in config_init
from pandas._config.config import get_default_val
terminal_width = get_default_val("display.width")
terminal_height = get_default_val("display.max_rows")
else:
# pure terminal
terminal_width, terminal_height = get_terminal_size()
else:
terminal_width, terminal_height = None, None
# Note: if the user sets width/height to None (auto-detection) and we're in
# a script (non-interactive), this will return (None, None); the caller
# needs to deal with that.
return (display_width or terminal_width, display_height or terminal_height)
# ----------------------------------------------------------------------
# Detect our environment
def in_interactive_session():
""" check if we're running in an interactive shell
returns True if running under python/ipython interactive shell
"""
from pandas import get_option
def check_main():
try:
import __main__ as main
except ModuleNotFoundError:
return get_option("mode.sim_interactive")
return not hasattr(main, "__file__") or get_option("mode.sim_interactive")
try:
return __IPYTHON__ or check_main() # noqa
except NameError:
return check_main()
def in_ipython_frontend():
"""
check if we're inside an IPython zmq frontend
"""
try:
ip = get_ipython() # noqa
return "zmq" in str(type(ip)).lower()
except NameError:
pass
return False

View File

@@ -0,0 +1,257 @@
"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
"""
import re
import warnings
class CSSWarning(UserWarning):
"""This CSS syntax cannot currently be parsed"""
pass
class CSSResolver:
"""A callable for parsing and resolving CSS to atomic properties
"""
def __call__(self, declarations_str, inherited=None):
""" the given declarations to atomic properties
Parameters
----------
declarations_str : str
A string of CSS declarations
inherited : dict, optional
Atomic properties indicating the inherited style context in which
declarations_str is to be resolved. ``inherited`` should already
be resolved, i.e. valid output of this method.
Returns
-------
props : dict
Atomic CSS 2.2 properties
Examples
--------
>>> resolve = CSSResolver()
>>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
>>> out = resolve('''
... border-color: BLUE RED;
... font-size: 1em;
... font-size: 2em;
... font-weight: normal;
... font-weight: inherit;
... ''', inherited)
>>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
[('border-bottom-color', 'blue'),
('border-left-color', 'red'),
('border-right-color', 'red'),
('border-top-color', 'blue'),
('font-family', 'serif'),
('font-size', '24pt'),
('font-weight', 'bold')]
"""
props = dict(self.atomize(self.parse(declarations_str)))
if inherited is None:
inherited = {}
# 1. resolve inherited, initial
for prop, val in inherited.items():
if prop not in props:
props[prop] = val
for prop, val in list(props.items()):
if val == "inherit":
val = inherited.get(prop, "initial")
if val == "initial":
val = None
if val is None:
# we do not define a complete initial stylesheet
del props[prop]
else:
props[prop] = val
# 2. resolve relative font size
if props.get("font-size"):
if "font-size" in inherited:
em_pt = inherited["font-size"]
assert em_pt[-2:] == "pt"
em_pt = float(em_pt[:-2])
else:
em_pt = None
props["font-size"] = self.size_to_pt(
props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS
)
font_size = float(props["font-size"][:-2])
else:
font_size = None
# 3. TODO: resolve other font-relative units
for side in self.SIDES:
prop = "border-{side}-width".format(side=side)
if prop in props:
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS
)
for prop in [
"margin-{side}".format(side=side),
"padding-{side}".format(side=side),
]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS
)
return props
UNIT_RATIOS = {
"rem": ("pt", 12),
"ex": ("em", 0.5),
# 'ch':
"px": ("pt", 0.75),
"pc": ("pt", 12),
"in": ("pt", 72),
"cm": ("in", 1 / 2.54),
"mm": ("in", 1 / 25.4),
"q": ("mm", 0.25),
"!!default": ("em", 0),
}
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
FONT_SIZE_RATIOS.update(
{
"%": ("em", 0.01),
"xx-small": ("rem", 0.5),
"x-small": ("rem", 0.625),
"small": ("rem", 0.8),
"medium": ("rem", 1),
"large": ("rem", 1.125),
"x-large": ("rem", 1.5),
"xx-large": ("rem", 2),
"smaller": ("em", 1 / 1.2),
"larger": ("em", 1.2),
"!!default": ("em", 1),
}
)
MARGIN_RATIOS = UNIT_RATIOS.copy()
MARGIN_RATIOS.update({"none": ("pt", 0)})
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
BORDER_WIDTH_RATIOS.update(
{
"none": ("pt", 0),
"thick": ("px", 4),
"medium": ("px", 2),
"thin": ("px", 1),
# Default: medium only if solid
}
)
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
def _error():
warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning)
return self.size_to_pt("1!!default", conversions=conversions)
try:
val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups()
except AttributeError:
return _error()
if val == "":
# hack for 'large' etc.
val = 1
else:
try:
val = float(val)
except ValueError:
return _error()
while unit != "pt":
if unit == "em":
if em_pt is None:
unit = "rem"
else:
val *= em_pt
unit = "pt"
continue
try:
unit, mul = conversions[unit]
except KeyError:
return _error()
val *= mul
val = round(val, 5)
if int(val) == val:
size_fmt = "{fmt:d}pt".format(fmt=int(val))
else:
size_fmt = "{fmt:f}pt".format(fmt=val)
return size_fmt
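# Worked conversions through UNIT_RATIOS (inputs assumed for illustration):
#
#   >>> CSSResolver().size_to_pt("16px")
#   '12pt'
#   >>> CSSResolver().size_to_pt("1.5em", em_pt=12)
#   '18pt'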
def atomize(self, declarations):
for prop, value in declarations:
attr = "expand_" + prop.replace("-", "_")
try:
expand = getattr(self, attr)
except AttributeError:
yield prop, value
else:
for prop, value in expand(prop, value):
yield prop, value
SIDE_SHORTHANDS = {
1: [0, 0, 0, 0],
2: [0, 1, 0, 1],
3: [0, 1, 2, 1],
4: [0, 1, 2, 3],
}
SIDES = ("top", "right", "bottom", "left")
def _side_expander(prop_fmt):
def expand(self, prop, value):
tokens = value.split()
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
warnings.warn(
'Could not expand "{prop}: {val}"'.format(prop=prop, val=value),
CSSWarning,
)
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
return expand
expand_border_color = _side_expander("border-{:s}-color")
expand_border_style = _side_expander("border-{:s}-style")
expand_border_width = _side_expander("border-{:s}-width")
expand_margin = _side_expander("margin-{:s}")
expand_padding = _side_expander("padding-{:s}")
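# A worked example of the shorthand expansion above (input assumed for
# illustration):
#
#   >>> list(CSSResolver().expand_margin("margin", "1pt 2pt"))
#   [('margin-top', '1pt'), ('margin-right', '2pt'),
#    ('margin-bottom', '1pt'), ('margin-left', '2pt')]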
def parse(self, declarations_str):
"""Generates (prop, value) pairs from declarations
In a future version may generate parsed tokens from tinycss/tinycss2
"""
for decl in declarations_str.split(";"):
if not decl.strip():
continue
prop, sep, val = decl.partition(":")
prop = prop.strip().lower()
# TODO: don't lowercase case sensitive parts of values (strings)
val = val.strip().lower()
if sep:
yield prop, val
else:
warnings.warn(
"Ill-formatted attribute: expected a colon "
"in {decl!r}".format(decl=decl),
CSSWarning,
)

View File

@@ -0,0 +1,356 @@
"""
Module for formatting output data into CSV files.
"""
import csv as csvlib
from io import StringIO
import os
import warnings
from zipfile import ZipFile
import numpy as np
from pandas._libs import writers as libwriters
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
ABCIndexClass,
ABCMultiIndex,
ABCPeriodIndex,
)
from pandas.core.dtypes.missing import notna
from pandas.io.common import (
UnicodeWriter,
_get_handle,
_infer_compression,
get_filepath_or_buffer,
)
class CSVFormatter:
def __init__(
self,
obj,
path_or_buf=None,
sep=",",
na_rep="",
float_format=None,
cols=None,
header=True,
index=True,
index_label=None,
mode="w",
encoding=None,
compression="infer",
quoting=None,
line_terminator="\n",
chunksize=None,
quotechar='"',
date_format=None,
doublequote=True,
escapechar=None,
decimal=".",
):
self.obj = obj
if path_or_buf is None:
path_or_buf = StringIO()
self.path_or_buf, _, _, _ = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression, mode=mode
)
self.sep = sep
self.na_rep = na_rep
self.float_format = float_format
self.decimal = decimal
self.header = header
self.index = index
self.index_label = index_label
self.mode = mode
if encoding is None:
encoding = "utf-8"
self.encoding = encoding
self.compression = _infer_compression(self.path_or_buf, compression)
if quoting is None:
quoting = csvlib.QUOTE_MINIMAL
self.quoting = quoting
if quoting == csvlib.QUOTE_NONE:
# prevents crash in _csv
quotechar = None
self.quotechar = quotechar
self.doublequote = doublequote
self.escapechar = escapechar
self.line_terminator = line_terminator or os.linesep
self.date_format = date_format
self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)
# validate mi options
if self.has_mi_columns:
if cols is not None:
raise TypeError(
"cannot specify cols with a MultiIndex on the " "columns"
)
if cols is not None:
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(
na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting,
)
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
# update columns to include possible multiplicity of dupes
# and make sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols, ABCIndexClass):
cols = cols.to_native_types(
na_rep=na_rep,
float_format=float_format,
date_format=date_format,
quoting=self.quoting,
)
else:
cols = list(cols)
# save it
self.cols = cols
# preallocate data 2d list
self.blocks = self.obj._data.blocks
ncols = sum(b.shape[0] for b in self.blocks)
self.data = [None] * ncols
if chunksize is None:
chunksize = (100000 // (len(self.cols) or 1)) or 1
self.chunksize = int(chunksize)
self.data_index = obj.index
if (
isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex))
and date_format is not None
):
from pandas import Index
self.data_index = Index(
[x.strftime(date_format) if notna(x) else "" for x in self.data_index]
)
self.nlevels = getattr(self.data_index, "nlevels", 1)
if not index:
self.nlevels = 0
def save(self):
"""
Create the writer & save
"""
# GH21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, "write"):
msg = "compression has no effect when passing file-like " "object as input."
warnings.warn(msg, RuntimeWarning, stacklevel=2)
# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, "write") and self.compression == "zip"
)
if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH21241, GH21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, "write"):
f = self.path_or_buf
close = False
else:
f, handles = _get_handle(
self.path_or_buf,
self.mode,
encoding=self.encoding,
compression=self.compression,
)
close = True
try:
writer_kwargs = dict(
lineterminator=self.line_terminator,
delimiter=self.sep,
quoting=self.quoting,
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar,
)
if self.encoding == "ascii":
self.writer = csvlib.writer(f, **writer_kwargs)
else:
writer_kwargs["encoding"] = self.encoding
self.writer = UnicodeWriter(f, **writer_kwargs)
self._save()
finally:
if is_zip:
# GH17778 handles zip compression separately.
buf = f.getvalue()
if hasattr(self.path_or_buf, "write"):
self.path_or_buf.write(buf)
else:
f, handles = _get_handle(
self.path_or_buf,
self.mode,
encoding=self.encoding,
compression=self.compression,
)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
def _save_header(self):
writer = self.writer
obj = self.obj
index_label = self.index_label
cols = self.cols
has_mi_columns = self.has_mi_columns
header = self.header
encoded_labels = []
has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass))
if not (has_aliases or self.header):
return
if has_aliases:
if len(header) != len(cols):
raise ValueError(
(
"Writing {ncols} cols but got {nalias} "
"aliases".format(ncols=len(cols), nalias=len(header))
)
)
else:
write_cols = header
else:
write_cols = cols
if self.index:
# should write something for index label
if index_label is not False:
if index_label is None:
if isinstance(obj.index, ABCMultiIndex):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
name = ""
index_label.append(name)
else:
index_label = obj.index.name
if index_label is None:
index_label = [""]
else:
index_label = [index_label]
elif not isinstance(
index_label, (list, tuple, np.ndarray, ABCIndexClass)
):
# given a string for a DF with Index
index_label = [index_label]
encoded_labels = list(index_label)
else:
encoded_labels = []
if not has_mi_columns or has_aliases:
encoded_labels += list(write_cols)
writer.writerow(encoded_labels)
else:
# write out the mi
columns = obj.columns
# write out the names for each level, then ALL of the values for
# each level
for i in range(columns.nlevels):
# we need at least 1 index column to write our col names
col_line = []
if self.index:
# name is the first column
col_line.append(columns.names[i])
if isinstance(index_label, list) and len(index_label) > 1:
col_line.extend([""] * (len(index_label) - 1))
col_line.extend(columns._get_level_values(i))
writer.writerow(col_line)
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
if encoded_labels and set(encoded_labels) != {""}:
encoded_labels.extend([""] * len(columns))
writer.writerow(encoded_labels)
def _save(self):
self._save_header()
nrows = len(self.data_index)
# write in chunksize bites
chunksize = self.chunksize
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
self._save_chunk(start_i, end_i)
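# A worked example of the default chunking above, for an assumed frame with
# 5 columns and 200,000 rows:
#
#   chunksize = 100000 // 5 = 20000 rows per chunk
#   chunks    = int(200000 / 20000) + 1 = 11, but the final slice starts at
#               row 200000 and is skipped by the ``start_i >= end_i`` check.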
def _save_chunk(self, start_i, end_i):
data_index = self.data_index
# create the data for a chunk
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
d = b.to_native_types(
slicer=slicer,
na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting,
)
for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col
ix = data_index.to_native_types(
slicer=slicer,
na_rep=self.na_rep,
float_format=self.float_format,
decimal=self.decimal,
date_format=self.date_format,
quoting=self.quoting,
)
libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)

View File

@@ -0,0 +1,742 @@
"""Utilities for conversion to writer-agnostic Excel representation
"""
from functools import reduce
import itertools
import re
import warnings
import numpy as np
from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex
from pandas import Index
import pandas.core.common as com
from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing
class ExcelCell:
__fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
__slots__ = __fields__
def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None):
self.row = row
self.col = col
self.val = val
self.style = style
self.mergestart = mergestart
self.mergeend = mergeend
class CSSToExcelConverter:
"""A callable for converting CSS declarations to ExcelWriter styles
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
focusing on font styling, backgrounds, borders and alignment.
Operates by first computing CSS styles in a fairly generic
way (see :meth:`compute_css`) then determining Excel style
properties from CSS properties (see :meth:`build_xlstyle`).
Parameters
----------
inherited : str, optional
CSS declarations understood to be the containing scope for the
CSS processed by :meth:`__call__`.
"""
# NB: Most of the methods here could be classmethods, as only __init__
# and __call__ make use of instance attributes. We leave them as
# instancemethods so that users can easily experiment with extensions
# without monkey-patching.
def __init__(self, inherited=None):
if inherited is not None:
inherited = self.compute_css(inherited)
self.inherited = inherited
compute_css = CSSResolver()
def __call__(self, declarations_str):
"""Convert CSS declarations to ExcelWriter style
Parameters
----------
declarations_str : str
A string of CSS declarations,
e.g. "font-weight: bold; background: blue"
Returns
-------
xlstyle : dict
A style as interpreted by ExcelWriter when found in
ExcelCell.style.
"""
# TODO: memoize?
properties = self.compute_css(declarations_str, self.inherited)
return self.build_xlstyle(properties)
def build_xlstyle(self, props):
out = {
"alignment": self.build_alignment(props),
"border": self.build_border(props),
"fill": self.build_fill(props),
"font": self.build_font(props),
"number_format": self.build_number_format(props),
}
# TODO: handle cell width and height: needs support in pandas.io.excel
def remove_none(d):
"""Remove key where value is None, through nested dicts"""
for k, v in list(d.items()):
if v is None:
del d[k]
elif isinstance(v, dict):
remove_none(v)
if not v:
del d[k]
remove_none(out)
return out
VERTICAL_MAP = {
"top": "top",
"text-top": "top",
"middle": "center",
"baseline": "bottom",
"bottom": "bottom",
"text-bottom": "bottom",
# OpenXML also has 'justify', 'distributed'
}
def build_alignment(self, props):
# TODO: text-indent, padding-left -> alignment.indent
return {
"horizontal": props.get("text-align"),
"vertical": self.VERTICAL_MAP.get(props.get("vertical-align")),
"wrap_text": (
None
if props.get("white-space") is None
else props["white-space"] not in ("nowrap", "pre", "pre-line")
),
}
def build_border(self, props):
return {
side: {
"style": self._border_style(
props.get("border-{side}-style".format(side=side)),
props.get("border-{side}-width".format(side=side)),
),
"color": self.color_to_excel(
props.get("border-{side}-color".format(side=side))
),
}
for side in ["top", "right", "bottom", "left"]
}
def _border_style(self, style, width):
# convert styles and widths to openxml, one of:
# 'dashDot'
# 'dashDotDot'
# 'dashed'
# 'dotted'
# 'double'
# 'hair'
# 'medium'
# 'mediumDashDot'
# 'mediumDashDotDot'
# 'mediumDashed'
# 'slantDashDot'
# 'thick'
# 'thin'
if width is None and style is None:
return None
if style == "none" or style == "hidden":
return None
if width is None:
width = "2pt"
width = float(width[:-2])
if width < 1e-5:
return None
elif width < 1.3:
width_name = "thin"
elif width < 2.8:
width_name = "medium"
else:
width_name = "thick"
if style in (None, "groove", "ridge", "inset", "outset"):
# not handled
style = "solid"
if style == "double":
return "double"
if style == "solid":
return width_name
if style == "dotted":
if width_name in ("hair", "thin"):
return "dotted"
return "mediumDashDotDot"
if style == "dashed":
if width_name in ("hair", "thin"):
return "dashed"
return "mediumDashed"
def build_fill(self, props):
# TODO: perhaps allow for special properties
# -excel-pattern-bgcolor and -excel-pattern-type
fill_color = props.get("background-color")
if fill_color not in (None, "transparent", "none"):
return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
BOLD_MAP = {
"bold": True,
"bolder": True,
"600": True,
"700": True,
"800": True,
"900": True,
"normal": False,
"lighter": False,
"100": False,
"200": False,
"300": False,
"400": False,
"500": False,
}
ITALIC_MAP = {"normal": False, "italic": True, "oblique": True}
def build_font(self, props):
size = props.get("font-size")
if size is not None:
assert size.endswith("pt")
size = float(size[:-2])
font_names_tmp = re.findall(
r"""(?x)
(
"(?:[^"]|\\")+"
|
'(?:[^']|\\')+'
|
[^'",]+
)(?=,|\s*$)
""",
props.get("font-family", ""),
)
font_names = []
for name in font_names_tmp:
if name[:1] == '"':
name = name[1:-1].replace('\\"', '"')
elif name[:1] == "'":
name = name[1:-1].replace("\\'", "'")
else:
name = name.strip()
if name:
font_names.append(name)
family = None
for name in font_names:
if name == "serif":
family = 1 # roman
break
elif name == "sans-serif":
family = 2 # swiss
break
elif name == "cursive":
family = 4 # script
break
elif name == "fantasy":
family = 5 # decorative
break
decoration = props.get("text-decoration")
if decoration is not None:
decoration = decoration.split()
else:
decoration = ()
return {
"name": font_names[0] if font_names else None,
"family": family,
"size": size,
"bold": self.BOLD_MAP.get(props.get("font-weight")),
"italic": self.ITALIC_MAP.get(props.get("font-style")),
"underline": ("single" if "underline" in decoration else None),
"strike": ("line-through" in decoration) or None,
"color": self.color_to_excel(props.get("color")),
# shadow if nonzero digit before shadow color
"shadow": (
bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
if "text-shadow" in props
else None
),
# 'vertAlign':,
# 'charset': ,
# 'scheme': ,
# 'outline': ,
# 'condense': ,
}
NAMED_COLORS = {
"maroon": "800000",
"brown": "A52A2A",
"red": "FF0000",
"pink": "FFC0CB",
"orange": "FFA500",
"yellow": "FFFF00",
"olive": "808000",
"green": "008000",
"purple": "800080",
"fuchsia": "FF00FF",
"lime": "00FF00",
"teal": "008080",
"aqua": "00FFFF",
"blue": "0000FF",
"navy": "000080",
"black": "000000",
"gray": "808080",
"grey": "808080",
"silver": "C0C0C0",
"white": "FFFFFF",
}
def color_to_excel(self, val):
if val is None:
return None
if val.startswith("#") and len(val) == 7:
return val[1:].upper()
if val.startswith("#") and len(val) == 4:
return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
try:
return self.NAMED_COLORS[val]
except KeyError:
warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning)
def build_number_format(self, props):
return {"format_code": props.get("number-format")}
class ExcelFormatter:
"""
Class for formatting a DataFrame to a list of ExcelCells.
Parameters
----------
df : DataFrame or Styler
na_rep: na representation
float_format : string, default None
Format string for floating point numbers
cols : sequence, optional
Columns to write
header : boolean or list of string, default True
Write out column names. If a list of string is given it is
assumed to be aliases for the column names
index : boolean, default True
output row names (index)
index_label : string or sequence, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
merge_cells : boolean, default False
Format MultiIndex and Hierarchical Rows as merged cells.
inf_rep : string, default `'inf'`
representation for np.inf values (which aren't representable in Excel)
A `'-'` sign will be added in front of -inf.
style_converter : callable, optional
This translates Styler styles (CSS) into ExcelWriter styles.
Defaults to ``CSSToExcelConverter()``.
It should have signature css_declarations string -> excel style.
This is only called for body cells.
"""
max_rows = 2 ** 20
max_cols = 2 ** 14
def __init__(
self,
df,
na_rep="",
float_format=None,
cols=None,
header=True,
index=True,
index_label=None,
merge_cells=False,
inf_rep="inf",
style_converter=None,
):
self.rowcounter = 0
self.na_rep = na_rep
if hasattr(df, "render"):
self.styler = df
df = df.data
if style_converter is None:
style_converter = CSSToExcelConverter()
self.style_converter = style_converter
else:
self.styler = None
self.df = df
if cols is not None:
# all missing, raise
if not len(Index(cols) & df.columns):
raise KeyError("passes columns are not ALL present dataframe")
# deprecatedin gh-17295
# 1 missing is ok (for now)
if len(Index(cols) & df.columns) != len(cols):
warnings.warn(
"Not all names specified in 'columns' are found; "
"this will raise a KeyError in the future",
FutureWarning,
)
self.df = df.reindex(columns=cols)
self.columns = self.df.columns
self.float_format = float_format
self.index = index
self.index_label = index_label
self.header = header
self.merge_cells = merge_cells
self.inf_rep = inf_rep
@property
def header_style(self):
return {
"font": {"bold": True},
"borders": {
"top": "thin",
"right": "thin",
"bottom": "thin",
"left": "thin",
},
"alignment": {"horizontal": "center", "vertical": "top"},
}
def _format_value(self, val):
if is_scalar(val) and missing.isna(val):
val = self.na_rep
elif is_float(val):
if missing.isposinf_scalar(val):
val = self.inf_rep
elif missing.isneginf_scalar(val):
val = "-{inf}".format(inf=self.inf_rep)
elif self.float_format is not None:
val = float(self.float_format % val)
if getattr(val, "tzinfo", None) is not None:
raise ValueError(
"Excel does not support datetimes with "
"timezones. Please ensure that datetimes "
"are timezone unaware before writing to Excel."
)
return val
def _format_header_mi(self):
if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError(
"Writing to Excel with MultiIndex"
" columns and no index "
"('index'=False) is not yet "
"implemented."
)
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if not (has_aliases or self.header):
return
columns = self.columns
level_strs = columns.format(
sparsify=self.merge_cells, adjoin=False, names=False
)
level_lengths = get_level_lengths(level_strs)
coloffset = 0
lnum = 0
if self.index and isinstance(self.df.index, ABCMultiIndex):
coloffset = len(self.df.index[0]) - 1
if self.merge_cells:
# Format multi-index as a merged cells.
for lnum in range(len(level_lengths)):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)
for lnum, (spans, levels, level_codes) in enumerate(
zip(level_lengths, columns.levels, columns.codes)
):
values = levels.take(level_codes)
for i in spans:
if spans[i] > 1:
yield ExcelCell(
lnum,
coloffset + i + 1,
values[i],
self.header_style,
lnum,
coloffset + i + spans[i],
)
else:
yield ExcelCell(
lnum, coloffset + i + 1, values[i], self.header_style
)
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
v = ".".join(map(pprint_thing, values))
yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)
self.rowcounter = lnum
def _format_header_regular(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
coloffset = 0
if self.index:
coloffset = 1
if isinstance(self.df.index, ABCMultiIndex):
coloffset = len(self.df.index[0])
colnames = self.columns
if has_aliases:
if len(self.header) != len(self.columns):
raise ValueError(
"Writing {cols} cols but got {alias} "
"aliases".format(cols=len(self.columns), alias=len(self.header))
)
else:
colnames = self.header
for colindex, colname in enumerate(colnames):
yield ExcelCell(
self.rowcounter, colindex + coloffset, colname, self.header_style
)
def _format_header(self):
if isinstance(self.columns, ABCMultiIndex):
gen = self._format_header_mi()
else:
gen = self._format_header_regular()
gen2 = ()
if self.df.index.names:
row = [x if x is not None else "" for x in self.df.index.names] + [
""
] * len(self.columns)
if reduce(lambda x, y: x and y, map(lambda x: x != "", row)):
gen2 = (
ExcelCell(self.rowcounter, colindex, val, self.header_style)
for colindex, val in enumerate(row)
)
self.rowcounter += 1
return itertools.chain(gen, gen2)
def _format_body(self):
if isinstance(self.df.index, ABCMultiIndex):
return self._format_hierarchical_rows()
else:
return self._format_regular_rows()
def _format_regular_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
# output index and index_label?
if self.index:
# check aliases
# if list only take first as this is not a MultiIndex
if self.index_label and isinstance(
self.index_label, (list, tuple, np.ndarray, Index)
):
index_label = self.index_label[0]
# if string good to go
elif self.index_label and isinstance(self.index_label, str):
index_label = self.index_label
else:
index_label = self.df.index.names[0]
if isinstance(self.columns, ABCMultiIndex):
self.rowcounter += 1
if index_label and self.header is not False:
yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
# write index_values
index_values = self.df.index
if isinstance(self.df.index, ABCPeriodIndex):
index_values = self.df.index.to_timestamp()
for idx, idxval in enumerate(index_values):
yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style)
coloffset = 1
else:
coloffset = 0
for cell in self._generate_body(coloffset):
yield cell
def _format_hierarchical_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if has_aliases or self.header:
self.rowcounter += 1
gcolidx = 0
if self.index:
index_labels = self.df.index.names
# check for aliases
if self.index_label and isinstance(
self.index_label, (list, tuple, np.ndarray, Index)
):
index_labels = self.index_label
# MultiIndex columns require an extra row
# with index names (blank if None) for
# unambiguous round-trip, unless not merging,
# in which case the names all go on one row (see gh-11328)
if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
self.rowcounter += 1
# if index labels are not empty go ahead and dump
if com._any_not_none(*index_labels) and self.header is not False:
for cidx, name in enumerate(index_labels):
yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
if self.merge_cells:
# Format hierarchical rows as merged cells.
level_strs = self.df.index.format(
sparsify=True, adjoin=False, names=False
)
level_lengths = get_level_lengths(level_strs)
for spans, levels, level_codes in zip(
level_lengths, self.df.index.levels, self.df.index.codes
):
values = levels.take(
level_codes, allow_fill=levels._can_hold_na, fill_value=True
)
for i in spans:
if spans[i] > 1:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
values[i],
self.header_style,
self.rowcounter + i + spans[i] - 1,
gcolidx,
)
else:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
values[i],
self.header_style,
)
gcolidx += 1
else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield ExcelCell(
self.rowcounter + idx,
gcolidx,
indexcolval,
self.header_style,
)
gcolidx += 1
for cell in self._generate_body(gcolidx):
yield cell
def _generate_body(self, coloffset):
if self.styler is None:
styles = None
else:
styles = self.styler._compute().ctx
if not styles:
styles = None
xlstyle = None
# Write the body of the frame data series by series.
for colidx in range(len(self.columns)):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
if styles is not None:
xlstyle = self.style_converter(";".join(styles[i, colidx]))
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)
def get_formatted_cells(self):
for cell in itertools.chain(self._format_header(), self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
def write(
self,
writer,
sheet_name="Sheet1",
startrow=0,
startcol=0,
freeze_panes=None,
engine=None,
):
"""
Parameters
----------
writer : string or ExcelWriter object
File path or existing ExcelWriter
sheet_name : string, default 'Sheet1'
Name of sheet which will contain DataFrame
startrow : int, default 0
Upper left cell row to dump data frame
startcol : int, default 0
Upper left cell column to dump data frame
freeze_panes : tuple of integer (length 2), default None
Specifies the one-based bottommost row and rightmost column that
is to be frozen
engine : string, default None
write engine to use if writer is a path - you can also set this
via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
and ``io.excel.xlsm.writer``.
"""
from pandas.io.excel import ExcelWriter
from pandas.io.common import _stringify_path
num_rows, num_cols = self.df.shape
if num_rows > self.max_rows or num_cols > self.max_cols:
raise ValueError(
"This sheet is too large! Your sheet size is: "
+ "{}, {} ".format(num_rows, num_cols)
+ "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols)
)
if isinstance(writer, ExcelWriter):
need_save = False
else:
writer = ExcelWriter(_stringify_path(writer), engine=engine)
need_save = True
formatted_cells = self.get_formatted_cells()
writer.write_cells(
formatted_cells,
sheet_name,
startrow=startrow,
startcol=startcol,
freeze_panes=freeze_panes,
)
if need_save:
writer.save()
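# Illustrative usage sketch (not part of pandas): DataFrame.to_excel drives this
# formatter internally; the direct call below assumes the enclosing class is
# ExcelFormatter and that an Excel writer engine (e.g. openpyxl) is installed.
if __name__ == "__main__":
    import pandas as pd

    demo = pd.DataFrame({"a": [1.0, float("inf")], "b": ["x", None]})
    # Public route: pandas builds the formatter for you.
    demo.to_excel("demo_public.xlsx", na_rep="NA", inf_rep="INF")
    # Direct route (internal API, shown only for illustration).
    ExcelFormatter(demo, na_rep="NA", float_format="%.2f").write(
        "demo_direct.xlsx", sheet_name="Sheet1"
    )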

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,608 @@
"""
Module for formatting output data in HTML.
"""
from collections import OrderedDict
from textwrap import dedent
from typing import Dict, List, Optional, Tuple, Union
from pandas._config import get_option
from pandas.core.dtypes.generic import ABCIndex, ABCMultiIndex
from pandas import option_context
from pandas.io.common import _is_url
from pandas.io.formats.format import (
DataFrameFormatter,
TableFormatter,
get_level_lengths,
)
from pandas.io.formats.printing import pprint_thing
class HTMLFormatter(TableFormatter):
"""
Internal class for formatting output data in html.
This class is intended for shared functionality between
DataFrame.to_html() and DataFrame._repr_html_().
Any logic in common with other output formatting methods
should ideally be inherited from classes in format.py
and this class should be responsible only for producing html markup.
"""
indent_delta = 2
def __init__(
self,
formatter: DataFrameFormatter,
classes: Optional[Union[str, List, Tuple]] = None,
border: Optional[bool] = None,
) -> None:
self.fmt = formatter
self.classes = classes
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements = [] # type: List[str]
self.bold_rows = self.fmt.kwds.get("bold_rows", False)
self.escape = self.fmt.kwds.get("escape", True)
self.show_dimensions = self.fmt.show_dimensions
if border is None:
border = get_option("display.html.border")
self.border = border
self.table_id = self.fmt.table_id
self.render_links = self.fmt.render_links
if isinstance(self.fmt.col_space, int):
self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space)
@property
def show_row_idx_names(self) -> bool:
return self.fmt.show_row_idx_names
@property
def show_col_idx_names(self) -> bool:
return self.fmt.show_col_idx_names
@property
def row_levels(self) -> int:
if self.fmt.index:
# showing (row) index
return self.frame.index.nlevels
elif self.show_col_idx_names:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# If the row index is not displayed a column of
# blank cells need to be included before the DataFrame values.
return 1
# not showing (row) index
return 0
def _get_columns_formatted_values(self) -> ABCIndex:
return self.columns
@property
def is_truncated(self) -> bool:
return self.fmt.is_truncated
@property
def ncols(self) -> int:
return len(self.fmt.tr_frame.columns)
def write(self, s: str, indent: int = 0) -> None:
rs = pprint_thing(s)
self.elements.append(" " * indent + rs)
def write_th(
self, s: str, header: bool = False, indent: int = 0, tags: Optional[str] = None
) -> None:
"""
Method for writing a formatted <th> cell.
If col_space is set on the formatter then that is used for
the value of min-width.
Parameters
----------
s : object
The data to be written inside the cell.
header : boolean, default False
Set to True if the <th> is for use inside <thead>. This will
cause min-width to be set if there is one.
indent : int, default 0
The indentation level of the cell.
tags : string, default None
Tags to include in the cell.
Returns
-------
None. The formatted <th> cell is appended to ``self.elements``.
"""
if header and self.fmt.col_space is not None:
tags = tags or ""
tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space)
self._write_cell(s, kind="th", indent=indent, tags=tags)
def write_td(self, s: str, indent: int = 0, tags: Optional[str] = None) -> None:
self._write_cell(s, kind="td", indent=indent, tags=tags)
def _write_cell(
self, s: str, kind: str = "td", indent: int = 0, tags: Optional[str] = None
) -> None:
if tags is not None:
start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags)
else:
start_tag = "<{kind}>".format(kind=kind)
if self.escape:
# escape & first to prevent double escaping of &
esc = OrderedDict(
[("&", r"&amp;"), ("<", r"&lt;"), (">", r"&gt;")]
) # type: Union[OrderedDict[str, str], Dict]
else:
esc = {}
rs = pprint_thing(s, escape_chars=esc).strip()
if self.render_links and _is_url(rs):
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped)
end_a = "</a>"
else:
end_a = ""
self.write(
"{start}{rs}{end_a}</{kind}>".format(
start=start_tag, rs=rs, end_a=end_a, kind=kind
),
indent,
)
def write_tr(
self,
line: List[str],
indent: int = 0,
indent_delta: int = 0,
header: bool = False,
align: Optional[str] = None,
tags: Optional[Dict[int, str]] = None,
nindex_levels: int = 0,
) -> None:
if tags is None:
tags = {}
if align is None:
self.write("<tr>", indent)
else:
self.write('<tr style="text-align: {align};">'.format(align=align), indent)
indent += indent_delta
for i, s in enumerate(line):
val_tag = tags.get(i, None)
if header or (self.bold_rows and i < nindex_levels):
self.write_th(s, indent=indent, header=header, tags=val_tag)
else:
self.write_td(s, indent, tags=val_tag)
indent -= indent_delta
self.write("</tr>", indent)
def render(self) -> List[str]:
self._write_table()
if self.should_show_dimensions:
by = chr(215) # ×
self.write(
"<p>{rows} rows {by} {cols} columns</p>".format(
rows=len(self.frame), by=by, cols=len(self.frame.columns)
)
)
return self.elements
def _write_table(self, indent: int = 0) -> None:
_classes = ["dataframe"] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
_classes.append("tex2jax_ignore")
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
raise TypeError(
"classes must be a string, list, or tuple, "
"not {typ}".format(typ=type(self.classes))
)
_classes.extend(self.classes)
if self.table_id is None:
id_section = ""
else:
id_section = ' id="{table_id}"'.format(table_id=self.table_id)
self.write(
'<table border="{border}" class="{cls}"{id_section}>'.format(
border=self.border, cls=" ".join(_classes), id_section=id_section
),
indent,
)
if self.fmt.header or self.show_row_idx_names:
self._write_header(indent + self.indent_delta)
self._write_body(indent + self.indent_delta)
self.write("</table>", indent)
def _write_col_header(self, indent: int) -> None:
truncate_h = self.fmt.truncate_h
if isinstance(self.columns, ABCMultiIndex):
template = 'colspan="{span:d}" halign="left"'
if self.fmt.sparsify:
# GH3547
sentinel = object()
else:
sentinel = False
levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
if truncate_h:
# modify the header lines
ins_col = self.fmt.tr_col_num
if self.fmt.sparsify:
recs_new = {}
# Increment tags after ... col.
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
elif tag + span > ins_col:
recs_new[tag] = span + 1
if lnum == inner_lvl:
values = (
values[:ins_col] + ("...",) + values[ins_col:]
)
else:
# sparse col headers do not receive a ...
values = (
values[:ins_col]
+ (values[ins_col - 1],)
+ values[ins_col:]
)
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
values = values[:ins_col] + ("...",) + values[ins_col:]
records = recs_new
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
records[ins_col] = 1
else:
recs_new = {}
for tag, span in list(records.items()):
if tag >= ins_col:
recs_new[tag + 1] = span
else:
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
values = values[:ins_col] + ["..."] + values[ins_col:]
# see gh-22579
# Column Offset Bug with to_html(index=False) with
# MultiIndex Columns and Index.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code
# block below for standard columns index.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class and create a
# _get_formatted_column_labels function for code
# parity with DataFrameFormatter class.
if self.fmt.show_index_names:
name = self.columns.names[lnum]
row.append(pprint_thing(name or ""))
else:
row.append("")
tags = {}
j = len(row)
for i, v in enumerate(values):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
continue
j += 1
row.append(v)
self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
else:
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code block
# above for columns MultiIndex.
row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
# index names.
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class.
if self.fmt.show_index_names:
row.append(self.columns.name or "")
else:
row.append("")
row.extend(self._get_columns_formatted_values())
align = self.fmt.justify
if truncate_h:
ins_col = self.row_levels + self.fmt.tr_col_num
row.insert(ins_col, "...")
self.write_tr(row, indent, self.indent_delta, header=True, align=align)
def _write_row_header(self, indent: int) -> None:
truncate_h = self.fmt.truncate_h
row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
self.ncols + (1 if truncate_h else 0)
)
self.write_tr(row, indent, self.indent_delta, header=True)
def _write_header(self, indent: int) -> None:
self.write("<thead>", indent)
if self.fmt.header:
self._write_col_header(indent + self.indent_delta)
if self.show_row_idx_names:
self._write_row_header(indent + self.indent_delta)
self.write("</thead>", indent)
def _get_formatted_values(self) -> Dict[int, List[str]]:
with option_context("display.max_colwidth", 999999):
fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
return fmt_values
def _write_body(self, indent: int) -> None:
self.write("<tbody>", indent)
fmt_values = self._get_formatted_values()
# write values
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
else:
self._write_regular_rows(fmt_values, indent + self.indent_delta)
self.write("</tbody>", indent)
def _write_regular_rows(
self, fmt_values: Dict[int, List[str]], indent: int
) -> None:
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
nrows = len(self.fmt.tr_frame)
if self.fmt.index:
fmt = self.fmt._get_formatter("__index__")
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
index_values = self.fmt.tr_frame.index.format()
row = [] # type: List[str]
for i in range(nrows):
if truncate_v and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
row = []
if self.fmt.index:
row.append(index_values[i])
# see gh-22579
# Column misalignment also occurs for
# a standard index when the columns index is named.
# Add blank cell before data cells.
elif self.show_col_idx_names:
row.append("")
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
dot_col_ix = self.fmt.tr_col_num + self.row_levels
row.insert(dot_col_ix, "...")
self.write_tr(
row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
)
def _write_hierarchical_rows(
self, fmt_values: Dict[int, List[str]], indent: int
) -> None:
template = 'rowspan="{span}" valign="top"'
truncate_h = self.fmt.truncate_h
truncate_v = self.fmt.truncate_v
frame = self.fmt.tr_frame
nrows = len(frame)
idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
idx_values = list(zip(*idx_values))
if self.fmt.sparsify:
# GH3547
sentinel = object()
levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
if truncate_v:
# Insert ... row and adjust idx_values and
# level_lengths to take this into account.
ins_row = self.fmt.tr_row_num
inserted = False
for lnum, records in enumerate(level_lengths):
rec_new = {}
for tag, span in list(records.items()):
if tag >= ins_row:
rec_new[tag + 1] = span
elif tag + span > ins_row:
rec_new[tag] = span + 1
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
dot_row[-1] = "..."
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
# If ins_row lies between tags, all cols idx cols
# receive ...
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
idx_values.insert(
ins_row, tuple(["..."] * len(level_lengths))
)
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in range(len(fmt_values)):
fmt_values[ix_col].insert(ins_row, "...")
nrows += 1
for i in range(nrows):
row = []
tags = {}
sparse_offset = 0
j = 0
for records, v in zip(level_lengths, idx_values[i]):
if i in records:
if records[i] > 1:
tags[j] = template.format(span=records[i])
else:
sparse_offset += 1
continue
j += 1
row.append(v)
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
row.insert(
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
)
self.write_tr(
row,
indent,
self.indent_delta,
tags=tags,
nindex_levels=len(levels) - sparse_offset,
)
else:
row = []
for i in range(len(frame)):
if truncate_v and i == (self.fmt.tr_row_num):
str_sep_row = ["..."] * len(row)
self.write_tr(
str_sep_row,
indent,
self.indent_delta,
tags=None,
nindex_levels=self.row_levels,
)
idx_values = list(
zip(*frame.index.format(sparsify=False, adjoin=False, names=False))
)
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
self.write_tr(
row,
indent,
self.indent_delta,
tags=None,
nindex_levels=frame.index.nlevels,
)
class NotebookFormatter(HTMLFormatter):
"""
Internal class for formatting output data in html for display in Jupyter
Notebooks. This class is intended for functionality specific to
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
"""
def _get_formatted_values(self) -> Dict[int, List[str]]:
return {i: self.fmt._format_col(i) for i in range(self.ncols)}
def _get_columns_formatted_values(self) -> List[str]:
return self.columns.format()
def write_style(self) -> None:
# We use the "scoped" attribute here so that the desired
# style properties for the data frame are not then applied
# throughout the entire notebook.
template_first = """\
<style scoped>"""
template_last = """\
</style>"""
template_select = """\
.dataframe %s {
%s: %s;
}"""
element_props = [
("tbody tr th:only-of-type", "vertical-align", "middle"),
("tbody tr th", "vertical-align", "top"),
]
if isinstance(self.columns, ABCMultiIndex):
element_props.append(("thead tr th", "text-align", "left"))
if self.show_row_idx_names:
element_props.append(
("thead tr:last-of-type th", "text-align", "right")
)
else:
element_props.append(("thead th", "text-align", "right"))
template_mid = "\n\n".join(map(lambda t: template_select % t, element_props))
template = dedent("\n".join((template_first, template_mid, template_last)))
self.write(template)
def render(self) -> List[str]:
self.write("<div>")
self.write_style()
super().render()
self.write("</div>")
return self.elements
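# Illustrative sketch (not part of pandas): these classes back the public
# DataFrame.to_html() / DataFrame._repr_html_() methods; parameter values
# below are arbitrary examples.
if __name__ == "__main__":
    import pandas as pd

    demo = pd.DataFrame({"x": [1, 2], "y": ["a", "b"]})
    html = demo.to_html(classes="my-table", border=1, bold_rows=True, escape=True)
    print(html[:200])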

View File

@@ -0,0 +1,265 @@
"""
Module for formatting output data in Latex.
"""
import numpy as np
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas.io.formats.format import TableFormatter
class LatexFormatter(TableFormatter):
""" Used to render a DataFrame to a LaTeX tabular/longtable environment
output.
Parameters
----------
formatter : `DataFrameFormatter`
column_format : str, default None
The columns format as specified in `LaTeX table format
<https://en.wikibooks.org/wiki/LaTeX/Tables>`__, e.g. 'rcl' for 3 columns
longtable : boolean, default False
Use a longtable environment instead of tabular.
See Also
--------
HTMLFormatter
"""
def __init__(
self,
formatter,
column_format=None,
longtable=False,
multicolumn=False,
multicolumn_format=None,
multirow=False,
):
self.fmt = formatter
self.frame = self.fmt.frame
self.bold_rows = self.fmt.kwds.get("bold_rows", False)
self.column_format = column_format
self.longtable = longtable
self.multicolumn = multicolumn
self.multicolumn_format = multicolumn_format
self.multirow = multirow
def write_result(self, buf):
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
# string representation of the columns
if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format(
name=type(self.frame).__name__,
col=self.frame.columns,
idx=self.frame.index,
)
strcols = [[info_line]]
else:
strcols = self.fmt._to_str_columns()
def get_col_type(dtype):
if issubclass(dtype.type, np.number):
return "r"
else:
return "l"
# re-establish the MultiIndex that has been joined by _to_str_columns
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
out = self.frame.index.format(
adjoin=False,
sparsify=self.fmt.sparsify,
names=self.fmt.has_index_names,
na_rep=self.fmt.na_rep,
)
# index.format will sparsify repeated entries with empty strings
# so pad these with some empty space
def pad_empties(x):
for pad in reversed(x):
if pad:
break
return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
out = (pad_empties(i) for i in out)
# Add empty spaces for each column level
clevels = self.frame.columns.nlevels
out = [[" " * len(i[-1])] * clevels + i for i in out]
# Add the column names to the last index column
cnames = self.frame.columns.names
if any(cnames):
new_names = [i if i else "{}" for i in cnames]
out[self.frame.index.nlevels - 1][:clevels] = new_names
# Get rid of old multiindex column and add new ones
strcols = out + strcols[1:]
column_format = self.column_format
if column_format is None:
dtypes = self.frame.dtypes._values
column_format = "".join(map(get_col_type, dtypes))
if self.fmt.index:
index_format = "l" * self.frame.index.nlevels
column_format = index_format + column_format
elif not isinstance(column_format, str): # pragma: no cover
raise AssertionError(
"column_format must be str or unicode, "
"not {typ}".format(typ=type(column_format))
)
if not self.longtable:
buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format))
buf.write("\\toprule\n")
else:
buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format))
buf.write("\\toprule\n")
ilevels = self.frame.index.nlevels
clevels = self.frame.columns.nlevels
nlevels = clevels
if self.fmt.has_index_names and self.fmt.show_index_names:
nlevels += 1
strrows = list(zip(*strcols))
self.clinebuf = []
for i, row in enumerate(strrows):
if i == nlevels and self.fmt.header:
buf.write("\\midrule\n") # End of header
if self.longtable:
buf.write("\\endhead\n")
buf.write("\\midrule\n")
buf.write(
"\\multicolumn{{{n}}}{{r}}{{{{Continued on next "
"page}}}} \\\\\n".format(n=len(row))
)
buf.write("\\midrule\n")
buf.write("\\endfoot\n\n")
buf.write("\\bottomrule\n")
buf.write("\\endlastfoot\n")
if self.fmt.kwds.get("escape", True):
# escape backslashes first
crow = [
(
x.replace("\\", "\\textbackslash ")
.replace("_", "\\_")
.replace("%", "\\%")
.replace("$", "\\$")
.replace("#", "\\#")
.replace("{", "\\{")
.replace("}", "\\}")
.replace("~", "\\textasciitilde ")
.replace("^", "\\textasciicircum ")
.replace("&", "\\&")
if (x and x != "{}")
else "{}"
)
for x in row
]
else:
crow = [x if x else "{}" for x in row]
if self.bold_rows and self.fmt.index:
# bold row labels
crow = [
"\\textbf{{{x}}}".format(x=x)
if j < ilevels and x.strip() not in ["", "{}"]
else x
for j, x in enumerate(crow)
]
if i < clevels and self.fmt.header and self.multicolumn:
# sum up columns to multicolumns
crow = self._format_multicolumn(crow, ilevels)
if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1:
# sum up rows to multirows
crow = self._format_multirow(crow, ilevels, i, strrows)
buf.write(" & ".join(crow))
buf.write(" \\\\\n")
if self.multirow and i < len(strrows) - 1:
self._print_cline(buf, i, len(strcols))
if not self.longtable:
buf.write("\\bottomrule\n")
buf.write("\\end{tabular}\n")
else:
buf.write("\\end{longtable}\n")
def _format_multicolumn(self, row, ilevels):
r"""
Combine columns belonging to a group to a single multicolumn entry
according to self.multicolumn_format
e.g.:
a &  &  & b & c &
will become
\multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
"""
row2 = list(row[:ilevels])
ncol = 1
coltext = ""
def append_col():
# write multicolumn if needed
if ncol > 1:
row2.append(
"\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format(
ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip()
)
)
# don't modify where not needed
else:
row2.append(coltext)
for c in row[ilevels:]:
# if next col has text, write the previous
if c.strip():
if coltext:
append_col()
coltext = c
ncol = 1
# if not, add it to the previous multicolumn
else:
ncol += 1
# write last column name
if coltext:
append_col()
return row2
def _format_multirow(self, row, ilevels, i, rows):
r"""
Check following rows, whether row should be a multirow
e.g.:     becomes:
a & 0 &   \multirow{2}{*}{a} & 0 &
  & 1 &                      & 1 &
b & 0 &   \cline{1-2}
          b & 0 &
"""
for j in range(ilevels):
if row[j].strip():
nrow = 1
for r in rows[i + 1 :]:
if not r[j].strip():
nrow += 1
else:
break
if nrow > 1:
# overwrite non-multirow entry
row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format(
nrow=nrow, row=row[j].strip()
)
# save when to end the current block with \cline
self.clinebuf.append([i + nrow - 1, j + 1])
return row
def _print_cline(self, buf, i, icol):
"""
Print clines after multirow-blocks are finished
"""
for cl in self.clinebuf:
if cl[0] == i:
buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol))
# remove entries that have been written to buffer
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
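# Illustrative sketch (not part of pandas): LatexFormatter backs the public
# DataFrame.to_latex(); multirow maps onto the _format_multirow helper above.
if __name__ == "__main__":
    import pandas as pd

    idx = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["grp", "n"])
    demo = pd.DataFrame({"v": [1, 2, 3, 4]}, index=idx)
    print(demo.to_latex(multirow=True))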

View File

@@ -0,0 +1,517 @@
"""
printing tools
"""
import sys
from pandas._config import get_option
from pandas.core.dtypes.inference import is_sequence
def adjoin(space, *lists, **kwargs):
"""
Glues together two sets of strings using the amount of space requested.
The idea is to prettify.
Parameters
----------
space : int
number of spaces for padding
lists : str
list of str which are being joined
strlen : callable
function used to calculate the length of each str. Needed for unicode
handling.
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
strlen = kwargs.pop("strlen", len)
justfunc = kwargs.pop("justfunc", justify)
out_lines = []
newLists = []
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
# not the last one
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
nl = justfunc(lst, lengths[i], mode="left")
nl.extend([" " * lengths[i]] * (maxLen - len(lst)))
newLists.append(nl)
toJoin = zip(*newLists)
for lines in toJoin:
out_lines.append(_join_unicode(lines))
return _join_unicode(out_lines, sep="\n")
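# Illustrative example (not part of pandas): adjoin(2, ["a", "bb"], ["ccc", "d"])
# left-justifies each column and glues them with two spaces of padding:
#     a   ccc
#     bb  d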
def justify(texts, max_len, mode="right"):
"""
Perform ljust, center, rjust against string or list-like
"""
if mode == "left":
return [x.ljust(max_len) for x in texts]
elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
def _join_unicode(lines, sep=""):
try:
return sep.join(lines)
except UnicodeDecodeError:
sep = str(sep)
return sep.join([x.decode("utf-8") if isinstance(x, str) else x for x in lines])
# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or
# bytes(3.x)/str(2.x) representations of objects.
# Try to use these as much as possible rather than rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
# It will always return unicode text which can be handled by other
# parts of the package without breakage.
#
# 2) if you need to write something out to file, use
# pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
# working with straight ascii.
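# Illustrative examples (not part of pandas):
#   pprint_thing({"a": [1, 2]}, max_seq_items=1)     -> "{'a': [1, ...]}"
#   pprint_thing_encoded("naïve", encoding="utf-8")  -> b'na\xc3\xafve'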
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather than calling this directly.
bounds length of printed sequence, depending on options
"""
if isinstance(seq, set):
fmt = "{{{body}}}"
else:
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
s = iter(seq)
# handle sets, no slicing
r = [
pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
for i in range(min(nitems, len(seq)))
]
body = ", ".join(r)
if nitems < len(seq):
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
body += ","
return fmt.format(body=body)
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for dicts. you should probably use pprint_thing()
rather than calling this directly.
"""
fmt = "{{{things}}}"
pairs = []
pfmt = "{key}: {val}"
if max_seq_items is False:
nitems = len(seq)
else:
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
)
)
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
else:
return fmt.format(things=", ".join(pairs))
def pprint_thing(
thing,
_nest_lvl=0,
escape_chars=None,
default_escapes=False,
quote_strings=False,
max_seq_items=None,
):
"""
This function is the sanctioned way of converting objects
to a unicode representation.
properly handles nested sequences containing unicode strings
(unicode(object) does not)
Parameters
----------
thing : anything to be formatted
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
with pprint_sequence, this argument is used to keep track of the
current nesting level, and limit it.
escape_chars : list or dict, optional
Characters to escape. If a dict is passed the values are the
replacements
default_escapes : bool, default False
Whether the input escape characters replace or add to the defaults
max_seq_items : False, int, default None
Passed through to other pretty printers to limit sequence printing
Returns
-------
result - unicode str
"""
def as_escaped_unicode(thing, escape_chars=escape_chars):
# Unicode is fine, else we try to decode using utf-8 and 'replace'
# if that's not it either, we have no way of knowing and the user
# should deal with it themselves.
try:
result = str(thing) # we should try this first
except UnicodeDecodeError:
# either utf-8 or we replace errors
result = str(thing).decode("utf-8", "replace")
translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
if isinstance(escape_chars, dict):
if default_escapes:
translate.update(escape_chars)
else:
translate = escape_chars
escape_chars = list(escape_chars.keys())
else:
escape_chars = escape_chars or tuple()
for c in escape_chars:
result = result.replace(c, translate[c])
return str(result)
if hasattr(thing, "__next__"):
return str(thing)
elif isinstance(thing, dict) and _nest_lvl < get_option(
"display.pprint_nest_depth"
):
result = _pprint_dict(
thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
)
elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
result = _pprint_seq(
thing,
_nest_lvl,
escape_chars=escape_chars,
quote_strings=quote_strings,
max_seq_items=max_seq_items,
)
elif isinstance(thing, str) and quote_strings:
result = "'{thing}'".format(thing=as_escaped_unicode(thing))
else:
result = as_escaped_unicode(thing)
return str(result) # always unicode
def pprint_thing_encoded(object, encoding="utf-8", errors="replace", **kwds):
value = pprint_thing(object) # get unicode representation of object
return value.encode(encoding, errors, **kwds)
def _enable_data_resource_formatter(enable):
if "IPython" not in sys.modules:
# definitely not in IPython
return
from IPython import get_ipython
ip = get_ipython()
if ip is None:
# still not in IPython
return
formatters = ip.display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
if enable:
if mimetype not in formatters:
# define tableschema formatter
from IPython.core.formatters import BaseFormatter
class TableSchemaFormatter(BaseFormatter):
print_method = "_repr_data_resource_"
_return_type = (dict,)
# register it:
formatters[mimetype] = TableSchemaFormatter()
# enable it if it's been disabled:
formatters[mimetype].enabled = True
else:
# unregister tableschema mime-type
if mimetype in formatters:
formatters[mimetype].enabled = False
default_pprint = lambda x, max_seq_items=None: pprint_thing(
x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items
)
def format_object_summary(
obj,
formatter,
is_justify=True,
name=None,
indent_for_name=True,
line_break_each_value=False,
):
"""
Return the formatted obj as a unicode string
Parameters
----------
obj : object
must be iterable and support __getitem__
formatter : callable
string formatter for an element
is_justify : boolean
should justify the display
name : name, optional
defaults to the class name of the obj
indent_for_name : bool, default True
Whether subsequent lines should be indented to
align with the name.
line_break_each_value : bool, default False
If True, inserts a line break for each value of ``obj``.
If False, only break lines when a line of values gets wider
than the display width.
.. versionadded:: 0.25.0
Returns
-------
summary string
"""
from pandas.io.formats.console import get_console_size
from pandas.io.formats.format import _get_adjustment
display_width, _ = get_console_size()
if display_width is None:
display_width = get_option("display.width") or 80
if name is None:
name = obj.__class__.__name__
if indent_for_name:
name_len = len(name)
space1 = "\n%s" % (" " * (name_len + 1))
space2 = "\n%s" % (" " * (name_len + 2))
else:
space1 = "\n"
space2 = "\n " # space for the opening '['
n = len(obj)
if line_break_each_value:
# If we want to vertically align on each value of obj, we need to
# separate values by a line break and indent the values
sep = ",\n " + " " * len(name)
else:
sep = ","
max_seq_items = get_option("display.max_seq_items") or n
# are we a truncated display
is_truncated = n > max_seq_items
# adj can optionally handle unicode eastern asian width
adj = _get_adjustment()
def _extend_line(s, line, value, display_width, next_line_prefix):
if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
s += line.rstrip()
line = next_line_prefix
line += value
return s, line
def best_len(values):
if values:
return max(adj.len(x) for x in values)
else:
return 0
close = ", "
if n == 0:
summary = "[]{}".format(close)
elif n == 1 and not line_break_each_value:
first = formatter(obj[0])
summary = "[{}]{}".format(first, close)
elif n == 2 and not line_break_each_value:
first = formatter(obj[0])
last = formatter(obj[-1])
summary = "[{}, {}]{}".format(first, last, close)
else:
if n > max_seq_items:
n = min(max_seq_items // 2, 10)
head = [formatter(x) for x in obj[:n]]
tail = [formatter(x) for x in obj[-n:]]
else:
head = []
tail = [formatter(x) for x in obj]
# adjust all values to max length if needed
if is_justify:
if line_break_each_value:
# Justify each string in the values of head and tail, so the
# strings will right align when head and tail are stacked
# vertically.
head, tail = _justify(head, tail)
elif is_truncated or not (
len(", ".join(head)) < display_width
and len(", ".join(tail)) < display_width
):
# Each string in head and tail should align with each other
max_length = max(best_len(head), best_len(tail))
head = [x.rjust(max_length) for x in head]
tail = [x.rjust(max_length) for x in tail]
# If we are not truncated and we are only a single
# line, then don't justify
if line_break_each_value:
# Now head and tail are of type List[Tuple[str]]. Below we
# convert them into List[str], so there will be one string per
# value. Also truncate items horizontally if wider than
# max_space
max_space = display_width - len(space2)
value = tail[0]
for max_items in reversed(range(1, len(value) + 1)):
pprinted_seq = _pprint_seq(value, max_seq_items=max_items)
if len(pprinted_seq) < max_space:
break
head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
summary = ""
line = space2
for max_items in range(len(head)):
word = head[max_items] + sep + " "
summary, line = _extend_line(summary, line, word, display_width, space2)
if is_truncated:
# remove trailing space of last line
summary += line.rstrip() + space2 + "..."
line = space2
for max_items in range(len(tail) - 1):
word = tail[max_items] + sep + " "
summary, line = _extend_line(summary, line, word, display_width, space2)
# last value: no sep added + 1 space of width used for trailing ','
summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
summary += line
# right now close is either '' or ', '
# Now we want to include the ']', but not the maybe space.
close = "]" + close.rstrip(" ")
summary += close
if len(summary) > (display_width) or line_break_each_value:
summary += space1
else: # one row
summary += " "
# remove initial space
summary = "[" + summary[len(space2) :]
return summary
def _justify(head, tail):
"""
Justify items in head and tail, so they are right-aligned when stacked.
Parameters
----------
head : list-like of list-likes of strings
tail : list-like of list-likes of strings
Returns
-------
tuple of list of tuples of strings
Same as head and tail, but items are right aligned when stacked
vertically.
Examples
--------
>>> _justify([['a', 'b']], [['abc', 'abcd']])
([(' a', ' b')], [('abc', 'abcd')])
"""
combined = head + tail
# For each position for the sequences in ``combined``,
# find the length of the largest string.
max_length = [0] * len(combined[0])
for inner_seq in combined:
length = [len(item) for item in inner_seq]
max_length = [max(x, y) for x, y in zip(max_length, length)]
# justify each item in each list-like in head and tail using max_length
head = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
]
tail = [
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
]
return head, tail
def format_object_attrs(obj, include_dtype=True):
"""
Return a list of tuples of the (attr, formatted_value)
for common attrs, including dtype, name, length
Parameters
----------
obj : object
must be iterable
include_dtype : bool
If False, dtype won't be in the returned list
Returns
-------
list
"""
attrs = []
if hasattr(obj, "dtype") and include_dtype:
attrs.append(("dtype", "'{}'".format(obj.dtype)))
if getattr(obj, "name", None) is not None:
attrs.append(("name", default_pprint(obj.name)))
elif getattr(obj, "names", None) is not None and any(obj.names):
attrs.append(("names", default_pprint(obj.names)))
max_seq_items = get_option("display.max_seq_items") or len(obj)
if len(obj) > max_seq_items:
attrs.append(("length", len(obj)))
return attrs
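# Illustrative sketch (not part of pandas): these helpers build Index reprs.
if __name__ == "__main__":
    import pandas as pd

    idx = pd.Index(["a", "b", "c"], name="letters")
    print(format_object_attrs(idx))
    # expected: [('dtype', "'object'"), ('name', "'letters'")]
    print(format_object_summary(idx, formatter=str, name="Index"))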

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,70 @@
{# Update the template_structure.html document too #}
{%- block before_style -%}{%- endblock before_style -%}
{% block style %}
<style type="text/css" >
{% block table_styles %}
{% for s in table_styles %}
#T_{{uuid}} {{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor -%}
}
{%- endfor -%}
{% endblock table_styles %}
{% block before_cellstyle %}{% endblock before_cellstyle %}
{% block cellstyle %}
{%- for s in cellstyle %}
#T_{{uuid}}{{s.selector}} {
{% for p,val in s.props %}
{{p}}: {{val}};
{% endfor %}
}
{%- endfor -%}
{%- endblock cellstyle %}
</style>
{%- endblock style %}
{%- block before_table %}{% endblock before_table %}
{%- block table %}
<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
{%- block caption %}
{%- if caption -%}
<caption>{{caption}}</caption>
{%- endif -%}
{%- endblock caption %}
{%- block thead %}
<thead>
{%- block before_head_rows %}{% endblock %}
{%- for r in head %}
{%- block head_tr scoped %}
<tr>
{%- for c in r %}
{%- if c.is_visible != False %}
<{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
{%- endif %}
{%- endfor %}
</tr>
{%- endblock head_tr %}
{%- endfor %}
{%- block after_head_rows %}{% endblock %}
</thead>
{%- endblock thead %}
{%- block tbody %}
<tbody>
{% block before_rows %}{% endblock before_rows %}
{% for r in body %}
{% block tr scoped %}
<tr>
{% for c in r %}
{% if c.is_visible != False %}
<{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
{% endif %}
{%- endfor %}
</tr>
{% endblock tr %}
{%- endfor %}
{%- block after_rows %}{%- endblock after_rows %}
</tbody>
{%- endblock tbody %}
</table>
{%- endblock table %}
{%- block after_table %}{% endblock after_table %}
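{# Illustrative note (not part of pandas): this template renders Styler HTML.
   A minimal Python usage sketch, using only the public Styler API:
       import pandas as pd
       df = pd.DataFrame({"a": [1, -2]})
       html = df.style.applymap(lambda v: "color: red" if v < 0 else "").render()
   Styler fills in uuid, table_styles, cellstyle, head and body when rendering. #}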

View File

@@ -0,0 +1,210 @@
""" Google BigQuery support """
from pandas.compat._optional import import_optional_dependency
def _try_import():
# since pandas is a dependency of pandas-gbq
# we need to import on first use
msg = (
"pandas-gbq is required to load data from Google BigQuery. "
"See the docs: https://pandas-gbq.readthedocs.io."
)
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
return pandas_gbq
def read_gbq(
query,
project_id=None,
index_col=None,
col_order=None,
reauth=False,
auth_local_webserver=False,
dialect=None,
location=None,
configuration=None,
credentials=None,
use_bqstorage_api=None,
private_key=None,
verbose=None,
):
"""
Load data from Google BigQuery.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
See the `How to authenticate with Google BigQuery
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
guide for authentication instructions.
Parameters
----------
query : str
SQL-Like Query to return data values.
project_id : str, optional
Google BigQuery Account project ID. Optional when available from
the environment.
index_col : str, optional
Name of result column to use for index in results DataFrame.
col_order : list(str), optional
List of BigQuery column names in the desired order for results
DataFrame.
reauth : boolean, default False
Force Google BigQuery to re-authenticate the user. This is useful
if multiple accounts are used.
auth_local_webserver : boolean, default False
Use the `local webserver flow`_ instead of the `console flow`_
when getting user credentials.
.. _local webserver flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
.. _console flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
*New in version 0.2.0 of pandas-gbq*.
dialect : str, default 'legacy'
Note: The default value is changing to 'standard' in a future version.
SQL syntax dialect to use. Value can be one of:
``'legacy'``
Use BigQuery's legacy SQL dialect. For more information see
`BigQuery Legacy SQL Reference
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
``'standard'``
Use BigQuery's standard SQL, which is
compliant with the SQL 2011 standard. For more information
see `BigQuery Standard SQL Reference
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
.. versionchanged:: 0.24.0
location : str, optional
Location where the query job should run. See the `BigQuery locations
documentation
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
list of available locations. The location must match that of any
datasets used in the query.
*New in version 0.5.0 of pandas-gbq*.
configuration : dict, optional
Query config parameters for job processing.
For example:
configuration = {'query': {'useQueryCache': False}}
For more information see `BigQuery REST API Reference
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
credentials : google.auth.credentials.Credentials, optional
Credentials for accessing Google APIs. Use this parameter to override
default credentials, such as to use Compute Engine
:class:`google.auth.compute_engine.Credentials` or Service Account
:class:`google.oauth2.service_account.Credentials` directly.
*New in version 0.8.0 of pandas-gbq*.
.. versionadded:: 0.24.0
use_bqstorage_api : bool, default False
Use the `BigQuery Storage API
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
download query results quickly, but at an increased cost. To use this
API, first `enable it in the Cloud Console
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
You must also have the `bigquery.readsessions.create
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
permission on the project you are billing queries to.
This feature requires version 0.10.0 or later of the ``pandas-gbq``
package. It also requires the ``google-cloud-bigquery-storage`` and
``fastavro`` packages.
.. versionadded:: 0.25.0
private_key : str, deprecated
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
parameter and
:func:`google.oauth2.service_account.Credentials.from_service_account_info`
or
:func:`google.oauth2.service_account.Credentials.from_service_account_file`
instead.
Service account private key in JSON format. Can be file path
or string contents. This is useful for remote server
authentication (eg. Jupyter/IPython notebook on remote host).
verbose : None, deprecated
Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
adjust verbosity instead
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
Returns
-------
df: DataFrame
DataFrame representing results of query.
See Also
--------
pandas_gbq.read_gbq : This function in the pandas-gbq library.
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
"""
pandas_gbq = _try_import()
kwargs = {}
# START: new kwargs. Don't populate unless explicitly set.
if use_bqstorage_api is not None:
kwargs["use_bqstorage_api"] = use_bqstorage_api
# END: new kwargs
# START: deprecated kwargs. Don't populate unless explicitly set.
if verbose is not None:
kwargs["verbose"] = verbose
if private_key is not None:
kwargs["private_key"] = private_key
# END: deprecated kwargs
return pandas_gbq.read_gbq(
query,
project_id=project_id,
index_col=index_col,
col_order=col_order,
reauth=reauth,
auth_local_webserver=auth_local_webserver,
dialect=dialect,
location=location,
configuration=configuration,
credentials=credentials,
**kwargs
)
def to_gbq(
dataframe,
destination_table,
project_id=None,
chunksize=None,
reauth=False,
if_exists="fail",
auth_local_webserver=False,
table_schema=None,
location=None,
progress_bar=True,
credentials=None,
verbose=None,
private_key=None,
):
pandas_gbq = _try_import()
pandas_gbq.to_gbq(
dataframe,
destination_table,
project_id=project_id,
chunksize=chunksize,
reauth=reauth,
if_exists=if_exists,
auth_local_webserver=auth_local_webserver,
table_schema=table_schema,
location=location,
progress_bar=progress_bar,
credentials=credentials,
verbose=verbose,
private_key=private_key,
)
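# Illustrative sketch (not part of pandas): requires the pandas-gbq package and
# valid Google Cloud credentials; the project ID, table name and query below
# are placeholders.
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_gbq(
        "SELECT 1 AS one",            # placeholder query
        project_id="my-project",      # placeholder project
        dialect="standard",
    )
    df.to_gbq("my_dataset.my_table",  # placeholder destination
              project_id="my-project",
              if_exists="replace")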

View File

@@ -0,0 +1,18 @@
""" GCS support for remote file interactivity """
from pandas.compat._optional import import_optional_dependency
gcsfs = import_optional_dependency(
"gcsfs", extra="The gcsfs library is required to handle GCS files"
)
def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
):
if mode is None:
mode = "rb"
fs = gcsfs.GCSFileSystem()
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
return filepath_or_buffer, None, compression, True
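# Illustrative sketch (not part of pandas): with gcsfs installed, pandas readers
# route "gs://" URLs through get_filepath_or_buffer above; the bucket and path
# below are placeholders.
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_csv("gs://my-bucket/data.csv")  # placeholder URL
    print(df.head())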

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
from pandas.io.json._json import dumps, loads, read_json, to_json
from pandas.io.json._normalize import json_normalize
from pandas.io.json._table_schema import build_table_schema
__all__ = [
"dumps",
"loads",
"read_json",
"to_json",
"json_normalize",
"build_table_schema",
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,343 @@
# ---------------------------------------------------------------------
# JSON normalization routines
from collections import defaultdict
import copy
from typing import DefaultDict, Dict, List, Optional, Union
import numpy as np
from pandas._libs.writers import convert_json_to_lines
from pandas import DataFrame
def convert_to_line_delimits(s):
"""
Helper function that converts JSON lists to line delimited JSON.
"""
# Determine whether we have a JSON list to turn into lines; otherwise just
# return the JSON object as-is (only lists can be converted).
if not (s[0] == "[" and s[-1] == "]"):
return s
s = s[1:-1]
return convert_json_to_lines(s)
def nested_to_record(
ds,
prefix: str = "",
sep: str = ".",
level: int = 0,
max_level: Optional[int] = None,
):
"""
A simplified json_normalize
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
it does not attempt to extract a subset of the data.
Parameters
----------
ds : dict or list of dicts
prefix : str, optional, default ""
The prefix to prepend to flattened keys
sep : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
.. versionadded:: 0.20.0
level : int, optional, default 0
The current level of nesting (used internally during recursion).
max_level: int, optional, default: None
The max depth to normalize.
.. versionadded:: 0.25.0
Returns
-------
d - dict or list of dicts, matching `ds`
Examples
--------
>>> nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
...                       nested=dict(e=dict(c=1, d=2), d=2)))
{'dict1.c': 1,
'dict1.d': 2,
'flat1': 1,
'nested.d': 2,
'nested.e.c': 1,
'nested.e.d': 2}
"""
singleton = False
if isinstance(ds, dict):
ds = [ds]
singleton = True
new_ds = []
for d in ds:
new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
if not isinstance(k, str):
k = str(k)
if level == 0:
newkey = k
else:
newkey = prefix + sep + k
# flatten if type is dict and
# current dict level < maximum level provided and
# only dicts gets recurse-flattened
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict) or (
max_level is not None and level >= max_level
):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
new_ds.append(new_d)
if singleton:
return new_ds[0]
return new_ds
def json_normalize(
data: Union[Dict, List[Dict]],
record_path: Optional[Union[str, List]] = None,
meta: Optional[Union[str, List]] = None,
meta_prefix: Optional[str] = None,
record_prefix: Optional[str] = None,
errors: Optional[str] = "raise",
sep: str = ".",
max_level: Optional[int] = None,
):
"""
Normalize semi-structured JSON data into a flat table.
Parameters
----------
data : dict or list of dicts
Unserialized JSON objects.
record_path : str or list of str, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records.
meta : list of paths (str or list of str), default None
Fields to use as metadata for each record in resulting table.
meta_prefix : str, default None
If not None, prefix records' metadata column names with this dotted
path string, e.g. foo.bar.field if meta is ['foo', 'bar'].
record_prefix : str, default None
If not None, prefix record column names with this dotted path string,
e.g. foo.bar.field if path to records is ['foo', 'bar'].
errors : {'raise', 'ignore'}, default 'raise'
Configures error handling.
* 'ignore' : will ignore KeyError if keys listed in meta are not
always present.
* 'raise' : will raise KeyError if keys listed in meta are not
always present.
.. versionadded:: 0.20.0
sep : str, default '.'
Nested records will generate names separated by sep.
e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
.. versionadded:: 0.20.0
max_level : int, default None
Max number of levels(depth of dict) to normalize.
if None, normalizes all levels.
.. versionadded:: 0.25.0
Returns
-------
frame : DataFrame
Normalize semi-structured JSON data into a flat table.
Examples
--------
>>> from pandas.io.json import json_normalize
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
... {'name': {'given': 'Mose', 'family': 'Regner'}},
... {'id': 2, 'name': 'Faye Raker'}]
>>> json_normalize(data)
id name name.family name.first name.given name.last
0 1.0 NaN NaN Coleen NaN Volk
1 NaN NaN Regner NaN Mose NaN
2 2.0 Faye Raker NaN NaN NaN NaN
>>> data = [{'id': 1,
... 'name': "Cole Volk",
... 'fitness': {'height': 130, 'weight': 60}},
... {'name': "Mose Reg",
... 'fitness': {'height': 130, 'weight': 60}},
... {'id': 2, 'name': 'Faye Raker',
... 'fitness': {'height': 130, 'weight': 60}}]
>>> json_normalize(data, max_level=0)
fitness id name
0 {'height': 130, 'weight': 60} 1.0 Cole Volk
1 {'height': 130, 'weight': 60} NaN Mose Reg
2 {'height': 130, 'weight': 60} 2.0 Faye Raker
Normalizes nested data up to level 1.
>>> data = [{'id': 1,
... 'name': "Cole Volk",
... 'fitness': {'height': 130, 'weight': 60}},
... {'name': "Mose Reg",
... 'fitness': {'height': 130, 'weight': 60}},
... {'id': 2, 'name': 'Faye Raker',
... 'fitness': {'height': 130, 'weight': 60}}]
>>> json_normalize(data, max_level=1)
fitness.height fitness.weight id name
0 130 60 1.0 Cole Volk
1 130 60 NaN Mose Reg
2 130 60 2.0 Faye Raker
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {'governor': 'Rick Scott'},
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {'governor': 'John Kasich'},
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
... ['info', 'governor']])
>>> result
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich
>>> data = {'A': [1, 2]}
>>> json_normalize(data, 'A', record_prefix='Prefix.')
Prefix.0
0 1
1 2
Returns normalized data with columns prefixed with the given string.
"""
def _pull_field(js, spec):
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
return result
if isinstance(data, list) and not data:
return DataFrame()
# A bit of a hackjob
if isinstance(data, dict):
data = [data]
if record_path is None:
        if any(isinstance(x, dict) for y in data for x in y.values()):
# naive normalization, this is idempotent for flat records
# and potentially will inflate the data considerably for
# deeply nested structures:
            # {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record values which are lists, at least error
            # reasonably
data = nested_to_record(data, sep=sep, max_level=max_level)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
if meta is None:
meta = []
elif not isinstance(meta, list):
meta = [meta]
meta = [m if isinstance(m, list) else [m] for m in meta]
# Disastrously inefficient for now
records = [] # type: List
lengths = []
meta_vals = defaultdict(list) # type: DefaultDict
meta_keys = [sep.join(val) for val in meta]
def _recursive_extract(data, path, seen_meta, level=0):
if isinstance(data, dict):
data = [data]
if len(path) > 1:
for obj in data:
for val, key in zip(meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_field(obj, path[0])
recs = [
nested_to_record(r, sep=sep, max_level=max_level)
if isinstance(r, dict)
else r
for r in recs
]
# For repeating the metadata later
lengths.append(len(recs))
for val, key in zip(meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == "ignore":
meta_val = np.nan
else:
raise KeyError(
"Try running with "
"errors='ignore' as key "
"{err} is not always present".format(err=e)
)
meta_vals[key].append(meta_val)
records.extend(recs)
_recursive_extract(data, record_path, {}, level=0)
result = DataFrame(records)
if record_prefix is not None:
result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
# Data types, a problem
for k, v in meta_vals.items():
if meta_prefix is not None:
k = meta_prefix + k
if k in result:
raise ValueError(
"Conflicting metadata name {name}, "
"need distinguishing prefix ".format(name=k)
)
result[k] = np.array(v, dtype=object).repeat(lengths)
return result

View File

@@ -0,0 +1,338 @@
"""
Table Schema builders
http://specs.frictionlessdata.io/json-table-schema/
"""
import warnings
import pandas._libs.json as json
from pandas.core.dtypes.common import (
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_integer_dtype,
is_numeric_dtype,
is_period_dtype,
is_string_dtype,
is_timedelta64_dtype,
)
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
import pandas.core.common as com
loads = json.loads
def as_json_table_type(x):
"""
Convert a NumPy / pandas type to its corresponding json_table.
Parameters
----------
x : array or dtype
Returns
-------
t : str
the Table Schema data types
Notes
-----
This table shows the relationship between NumPy / pandas dtypes,
and Table Schema dtypes.
    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================
"""
if is_integer_dtype(x):
return "integer"
elif is_bool_dtype(x):
return "boolean"
elif is_numeric_dtype(x):
return "number"
elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
return "datetime"
elif is_timedelta64_dtype(x):
return "duration"
elif is_categorical_dtype(x):
return "any"
elif is_string_dtype(x):
return "string"
else:
return "any"
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if com._all_not_none(*data.index.names):
nms = data.index.names
if len(nms) == 1 and data.index.name == "index":
warnings.warn("Index name of 'index' is not round-trippable")
elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
warnings.warn(
"Index names beginning with 'level_' are not " "round-trippable"
)
return data
data = data.copy()
if data.index.nlevels > 1:
names = [
name if name is not None else "level_{}".format(i)
for i, name in enumerate(data.index.names)
]
data.index.names = names
else:
data.index.name = data.index.name or "index"
return data
def convert_pandas_type_to_json_field(arr, dtype=None):
dtype = dtype or arr.dtype
if arr.name is None:
name = "values"
else:
name = arr.name
field = {"name": name, "type": as_json_table_type(dtype)}
if is_categorical_dtype(arr):
if hasattr(arr, "categories"):
cats = arr.categories
ordered = arr.ordered
else:
cats = arr.cat.categories
ordered = arr.cat.ordered
field["constraints"] = {"enum": list(cats)}
field["ordered"] = ordered
elif is_period_dtype(arr):
field["freq"] = arr.freqstr
elif is_datetime64tz_dtype(arr):
if hasattr(arr, "dt"):
field["tz"] = arr.dt.tz.zone
else:
field["tz"] = arr.tz.zone
return field
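# A hedged sketch of the field builder above; a plain named integer Series is
# assumed, so only the generic branch is exercised:
# >>> import pandas as pd
# >>> convert_pandas_type_to_json_field(pd.Series([1, 2], name='a'))
# {'name': 'a', 'type': 'integer'}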
def convert_json_field_to_pandas_type(field):
"""
Converts a JSON field descriptor into its corresponding NumPy / pandas type
Parameters
----------
field
A JSON field descriptor
Returns
-------
dtype
Raises
------
ValueError
If the type of the provided field is unknown or currently unsupported
Examples
--------
>>> convert_json_field_to_pandas_type({'name': 'an_int',
'type': 'integer'})
'int64'
>>> convert_json_field_to_pandas_type({'name': 'a_categorical',
'type': 'any',
'constraints': {'enum': [
'a', 'b', 'c']},
'ordered': True})
'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime',
'type': 'datetime'})
'datetime64[ns]'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
'type': 'datetime',
'tz': 'US/Central'})
'datetime64[ns, US/Central]'
"""
typ = field["type"]
if typ == "string":
return "object"
elif typ == "integer":
return "int64"
elif typ == "number":
return "float64"
elif typ == "boolean":
return "bool"
elif typ == "duration":
return "timedelta64"
elif typ == "datetime":
if field.get("tz"):
return "datetime64[ns, {tz}]".format(tz=field["tz"])
else:
return "datetime64[ns]"
elif typ == "any":
if "constraints" in field and "ordered" in field:
return CategoricalDtype(
categories=field["constraints"]["enum"], ordered=field["ordered"]
)
else:
return "object"
raise ValueError("Unsupported or invalid field type: {}".format(typ))
def build_table_schema(data, index=True, primary_key=None, version=True):
"""
Create a Table schema from ``data``.
Parameters
----------
data : Series, DataFrame
index : bool, default True
Whether to include ``data.index`` in the schema.
    primary_key : bool or None, default None
        Column names to designate as the primary key.
The default `None` will set `'primaryKey'` to the index
level or levels if the index is unique.
version : bool, default True
Whether to include a field `pandas_version` with the version
of pandas that generated the schema.
Returns
-------
schema : dict
Notes
-----
    See `as_json_table_type` for conversion types.
    Timedeltas are converted to ISO 8601 duration format with
9 decimal places after the seconds field for nanosecond precision.
Categoricals are converted to the `any` dtype, and use the `enum` field
constraint to list the allowed values. The `ordered` attribute is included
in an `ordered` field.
Examples
--------
>>> df = pd.DataFrame(
... {'A': [1, 2, 3],
... 'B': ['a', 'b', 'c'],
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
... }, index=pd.Index(range(3), name='idx'))
>>> build_table_schema(df)
{'fields': [{'name': 'idx', 'type': 'integer'},
{'name': 'A', 'type': 'integer'},
{'name': 'B', 'type': 'string'},
{'name': 'C', 'type': 'datetime'}],
'pandas_version': '0.20.0',
'primaryKey': ['idx']}
"""
if index is True:
data = set_default_names(data)
schema = {}
fields = []
if index:
if data.index.nlevels > 1:
for level in data.index.levels:
fields.append(convert_pandas_type_to_json_field(level))
else:
fields.append(convert_pandas_type_to_json_field(data.index))
if data.ndim > 1:
for column, s in data.items():
fields.append(convert_pandas_type_to_json_field(s))
else:
fields.append(convert_pandas_type_to_json_field(data))
schema["fields"] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
schema["primaryKey"] = [data.index.name]
else:
schema["primaryKey"] = data.index.names
elif primary_key is not None:
schema["primaryKey"] = primary_key
if version:
schema["pandas_version"] = "0.20.0"
return schema
def parse_table_schema(json, precise_float):
"""
Builds a DataFrame from a given schema
Parameters
----------
json :
A JSON table schema
precise_float : boolean
Flag controlling precision when decoding string to double values, as
dictated by ``read_json``
Returns
-------
df : DataFrame
Raises
------
NotImplementedError
If the JSON table schema contains either timezone or timedelta data
Notes
-----
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
name-less :class:`Index`, this function sets the name of the returned
:class:`DataFrame` to ``None`` when said string is encountered with a
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
applies to any strings beginning with 'level_'. Therefore, an
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
with 'level_' are not supported.
See Also
--------
build_table_schema : Inverse function.
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
col_order = [field["name"] for field in table["schema"]["fields"]]
df = DataFrame(table["data"], columns=col_order)[col_order]
dtypes = {
field["name"]: convert_json_field_to_pandas_type(field)
for field in table["schema"]["fields"]
}
    # Cannot directly use astype with timezone data on object; raise for now
if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()):
raise NotImplementedError('table="orient" can not yet read timezone ' "data")
# No ISO constructor for Timedelta as of yet, so need to raise
if "timedelta64" in dtypes.values():
raise NotImplementedError(
'table="orient" can not yet read ' "ISO-formatted Timedelta data"
)
df = df.astype(dtypes)
if "primaryKey" in table["schema"]:
df = df.set_index(table["schema"]["primaryKey"])
if len(df.index.names) == 1:
if df.index.name == "index":
df.index.name = None
else:
df.index.names = [
None if x.startswith("level_") else x for x in df.index.names
]
return df
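# A hedged round-trip sketch: build_table_schema/parse_table_schema back
# DataFrame.to_json(orient='table') and read_json(orient='table'), so a simple
# frame with an int64 index should survive the trip unchanged:
# >>> import pandas as pd
# >>> df = pd.DataFrame({'A': [1, 2]}, index=pd.Index([0, 1], name='idx'))
# >>> pd.read_json(df.to_json(orient='table'), orient='table').equals(df)
# True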

View File

@@ -0,0 +1,52 @@
# coding: utf-8
from collections import namedtuple
from pandas.io.msgpack.exceptions import * # noqa
from pandas.io.msgpack._version import version # noqa
class ExtType(namedtuple("ExtType", "code data")):
"""ExtType represents ext type in msgpack."""
def __new__(cls, code, data):
if not isinstance(code, int):
raise TypeError("code must be int")
if not isinstance(data, bytes):
raise TypeError("data must be bytes")
if not 0 <= code <= 127:
raise ValueError("code must be 0~127")
return super().__new__(cls, code, data)
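# Illustrative only: ExtType is a plain namedtuple plus input validation, e.g.
# >>> ExtType(5, b'payload')
# ExtType(code=5, data=b'payload')
# a code outside 0..127 raises ValueError, non-bytes data raises TypeError.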
import os # noqa
from pandas.io.msgpack._packer import Packer # noqa
from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa
def pack(o, stream, **kwargs):
"""
Pack object `o` and write it to `stream`
See :class:`Packer` for options.
"""
packer = Packer(**kwargs)
stream.write(packer.pack(o))
def packb(o, **kwargs):
"""
Pack object `o` and return packed bytes
See :class:`Packer` for options.
"""
return Packer(**kwargs).pack(o)
# alias for compatibility to simplejson/marshal/pickle.
load = unpack
loads = unpackb
dump = pack
dumps = packb
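# A hedged usage sketch for this vendored interface, assuming the 0.4.6
# extension defaults (in particular use_list=True for the unpacker):
# >>> payload = packb([1, 2, 3])
# >>> isinstance(payload, bytes)
# True
# >>> unpackb(payload)
# [1, 2, 3]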

View File

@@ -0,0 +1 @@
version = (0, 4, 6)

View File

@@ -0,0 +1,31 @@
class UnpackException(Exception):
pass
class BufferFull(UnpackException):
pass
class OutOfData(UnpackException):
pass
class UnpackValueError(UnpackException, ValueError):
pass
class ExtraData(ValueError):
def __init__(self, unpacked, extra):
self.unpacked = unpacked
self.extra = extra
def __str__(self):
return "unpack(b) received extra data."
class PackException(Exception):
pass
class PackValueError(PackException, ValueError):
pass

View File

@@ -0,0 +1,891 @@
"""
Msgpack serializer support for reading and writing pandas data structures
to disk
portions of msgpack_numpy package, by Lev Givon were incorporated
into this module (and tests_packers.py)
License
=======
Copyright (c) 2013, Lev Givon.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Lev Givon nor the names of any
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from datetime import date, datetime, timedelta
from io import BytesIO
import os
import warnings
from dateutil.parser import parse
import numpy as np
from pandas.compat._optional import import_optional_dependency
from pandas.errors import PerformanceWarning
from pandas.util._move import (
BadMove as _BadMove,
move_into_mutable_buffer as _move_into_mutable_buffer,
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64tz_dtype,
is_object_dtype,
needs_i8_conversion,
pandas_dtype,
)
from pandas import ( # noqa:F401
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Float64Index,
Index,
Int64Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Period,
PeriodIndex,
RangeIndex,
Series,
TimedeltaIndex,
Timestamp,
)
from pandas.core import internals
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.generic import NDFrame
from pandas.core.internals import BlockManager, _safe_reshape, make_block
from pandas.core.sparse.api import SparseDataFrame, SparseSeries
from pandas.io.common import _stringify_path, get_filepath_or_buffer
from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker
# until we can pass this into our conversion functions,
# this is pretty hacky
compressor = None
def to_msgpack(path_or_buf, *args, **kwargs):
"""
msgpack (serialize) object to input file path
.. deprecated:: 0.25.0
to_msgpack is deprecated and will be removed in a future version.
It is recommended to use pyarrow for on-the-wire transmission of
pandas objects.
Parameters
----------
    path_or_buf : str (file path), buffer-like, or None
        If None, return the generated bytes.
args : an object or objects to serialize
encoding : encoding for unicode objects
append : boolean whether to append to an existing msgpack
(default is False)
compress : type of compressor (zlib or blosc), default to None (no
compression)
"""
warnings.warn(
"to_msgpack is deprecated and will be removed in a "
"future version.\n"
"It is recommended to use pyarrow for on-the-wire "
"transmission of pandas objects.",
FutureWarning,
stacklevel=3,
)
global compressor
compressor = kwargs.pop("compress", None)
append = kwargs.pop("append", None)
if append:
mode = "a+b"
else:
mode = "wb"
def writer(fh):
for a in args:
fh.write(pack(a, **kwargs))
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
try:
with open(path_or_buf, mode) as fh:
writer(fh)
except FileNotFoundError:
msg = "File b'{}' does not exist".format(path_or_buf)
raise FileNotFoundError(msg)
elif path_or_buf is None:
buf = BytesIO()
writer(buf)
return buf.getvalue()
else:
writer(path_or_buf)
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path.
.. deprecated:: 0.25.0
read_msgpack is deprecated and will be removed in a future version.
It is recommended to use pyarrow for on-the-wire transmission of
pandas objects.
Parameters
----------
path_or_buf : str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected.
If you want to pass in a path object, pandas accepts any
``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function) or
``StringIO``.
encoding : Encoding for decoding msgpack str type
iterator : boolean, if True, return an iterator to the unpacker
(default is False)
Returns
-------
obj : same type as object stored in file
Notes
-----
read_msgpack is only guaranteed to be backwards compatible to pandas
0.20.3.
"""
warnings.warn(
"The read_msgpack is deprecated and will be removed in a "
"future version.\n"
"It is recommended to use pyarrow for on-the-wire "
"transmission of pandas objects.",
FutureWarning,
stacklevel=3,
)
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
if iterator:
return Iterator(path_or_buf)
def read(fh):
unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
if len(unpacked_obj) == 1:
return unpacked_obj[0]
if should_close:
try:
path_or_buf.close()
except IOError:
pass
return unpacked_obj
# see if we have an actual file
if isinstance(path_or_buf, str):
try:
with open(path_or_buf, "rb") as fh:
return read(fh)
except FileNotFoundError:
msg = "File b'{}' does not exist".format(path_or_buf)
raise FileNotFoundError(msg)
if isinstance(path_or_buf, bytes):
# treat as a binary-like
fh = None
try:
fh = BytesIO(path_or_buf)
return read(fh)
finally:
if fh is not None:
fh.close()
elif hasattr(path_or_buf, "read") and callable(path_or_buf.read):
# treat as a buffer like
return read(path_or_buf)
raise ValueError("path_or_buf needs to be a string file path or file-like")
dtype_dict = {
21: np.dtype("M8[ns]"),
"datetime64[ns]": np.dtype("M8[ns]"),
"datetime64[us]": np.dtype("M8[us]"),
22: np.dtype("m8[ns]"),
"timedelta64[ns]": np.dtype("m8[ns]"),
"timedelta64[us]": np.dtype("m8[us]"),
# this is platform int, which we need to remap to np.int64
# for compat on windows platforms
7: np.dtype("int64"),
"category": "category",
}
def dtype_for(t):
""" return my dtype mapping, whether number or name """
if t in dtype_dict:
return dtype_dict[t]
return np.typeDict.get(t, t)
c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32}
# windows (32 bit) compat
if hasattr(np, "float128"):
c2f_dict["complex256"] = np.float128
def c2f(r, i, ctype_name):
"""
Convert strings to complex number instance with specified numpy type.
"""
ftype = c2f_dict[ctype_name]
return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
def convert(values):
""" convert the numpy values to a list """
dtype = values.dtype
if is_categorical_dtype(values):
return values
elif is_object_dtype(dtype):
return values.ravel().tolist()
if needs_i8_conversion(dtype):
values = values.view("i8")
v = values.ravel()
if compressor == "zlib":
zlib = import_optional_dependency(
"zlib", extra="zlib is required when `compress='zlib'`."
)
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, zlib.compress(v))
elif compressor == "blosc":
blosc = import_optional_dependency(
"blosc", extra="zlib is required when `compress='blosc'`."
)
# return string arrays like they are
if dtype == np.object_:
return v.tolist()
# convert to a bytes array
v = v.tostring()
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
# ndarray (on original dtype)
return ExtType(0, v.tostring())
def unconvert(values, dtype, compress=None):
as_is_ext = isinstance(values, ExtType) and values.code == 0
if as_is_ext:
values = values.data
if is_categorical_dtype(dtype):
return values
elif is_object_dtype(dtype):
return np.array(values, dtype=object)
dtype = pandas_dtype(dtype).base
if not as_is_ext:
values = values.encode("latin1")
if compress:
if compress == "zlib":
zlib = import_optional_dependency(
"zlib", extra="zlib is required when `compress='zlib'`."
)
decompress = zlib.decompress
elif compress == "blosc":
blosc = import_optional_dependency(
"blosc", extra="zlib is required when `compress='blosc'`."
)
decompress = blosc.decompress
else:
raise ValueError("compress must be one of 'zlib' or 'blosc'")
try:
return np.frombuffer(
_move_into_mutable_buffer(decompress(values)), dtype=dtype
)
except _BadMove as e:
# Pull the decompressed data off of the `_BadMove` exception.
# We don't just store this in the locals because we want to
# minimize the risk of giving users access to a `bytes` object
# whose data is also given to a mutable buffer.
values = e.args[0]
if len(values) > 1:
# The empty string and single characters are memoized in many
# string creating functions in the capi. This case should not
# warn even though we need to make a copy because we are only
# copying at most 1 byte.
warnings.warn(
"copying data after decompressing; this may mean that"
" decompress is caching its result",
PerformanceWarning,
)
                # fall through to the np.frombuffer copy below
# Copy the bytes into a numpy array.
buf = np.frombuffer(values, dtype=dtype)
buf = buf.copy() # required to not mutate the original data
buf.flags.writeable = True
return buf
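# A minimal round trip through convert()/unconvert() with the module-level
# `compressor` left at its default of None (no compression); illustrative only:
# >>> import numpy as np
# >>> unconvert(convert(np.arange(3, dtype='int64')), 'int64')
# array([0, 1, 2])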
def encode(obj):
"""
Data encoder
"""
tobj = type(obj)
if isinstance(obj, Index):
if isinstance(obj, RangeIndex):
return {
"typ": "range_index",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"start": obj._range.start,
"stop": obj._range.stop,
"step": obj._range.step,
}
elif isinstance(obj, PeriodIndex):
return {
"typ": "period_index",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"freq": getattr(obj, "freqstr", None),
"dtype": obj.dtype.name,
"data": convert(obj.asi8),
"compress": compressor,
}
elif isinstance(obj, DatetimeIndex):
tz = getattr(obj, "tz", None)
# store tz info and data as UTC
if tz is not None:
tz = tz.zone
obj = obj.tz_convert("UTC")
return {
"typ": "datetime_index",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"dtype": obj.dtype.name,
"data": convert(obj.asi8),
"freq": getattr(obj, "freqstr", None),
"tz": tz,
"compress": compressor,
}
elif isinstance(obj, (IntervalIndex, IntervalArray)):
if isinstance(obj, IntervalIndex):
typ = "interval_index"
else:
typ = "interval_array"
return {
"typ": typ,
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"left": getattr(obj, "left", None),
"right": getattr(obj, "right", None),
"closed": getattr(obj, "closed", None),
}
elif isinstance(obj, MultiIndex):
return {
"typ": "multi_index",
"klass": obj.__class__.__name__,
"names": getattr(obj, "names", None),
"dtype": obj.dtype.name,
"data": convert(obj.values),
"compress": compressor,
}
else:
return {
"typ": "index",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"dtype": obj.dtype.name,
"data": convert(obj.values),
"compress": compressor,
}
elif isinstance(obj, Categorical):
return {
"typ": "category",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"codes": obj.codes,
"categories": obj.categories,
"ordered": obj.ordered,
"compress": compressor,
}
elif isinstance(obj, Series):
if isinstance(obj, SparseSeries):
raise NotImplementedError("msgpack sparse series is not implemented")
# d = {'typ': 'sparse_series',
# 'klass': obj.__class__.__name__,
# 'dtype': obj.dtype.name,
# 'index': obj.index,
# 'sp_index': obj.sp_index,
# 'sp_values': convert(obj.sp_values),
# 'compress': compressor}
# for f in ['name', 'fill_value', 'kind']:
# d[f] = getattr(obj, f, None)
# return d
else:
return {
"typ": "series",
"klass": obj.__class__.__name__,
"name": getattr(obj, "name", None),
"index": obj.index,
"dtype": obj.dtype.name,
"data": convert(obj.values),
"compress": compressor,
}
elif issubclass(tobj, NDFrame):
if isinstance(obj, SparseDataFrame):
raise NotImplementedError("msgpack sparse frame is not implemented")
# d = {'typ': 'sparse_dataframe',
# 'klass': obj.__class__.__name__,
# 'columns': obj.columns}
# for f in ['default_fill_value', 'default_kind']:
# d[f] = getattr(obj, f, None)
# d['data'] = dict([(name, ss)
# for name, ss in obj.items()])
# return d
else:
data = obj._data
if not data.is_consolidated():
data = data.consolidate()
# the block manager
return {
"typ": "block_manager",
"klass": obj.__class__.__name__,
"axes": data.axes,
"blocks": [
{
"locs": b.mgr_locs.as_array,
"values": convert(b.values),
"shape": b.values.shape,
"dtype": b.dtype.name,
"klass": b.__class__.__name__,
"compress": compressor,
}
for b in data.blocks
],
}
elif (
isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64))
or obj is NaT
):
if isinstance(obj, Timestamp):
tz = obj.tzinfo
if tz is not None:
tz = tz.zone
freq = obj.freq
if freq is not None:
freq = freq.freqstr
return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz}
if obj is NaT:
return {"typ": "nat"}
elif isinstance(obj, np.timedelta64):
return {"typ": "timedelta64", "data": obj.view("i8")}
elif isinstance(obj, timedelta):
return {
"typ": "timedelta",
"data": (obj.days, obj.seconds, obj.microseconds),
}
elif isinstance(obj, np.datetime64):
return {"typ": "datetime64", "data": str(obj)}
elif isinstance(obj, datetime):
return {"typ": "datetime", "data": obj.isoformat()}
elif isinstance(obj, date):
return {"typ": "date", "data": obj.isoformat()}
raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj))
elif isinstance(obj, Period):
return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr}
elif isinstance(obj, Interval):
return {
"typ": "interval",
"left": obj.left,
"right": obj.right,
"closed": obj.closed,
}
elif isinstance(obj, BlockIndex):
return {
"typ": "block_index",
"klass": obj.__class__.__name__,
"blocs": obj.blocs,
"blengths": obj.blengths,
"length": obj.length,
}
elif isinstance(obj, IntIndex):
return {
"typ": "int_index",
"klass": obj.__class__.__name__,
"indices": obj.indices,
"length": obj.length,
}
elif isinstance(obj, np.ndarray):
return {
"typ": "ndarray",
"shape": obj.shape,
"ndim": obj.ndim,
"dtype": obj.dtype.name,
"data": convert(obj),
"compress": compressor,
}
elif isinstance(obj, np.number):
if np.iscomplexobj(obj):
return {
"typ": "np_scalar",
"sub_typ": "np_complex",
"dtype": obj.dtype.name,
"real": np.real(obj).__repr__(),
"imag": np.imag(obj).__repr__(),
}
else:
return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()}
elif isinstance(obj, complex):
return {
"typ": "np_complex",
"real": np.real(obj).__repr__(),
"imag": np.imag(obj).__repr__(),
}
return obj
def decode(obj):
"""
Decoder for deserializing numpy data types.
"""
typ = obj.get("typ")
if typ is None:
return obj
elif typ == "timestamp":
freq = obj["freq"] if "freq" in obj else obj["offset"]
return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
elif typ == "nat":
return NaT
elif typ == "period":
return Period(ordinal=obj["ordinal"], freq=obj["freq"])
elif typ == "index":
dtype = dtype_for(obj["dtype"])
data = unconvert(obj["data"], dtype, obj.get("compress"))
return Index(data, dtype=dtype, name=obj["name"])
elif typ == "range_index":
return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"])
elif typ == "multi_index":
dtype = dtype_for(obj["dtype"])
data = unconvert(obj["data"], dtype, obj.get("compress"))
data = [tuple(x) for x in data]
return MultiIndex.from_tuples(data, names=obj["names"])
elif typ == "period_index":
data = unconvert(obj["data"], np.int64, obj.get("compress"))
d = dict(name=obj["name"], freq=obj["freq"])
freq = d.pop("freq", None)
return PeriodIndex(PeriodArray(data, freq), **d)
elif typ == "datetime_index":
data = unconvert(obj["data"], np.int64, obj.get("compress"))
d = dict(name=obj["name"], freq=obj["freq"])
result = DatetimeIndex(data, **d)
tz = obj["tz"]
# reverse tz conversion
if tz is not None:
result = result.tz_localize("UTC").tz_convert(tz)
return result
elif typ in ("interval_index", "interval_array"):
return globals()[obj["klass"]].from_arrays(
obj["left"], obj["right"], obj["closed"], name=obj["name"]
)
elif typ == "category":
from_codes = globals()[obj["klass"]].from_codes
return from_codes(
codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"]
)
elif typ == "interval":
return Interval(obj["left"], obj["right"], obj["closed"])
elif typ == "series":
dtype = dtype_for(obj["dtype"])
index = obj["index"]
data = unconvert(obj["data"], dtype, obj["compress"])
return Series(data, index=index, dtype=dtype, name=obj["name"])
elif typ == "block_manager":
axes = obj["axes"]
def create_block(b):
values = _safe_reshape(
unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"]
)
# locs handles duplicate column names, and should be used instead
# of items; see GH 9618
if "locs" in b:
placement = b["locs"]
else:
placement = axes[0].get_indexer(b["items"])
if is_datetime64tz_dtype(b["dtype"]):
assert isinstance(values, np.ndarray), type(values)
assert values.dtype == "M8[ns]", values.dtype
values = DatetimeArray(values, dtype=b["dtype"])
return make_block(
values=values,
klass=getattr(internals, b["klass"]),
placement=placement,
dtype=b["dtype"],
)
blocks = [create_block(b) for b in obj["blocks"]]
return globals()[obj["klass"]](BlockManager(blocks, axes))
elif typ == "datetime":
return parse(obj["data"])
elif typ == "datetime64":
return np.datetime64(parse(obj["data"]))
elif typ == "date":
return parse(obj["data"]).date()
elif typ == "timedelta":
return timedelta(*obj["data"])
elif typ == "timedelta64":
return np.timedelta64(int(obj["data"]))
# elif typ == 'sparse_series':
# dtype = dtype_for(obj['dtype'])
# return SparseSeries(
# unconvert(obj['sp_values'], dtype, obj['compress']),
# sparse_index=obj['sp_index'], index=obj['index'],
# fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
# elif typ == 'sparse_dataframe':
# return SparseDataFrame(
# obj['data'], columns=obj['columns'],
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind']
# )
elif typ == "block_index":
return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"])
elif typ == "int_index":
return globals()[obj["klass"]](obj["length"], obj["indices"])
elif typ == "ndarray":
return unconvert(
obj["data"], np.typeDict[obj["dtype"]], obj.get("compress")
).reshape(obj["shape"])
elif typ == "np_scalar":
if obj.get("sub_typ") == "np_complex":
return c2f(obj["real"], obj["imag"], obj["dtype"])
else:
dtype = dtype_for(obj["dtype"])
try:
return dtype(obj["data"])
except (ValueError, TypeError):
return dtype.type(obj["data"])
elif typ == "np_complex":
return complex(obj["real"] + "+" + obj["imag"] + "j")
elif isinstance(obj, (dict, list, set)):
return obj
else:
return obj
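# encode() and decode() are intended to be inverses for the supported pandas
# and scalar types; a hedged example with a timezone-aware Timestamp:
# >>> decode(encode(Timestamp('2019-01-01', tz='UTC')))
# Timestamp('2019-01-01 00:00:00+0000', tz='UTC')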
def pack(
o,
default=encode,
encoding="utf-8",
unicode_errors="strict",
use_single_float=False,
autoreset=1,
use_bin_type=1,
):
"""
Pack an object and return the packed bytes.
"""
return Packer(
default=default,
encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type,
).pack(o)
def unpack(
packed,
object_hook=decode,
list_hook=None,
use_list=False,
encoding="utf-8",
unicode_errors="strict",
object_pairs_hook=None,
max_buffer_size=0,
ext_hook=ExtType,
):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
"""
return Unpacker(
packed,
object_hook=object_hook,
list_hook=list_hook,
use_list=use_list,
encoding=encoding,
unicode_errors=unicode_errors,
object_pairs_hook=object_pairs_hook,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook,
)
class Packer(_Packer):
def __init__(
self,
default=encode,
encoding="utf-8",
unicode_errors="strict",
use_single_float=False,
autoreset=1,
use_bin_type=1,
):
super().__init__(
default=default,
encoding=encoding,
unicode_errors=unicode_errors,
use_single_float=use_single_float,
autoreset=autoreset,
use_bin_type=use_bin_type,
)
class Unpacker(_Unpacker):
def __init__(
self,
file_like=None,
read_size=0,
use_list=False,
object_hook=decode,
object_pairs_hook=None,
list_hook=None,
encoding="utf-8",
unicode_errors="strict",
max_buffer_size=0,
ext_hook=ExtType,
):
super().__init__(
file_like=file_like,
read_size=read_size,
use_list=use_list,
object_hook=object_hook,
object_pairs_hook=object_pairs_hook,
list_hook=list_hook,
encoding=encoding,
unicode_errors=unicode_errors,
max_buffer_size=max_buffer_size,
ext_hook=ext_hook,
)
class Iterator:
""" manage the unpacking iteration,
close the file on completion """
def __init__(self, path, **kwargs):
self.path = path
self.kwargs = kwargs
def __iter__(self):
needs_closing = True
try:
# see if we have an actual file
if isinstance(self.path, str):
try:
path_exists = os.path.exists(self.path)
except TypeError:
path_exists = False
if path_exists:
fh = open(self.path, "rb")
else:
fh = BytesIO(self.path)
else:
if not hasattr(self.path, "read"):
fh = BytesIO(self.path)
else:
# a file-like
needs_closing = False
fh = self.path
unpacker = unpack(fh)
for o in unpacker:
yield o
finally:
if needs_closing:
fh.close()

View File

@@ -0,0 +1,294 @@
""" parquet compat """
from warnings import catch_warnings
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas import DataFrame, get_option
from pandas.io.common import get_filepath_or_buffer, is_s3_url
def get_engine(engine):
""" return our implementation """
if engine == "auto":
engine = get_option("io.parquet.engine")
if engine == "auto":
# try engines in this order
try:
return PyArrowImpl()
except ImportError:
pass
try:
return FastParquetImpl()
except ImportError:
pass
raise ImportError(
"Unable to find a usable engine; "
"tried using: 'pyarrow', 'fastparquet'.\n"
"pyarrow or fastparquet is required for parquet "
"support"
)
if engine not in ["pyarrow", "fastparquet"]:
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
if engine == "pyarrow":
return PyArrowImpl()
elif engine == "fastparquet":
return FastParquetImpl()
class BaseImpl:
api = None # module
@staticmethod
def validate_dataframe(df):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("parquet must have string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def write(self, df, path, compression, **kwargs):
raise AbstractMethodError(self)
def read(self, path, columns=None, **kwargs):
raise AbstractMethodError(self)
class PyArrowImpl(BaseImpl):
def __init__(self):
pyarrow = import_optional_dependency(
"pyarrow", extra="pyarrow is required for parquet support."
)
import pyarrow.parquet
self.api = pyarrow
def write(
self,
df,
path,
compression="snappy",
coerce_timestamps="ms",
index=None,
partition_cols=None,
**kwargs
):
self.validate_dataframe(df)
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
if index is None:
from_pandas_kwargs = {}
else:
from_pandas_kwargs = {"preserve_index": index}
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if partition_cols is not None:
self.api.parquet.write_to_dataset(
table,
path,
compression=compression,
coerce_timestamps=coerce_timestamps,
partition_cols=partition_cols,
**kwargs
)
else:
self.api.parquet.write_table(
table,
path,
compression=compression,
coerce_timestamps=coerce_timestamps,
**kwargs
)
def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
kwargs["use_pandas_metadata"] = True
result = self.api.parquet.read_table(
path, columns=columns, **kwargs
).to_pandas()
if should_close:
try:
path.close()
except: # noqa: flake8
pass
return result
class FastParquetImpl(BaseImpl):
def __init__(self):
# since pandas is a dependency of fastparquet
# we need to import on first use
fastparquet = import_optional_dependency(
"fastparquet", extra="fastparquet is required for parquet support."
)
self.api = fastparquet
def write(
self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs
):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
# Use tobytes() instead.
if "partition_on" in kwargs and partition_cols is not None:
raise ValueError(
"Cannot use both partition_on and "
"partition_cols. Use partition_cols for "
"partitioning data"
)
elif "partition_on" in kwargs:
partition_cols = kwargs.pop("partition_on")
if partition_cols is not None:
kwargs["file_scheme"] = "hive"
if is_s3_url(path):
# path is s3:// so we need to open the s3file in 'wb' mode.
# TODO: Support 'ab'
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
# And pass the opened s3file to the fastparquet internal impl.
kwargs["open_with"] = lambda path, _: path
else:
path, _, _, _ = get_filepath_or_buffer(path)
with catch_warnings(record=True):
self.api.write(
path,
df,
compression=compression,
write_index=index,
partition_on=partition_cols,
**kwargs
)
def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
# When path is s3:// an S3File is returned.
            # We need to retain the original path (str) while also
            # passing the S3File().open function to the fastparquet impl.
s3, _, _, should_close = get_filepath_or_buffer(path)
try:
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
finally:
s3.close()
else:
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)
return parquet_file.to_pandas(columns=columns, **kwargs)
def to_parquet(
df,
path,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
**kwargs
):
"""
Write a DataFrame to the parquet format.
Parameters
----------
path : str
File path or Root Directory path. Will be used as Root Directory path
while writing a partitioned dataset.
.. versionchanged:: 0.24.0
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file. If ``None``, the
engine's default behavior will be used.
        .. versionadded:: 0.24.0
partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
.. versionadded:: 0.24.0
kwargs
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
return impl.write(
df,
path,
compression=compression,
index=index,
partition_cols=partition_cols,
**kwargs
)
def read_parquet(path, engine="auto", columns=None, **kwargs):
"""
Load a parquet object from the file path, returning a DataFrame.
    .. versionadded:: 0.21.0
Parameters
----------
path : str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.parquet``.
If you want to pass in a path object, pandas accepts any
``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function)
or ``StringIO``.
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
columns : list, default=None
If not None, only these columns will be read from the file.
        .. versionadded:: 0.21.1
**kwargs
Any additional kwargs are passed to the engine.
Returns
-------
DataFrame
"""
impl = get_engine(engine)
return impl.read(path, columns=columns, **kwargs)
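# A hedged usage sketch; the file name is hypothetical and a working pyarrow
# or fastparquet installation is assumed:
# >>> import pandas as pd
# >>> df = pd.DataFrame({'a': [1, 2]})
# >>> df.to_parquet('example.parquet')
# >>> pd.read_parquet('example.parquet', columns=['a'])
#    a
# 0  1
# 1  2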

File diff suppressed because it is too large

View File

@@ -0,0 +1,173 @@
""" pickle compat """
from io import BytesIO
import pickle
import warnings
from numpy.lib.format import read_array
from pandas.compat import pickle_compat as pc
from pandas.io.common import _get_handle, _stringify_path
def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to file.
Parameters
----------
obj : any object
Any python object.
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
.. versionadded:: 0.20.0
protocol : int
Int which indicates which protocol should be used by the pickler,
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
values for this parameter depend on the version of Python. For Python
2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
For Python >= 3.4, 4 is a valid value. A negative value for the
protocol parameter is equivalent to setting its value to
HIGHEST_PROTOCOL.
.. [1] https://docs.python.org/3/library/pickle.html
.. versionadded:: 0.21.0
See Also
--------
read_pickle : Load pickled pandas object (or any object) from file.
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
DataFrame.to_sql : Write DataFrame to a SQL database.
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
f, fh = _get_handle(path, "wb", compression=compression, is_text=False)
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
try:
f.write(pickle.dumps(obj, protocol=protocol))
finally:
f.close()
for _f in fh:
_f.close()
def read_pickle(path, compression="infer"):
"""
Load pickled pandas object (or any object) from file.
.. warning::
Loading pickled data received from untrusted sources can be
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
Parameters
----------
path : str
File path where the pickled object will be loaded.
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
Set to None for no decompression.
.. versionadded:: 0.20.0
Returns
-------
unpickled : same type as object stored in file
See Also
--------
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
Series.to_pickle : Pickle (serialize) Series object to file.
read_hdf : Read HDF5 file into a DataFrame.
read_sql : Read SQL query or database table into a DataFrame.
read_parquet : Load a parquet object, returning a DataFrame.
Notes
-----
read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3.
Examples
--------
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> original_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> pd.to_pickle(original_df, "./dummy.pkl")
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
>>> unpickled_df
foo bar
0 0 5
1 1 6
2 2 7
3 3 8
4 4 9
>>> import os
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
f, fh = _get_handle(path, "rb", compression=compression, is_text=False)
    # 1) try standard library pickle
# 2) try pickle_compat (older pandas version) to handle subclass changes
# 3) try pickle_compat with latin1 encoding
try:
with warnings.catch_warnings(record=True):
# We want to silence any warnings about, e.g. moved modules.
warnings.simplefilter("ignore", Warning)
return pickle.load(f)
except Exception: # noqa: E722
try:
return pc.load(f, encoding=None)
except Exception: # noqa: E722
return pc.load(f, encoding="latin1")
finally:
f.close()
for _f in fh:
_f.close()
# compat with sparse pickle / unpickle
def _unpickle_array(bytes):
arr = read_array(BytesIO(bytes))
return arr

File diff suppressed because it is too large

View File

@@ -0,0 +1,37 @@
""" s3 support for remote file interactivity """
from urllib.parse import urlparse as parse_url
from pandas.compat._optional import import_optional_dependency
s3fs = import_optional_dependency(
"s3fs", extra="The s3fs package is required to handle s3 files."
)
def _strip_schema(url):
"""Returns the url without the s3:// part"""
result = parse_url(url, allow_fragments=False)
return result.netloc + result.path
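# Illustrative only: _strip_schema drops the scheme but keeps bucket and key.
# >>> _strip_schema('s3://my-bucket/data/file.csv')
# 'my-bucket/data/file.csv'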
def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
):
from botocore.exceptions import NoCredentialsError
if mode is None:
mode = "rb"
fs = s3fs.S3FileSystem(anon=False)
try:
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
except (FileNotFoundError, NoCredentialsError):
# boto3 has troubles when trying to access a public file
# when credentialed...
# An OSError is raised if you have credentials, but they
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
return filepath_or_buffer, None, compression, True

View File

@@ -0,0 +1 @@
from .sasreader import read_sas # noqa

View File

@@ -0,0 +1,732 @@
"""
Read SAS7BDAT files
Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat
See also:
https://github.com/BioStatMatt/sas7bdat
Partial documentation of the file format:
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
from datetime import datetime
import struct
import numpy as np
from pandas.errors import EmptyDataError
import pandas as pd
from pandas.io.common import BaseIterator, get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const
class _subheader_pointer:
pass
class _column:
pass
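# A hedged usage sketch for this reader; the file name is hypothetical, and
# pd.read_sas dispatches to SAS7BDATReader for .sas7bdat files:
# >>> import pandas as pd
# >>> df = pd.read_sas('example.sas7bdat', format='sas7bdat')
# >>> reader = pd.read_sas('example.sas7bdat', chunksize=1000)
# >>> chunk = next(reader)  # first 1000 rows as a DataFrame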
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(BaseIterator):
"""
Read SAS files in SAS7BDAT format.
Parameters
----------
path_or_buf : path name or buffer
Name of SAS file or file-like object pointing to SAS file
contents.
index : column identifier, defaults to None
Column to use as index.
convert_dates : boolean, defaults to True
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : boolean, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
chunksize : int, defaults to None
Return SAS7BDATReader object for iterations, returns chunks
with given number of lines.
encoding : string, defaults to None
String encoding.
convert_text : bool, defaults to True
If False, text variables are left as raw bytes.
convert_header_text : bool, defaults to True
If False, header text, including column names, are left as raw
bytes.
"""
def __init__(
self,
path_or_buf,
index=None,
convert_dates=True,
blank_missing=True,
chunksize=None,
encoding=None,
convert_text=True,
convert_header_text=True,
):
self.index = index
self.convert_dates = convert_dates
self.blank_missing = blank_missing
self.chunksize = chunksize
self.encoding = encoding
self.convert_text = convert_text
self.convert_header_text = convert_header_text
self.default_encoding = "latin-1"
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_formats = []
self.columns = []
self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._column_types = []
        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(self._path_or_buf, str):
self._path_or_buf = open(self._path_or_buf, "rb")
self.handle = self._path_or_buf
self._get_properties()
self._parse_metadata()
def column_data_lengths(self):
"""Return a numpy int64 array of the column data lengths"""
return np.asarray(self._column_data_lengths, dtype=np.int64)
def column_data_offsets(self):
"""Return a numpy int64 array of the column offsets"""
return np.asarray(self._column_data_offsets, dtype=np.int64)
def column_types(self):
"""Returns a numpy character array of the column types:
s (string) or d (double)"""
return np.asarray(self._column_types, dtype=np.dtype("S1"))
def close(self):
try:
self.handle.close()
except AttributeError:
pass
def _get_properties(self):
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0 : len(const.magic)] != const.magic:
self.close()
raise ValueError("magic number mismatch (not a SAS file?)")
# Get alignment information
align1, align2 = 0, 0
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
if buf == const.u64_byte_checker_value:
align2 = const.align_2_value
self.U64 = True
self._int_length = 8
self._page_bit_offset = const.page_bit_offset_x64
self._subheader_pointer_length = const.subheader_pointer_length_x64
else:
self.U64 = False
self._page_bit_offset = const.page_bit_offset_x86
self._subheader_pointer_length = const.subheader_pointer_length_x86
self._int_length = 4
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
if buf == const.align_1_checker_value:
align1 = const.align_2_value
total_align = align1 + align2
# Get endianness information
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
if buf == b"\x01":
self.byte_order = "<"
else:
self.byte_order = ">"
# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
if buf in const.encoding_names:
self.file_encoding = const.encoding_names[buf]
else:
self.file_encoding = "unknown (code={name!s})".format(name=buf)
# Get platform information
buf = self._read_bytes(const.platform_offset, const.platform_length)
if buf == b"1":
self.platform = "unix"
elif buf == b"2":
self.platform = "windows"
else:
self.platform = "unknown"
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
self.name = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.name = self.name.decode(self.encoding or self.default_encoding)
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
self.file_type = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.file_type = self.file_type.decode(
self.encoding or self.default_encoding
)
# Timestamp is epoch 01/01/1960
epoch = datetime(1960, 1, 1)
x = self._read_float(
const.date_created_offset + align1, const.date_created_length
)
self.date_created = epoch + pd.to_timedelta(x, unit="s")
x = self._read_float(
const.date_modified_offset + align1, const.date_modified_length
)
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
self.header_length = self._read_int(
const.header_size_offset + align1, const.header_size_length
)
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
if len(self._cached_page) != self.header_length:
self.close()
raise ValueError("The SAS7BDAT file appears to be truncated.")
self._page_length = self._read_int(
const.page_size_offset + align1, const.page_size_length
)
self._page_count = self._read_int(
const.page_count_offset + align1, const.page_count_length
)
buf = self._read_bytes(
const.sas_release_offset + total_align, const.sas_release_length
)
self.sas_release = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.sas_release = self.sas_release.decode(
self.encoding or self.default_encoding
)
buf = self._read_bytes(
const.sas_server_type_offset + total_align, const.sas_server_type_length
)
self.server_type = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.server_type = self.server_type.decode(
self.encoding or self.default_encoding
)
buf = self._read_bytes(
const.os_version_number_offset + total_align, const.os_version_number_length
)
self.os_version = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.os_version = self.os_version.decode(
self.encoding or self.default_encoding
)
buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
buf = buf.rstrip(b"\x00 ")
if len(buf) > 0:
self.os_name = buf.decode(self.encoding or self.default_encoding)
else:
buf = self._read_bytes(
const.os_maker_offset + total_align, const.os_maker_length
)
self.os_name = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.os_name = self.os_name.decode(
self.encoding or self.default_encoding
)
def __next__(self):
da = self.read(nrows=self.chunksize or 1)
if da is None:
raise StopIteration
return da
# Read a single float of the given width (4 or 8).
def _read_float(self, offset, width):
if width not in (4, 8):
self.close()
raise ValueError("invalid float width")
buf = self._read_bytes(offset, width)
fd = "f" if width == 4 else "d"
return struct.unpack(self.byte_order + fd, buf)[0]
# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset, width):
if width not in (1, 2, 4, 8):
self.close()
raise ValueError("invalid int width")
buf = self._read_bytes(offset, width)
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv
def _read_bytes(self, offset, length):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
if len(buf) < length:
self.close()
msg = "Unable to read {:d} bytes from file position {:d}."
raise ValueError(msg.format(length, offset))
return buf
else:
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset : offset + length]
def _parse_metadata(self):
done = False
while not done:
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
self.close()
raise ValueError("Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (
is_data_page
or is_mix_page
or self._current_page_data_subheader_pointers != []
)
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = self._read_int(tx, const.page_type_length)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = self._read_int(
tx, const.subheader_count_length
)
def _process_page_metadata(self):
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
pointer = self._process_subheader_pointers(
const.subheader_pointers_offset + bit_offset, i
)
if pointer.length == 0:
continue
if pointer.compression == const.truncated_subheader_id:
continue
subheader_signature = self._read_subheader_signature(pointer.offset)
subheader_index = self._get_subheader_index(
subheader_signature, pointer.compression, pointer.ptype
)
self._process_subheader(subheader_index, pointer)
def _get_subheader_index(self, signature, compression, ptype):
index = const.subheader_signature_to_index.get(signature)
if index is None:
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
f2 = ptype == const.compressed_subheader_type
if (self.compression != "") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
self.close()
raise ValueError("Unknown subheader signature")
return index
def _process_subheader_pointers(self, offset, subheader_pointer_index):
subheader_pointer_length = self._subheader_pointer_length
total_offset = offset + subheader_pointer_length * subheader_pointer_index
subheader_offset = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_length = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
subheader_compression = self._read_int(total_offset, 1)
total_offset += 1
subheader_type = self._read_int(total_offset, 1)
x = _subheader_pointer()
x.offset = subheader_offset
x.length = subheader_length
x.compression = subheader_compression
x.ptype = subheader_type
return x
def _read_subheader_signature(self, offset):
subheader_signature = self._read_bytes(offset, self._int_length)
return subheader_signature
def _process_subheader(self, subheader_index, pointer):
offset = pointer.offset
length = pointer.length
if subheader_index == const.SASIndex.row_size_index:
processor = self._process_rowsize_subheader
elif subheader_index == const.SASIndex.column_size_index:
processor = self._process_columnsize_subheader
elif subheader_index == const.SASIndex.column_text_index:
processor = self._process_columntext_subheader
elif subheader_index == const.SASIndex.column_name_index:
processor = self._process_columnname_subheader
elif subheader_index == const.SASIndex.column_attributes_index:
processor = self._process_columnattributes_subheader
elif subheader_index == const.SASIndex.format_and_label_index:
processor = self._process_format_subheader
elif subheader_index == const.SASIndex.column_list_index:
processor = self._process_columnlist_subheader
elif subheader_index == const.SASIndex.subheader_counts_index:
processor = self._process_subheader_counts
elif subheader_index == const.SASIndex.data_subheader_index:
self._current_page_data_subheader_pointers.append(pointer)
return
else:
raise ValueError("unknown subheader index")
processor(offset, length)
def _process_rowsize_subheader(self, offset, length):
int_len = self._int_length
lcs_offset = offset
lcp_offset = offset
if self.U64:
lcs_offset += 682
lcp_offset += 706
else:
lcs_offset += 354
lcp_offset += 378
self.row_length = self._read_int(
offset + const.row_length_offset_multiplier * int_len, int_len
)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len
)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len
)
self.col_count_p2 = self._read_int(
offset + const.col_count_p2_multiplier * int_len, int_len
)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
self._lcp = self._read_int(lcp_offset, 2)
def _process_columnsize_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
if self.col_count_p1 + self.col_count_p2 != self.column_count:
print(
"Warning: column count mismatch ({p1} + {p2} != "
"{column_count})\n".format(
p1=self.col_count_p1,
p2=self.col_count_p2,
column_count=self.column_count,
)
)
# Unknown purpose
def _process_subheader_counts(self, offset, length):
pass
def _process_columntext_subheader(self, offset, length):
offset += self._int_length
text_block_size = self._read_int(offset, const.text_block_size_length)
buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
cname = cname_raw
if self.convert_header_text:
cname = cname.decode(self.encoding or self.default_encoding)
self.column_names_strings.append(cname)
if len(self.column_names_strings) == 1:
compression_literal = ""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
self.compression = compression_literal
offset -= self._int_length
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == "":
self._lcs = 0
offset1 = offset + 32
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0 : self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0 : self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
self.creator_proc = buf[0 : self._lcp]
if self.convert_header_text:
if hasattr(self, "creator_proc"):
self.creator_proc = self.creator_proc.decode(
self.encoding or self.default_encoding
)
def _process_columnname_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
text_subheader = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_text_subheader_offset
)
col_name_offset = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_offset_offset
)
col_name_length = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_length_offset
)
idx = self._read_int(
text_subheader, const.column_name_text_subheader_length
)
col_offset = self._read_int(
col_name_offset, const.column_name_offset_length
)
col_len = self._read_int(col_name_length, const.column_name_length_length)
name_str = self.column_names_strings[idx]
self.column_names.append(name_str[col_offset : col_offset + col_len])
def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
for i in range(column_attributes_vectors_count):
col_data_offset = (
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
)
col_data_len = (
offset
+ 2 * int_len
+ const.column_data_length_offset
+ i * (int_len + 8)
)
col_types = (
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
)
x = self._read_int(col_data_offset, int_len)
self._column_data_offsets.append(x)
x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths.append(x)
x = self._read_int(col_types, const.column_type_length)
self._column_types.append(b"d" if x == 1 else b"s")
def _process_columnlist_subheader(self, offset, length):
# unknown purpose
pass
def _process_format_subheader(self, offset, length):
int_len = self._int_length
text_subheader_format = (
offset + const.column_format_text_subheader_index_offset + 3 * int_len
)
col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
col_format_len = offset + const.column_format_length_offset + 3 * int_len
text_subheader_label = (
offset + const.column_label_text_subheader_index_offset + 3 * int_len
)
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
col_label_len = offset + const.column_label_length_offset + 3 * int_len
x = self._read_int(
text_subheader_format, const.column_format_text_subheader_index_length
)
format_idx = min(x, len(self.column_names_strings) - 1)
format_start = self._read_int(
col_format_offset, const.column_format_offset_length
)
format_len = self._read_int(col_format_len, const.column_format_length_length)
label_idx = self._read_int(
text_subheader_label, const.column_label_text_subheader_index_length
)
label_idx = min(label_idx, len(self.column_names_strings) - 1)
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
label_len = self._read_int(col_label_len, const.column_label_length_length)
label_names = self.column_names_strings[label_idx]
column_label = label_names[label_start : label_start + label_len]
format_names = self.column_names_strings[format_idx]
column_format = format_names[format_start : format_start + format_len]
current_column_number = len(self.columns)
col = _column()
col.col_id = current_column_number
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self._column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]
self.column_formats.append(column_format)
self.columns.append(col)
def read(self, nrows=None):
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")
if self._current_row_in_file_index >= self.row_count:
return None
m = self.row_count - self._current_row_in_file_index
if nrows > m:
nrows = m
nd = self._column_types.count(b"d")
ns = self._column_types.count(b"s")
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
self._current_row_in_chunk_index = 0
p = Parser(self)
p.read(nrows)
rslt = self._chunk_to_dataframe()
if self.index is not None:
rslt = rslt.set_index(self.index)
return rslt
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
self.close()
msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)"
raise ValueError(msg.format(len(self._cached_page), self._page_length))
self._read_page_header()
page_type = self._current_page_type
if page_type == const.page_meta_type:
self._process_page_metadata()
is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
return self._read_next_page()
return False
def _chunk_to_dataframe(self):
n = self._current_row_in_chunk_index
m = self._current_row_in_file_index
ix = range(m - n, m)
rslt = pd.DataFrame(index=ix)
js, jb = 0, 0
for j in range(self.column_count):
name = self.column_names[j]
if self._column_types[j] == b"d":
rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
unit = "d"
elif self.column_formats[j] in const.sas_datetime_formats:
unit = "s"
if unit:
rslt[name] = pd.to_datetime(
rslt[name], unit=unit, origin="1960-01-01"
)
jb += 1
elif self._column_types[j] == b"s":
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
self.encoding or self.default_encoding
)
if self.blank_missing:
ii = rslt[name].str.len() == 0
rslt.loc[ii, name] = np.nan
js += 1
else:
self.close()
raise ValueError(
"unknown column type {type}".format(type=self._column_types[j])
)
return rslt
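# Editorial usage sketch (not part of the original source; the file name is
# hypothetical). The reader above is normally driven through pandas.read_sas,
# but when constructed with a chunksize it can also be iterated directly:
#
#   rdr = SAS7BDATReader("example.sas7bdat", chunksize=10000)
#   for chunk in rdr:        # each chunk is a DataFrame built by _chunk_to_dataframe
#       process(chunk)       # hypothetical callback
#   rdr.close()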

View File

@@ -0,0 +1,253 @@
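# Editorial note: the 32-byte constant below is the magic number that appears
# at the start (offset 0) of a SAS7BDAT file.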
magic = (
b"\x00\x00\x00\x00\x00\x00\x00\x00"
+ b"\x00\x00\x00\x00\xc2\xea\x81\x60"
+ b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
+ b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
)
align_1_checker_value = b"3"
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b"3"
align_2_offset = 35
align_2_length = 1
align_2_value = 4
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
rle_compression = b"SASYZCRL"
rdc_compression = b"SASYZCR2"
compression_literals = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
encoding_names = {
29: "latin1",
20: "utf-8",
33: "cyrillic",
60: "wlatin2",
61: "wcyrillic",
62: "wlatin1",
90: "ebcdic870",
}
class SASIndex:
row_size_index = 0
column_size_index = 1
subheader_counts_index = 2
column_text_index = 3
column_name_index = 4
column_attributes_index = 5
format_and_label_index = 6
column_list_index = 7
data_subheader_index = 8
subheader_signature_to_index = {
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
}
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats = (
"DATE",
"DAY",
"DDMMYY",
"DOWNAME",
"JULDAY",
"JULIAN",
"MMDDYY",
"MMYY",
"MMYYC",
"MMYYD",
"MMYYP",
"MMYYS",
"MMYYN",
"MONNAME",
"MONTH",
"MONYY",
"QTR",
"QTRR",
"NENGO",
"WEEKDATE",
"WEEKDATX",
"WEEKDAY",
"WEEKV",
"WORDDATE",
"WORDDATX",
"YEAR",
"YYMM",
"YYMMC",
"YYMMD",
"YYMMP",
"YYMMS",
"YYMMN",
"YYMON",
"YYMMDD",
"YYQ",
"YYQC",
"YYQD",
"YYQP",
"YYQS",
"YYQN",
"YYQR",
"YYQRC",
"YYQRD",
"YYQRP",
"YYQRS",
"YYQRN",
"YYMMDDP",
"YYMMDDC",
"E8601DA",
"YYMMDDN",
"MMDDYYC",
"MMDDYYS",
"MMDDYYD",
"YYMMDDS",
"B8601DA",
"DDMMYYN",
"YYMMDDD",
"DDMMYYB",
"DDMMYYP",
"MMDDYYP",
"YYMMDDB",
"MMDDYYN",
"DDMMYYC",
"DDMMYYD",
"DDMMYYS",
"MINGUO",
)
sas_datetime_formats = (
"DATETIME",
"DTWKDATX",
"B8601DN",
"B8601DT",
"B8601DX",
"B8601DZ",
"B8601LX",
"E8601DN",
"E8601DT",
"E8601DX",
"E8601DZ",
"E8601LX",
"DATEAMPM",
"DTDATE",
"DTMONYY",
"DTMONYY",
"DTWKDATX",
"DTYEAR",
"TOD",
"MDYAMPM",
)

View File

@@ -0,0 +1,507 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.
Based on code from Jack Cushman (github.com/jcushman/xport).
The file format is defined here:
https://support.sas.com/techsup/technote/ts140.pdf
"""
from datetime import datetime
from io import BytesIO
import struct
import warnings
import numpy as np
from pandas.util._decorators import Appender
import pandas as pd
from pandas.io.common import BaseIterator, get_filepath_or_buffer
_correct_line1 = (
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_correct_header1 = (
"HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!" "000000000000000001600000000"
)
_correct_header2 = (
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_correct_obs_header = (
"HEADER RECORD*******OBS     HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_fieldkeys = [
"ntype",
"nhfun",
"field_length",
"nvar0",
"name",
"label",
"nform",
"nfl",
"num_decimals",
"nfj",
"nfill",
"niform",
"nifl",
"nifd",
"npos",
"_",
]
_base_params_doc = """\
Parameters
----------
filepath_or_buffer : string or file-like object
Path to SAS file or object implementing binary read method."""
_params2_doc = """\
index : identifier of index column
Identifier of column that should be used as index of the DataFrame.
encoding : string
Encoding for text data.
chunksize : int
Read file `chunksize` lines at a time, returns iterator."""
_format_params_doc = """\
format : string
File format, only `xport` is currently supported."""
_iterator_doc = """\
iterator : boolean, default False
Return XportReader object for reading file incrementally."""
_read_sas_doc = """Read a SAS file into a DataFrame.
%(_base_params_doc)s
%(_format_params_doc)s
%(_params2_doc)s
%(_iterator_doc)s
Returns
-------
DataFrame or XportReader
Examples
--------
Read a SAS Xport file:
>>> df = pd.read_sas('filename.XPT')
Read a Xport file in 10,000 line chunks:
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>> do_something(chunk)
""" % {
"_base_params_doc": _base_params_doc,
"_format_params_doc": _format_params_doc,
"_params2_doc": _params2_doc,
"_iterator_doc": _iterator_doc,
}
_xport_reader_doc = """\
Class for reading SAS Xport files.
%(_base_params_doc)s
%(_params2_doc)s
Attributes
----------
member_info : list
Contains information about the file
fields : list
Contains information about the variables in the file
""" % {
"_base_params_doc": _base_params_doc,
"_params2_doc": _params2_doc,
}
_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.
Parameters
----------
nrows : int
Number of rows to read from data file; if None, read whole
file.
Returns
-------
A DataFrame.
"""
def _parse_date(datestr):
""" Given a date in xport format, return Python date. """
try:
# e.g. "16FEB11:10:07:55"
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
except ValueError:
return pd.NaT
def _split_line(s, parts):
"""
Parameters
----------
s: string
Fixed-length string to split
parts: list of (name, length) pairs
Used to break up string, name '_' will be filtered from output.
Returns
-------
Dict of name:contents of string at given location.
"""
out = {}
start = 0
for name, length in parts:
out[name] = s[start : start + length].strip()
start += length
del out["_"]
return out
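# Illustrative example (editorial, not in the original source); the field
# widths below are hypothetical but follow the fixed-width layout that
# _split_line expects, including the discarded "_" filler field:
#
#   >>> _split_line("SAS     9.4     junk", [["prefix", 8], ["version", 8], ["_", 4]])
#   {'prefix': 'SAS', 'version': '9.4'}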
def _handle_truncated_float_vec(vec, nbytes):
# This feature is not well documented, but some SAS XPORT files
# have 2-7 byte "truncated" floats. To read these truncated
# floats, pad them with zeros on the right to make 8 byte floats.
#
# References:
# https://github.com/jcushman/xport/pull/3
# The R "foreign" library
if nbytes != 8:
vec1 = np.zeros(len(vec), np.dtype("S8"))
dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes))
vec2 = vec1.view(dtype=dtype)
vec2["f0"] = vec
return vec2
return vec
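# Illustrative sketch (editorial, not in the original source): a 2-byte
# truncated float is right-padded with six zero bytes so that it can later be
# reinterpreted as a full 8-byte IBM float by _parse_float_vec:
#
#   >>> vec = np.array([b"\x41\x10"], dtype="S2")
#   >>> _handle_truncated_float_vec(vec, 2).tobytes()
#   b'A\x10\x00\x00\x00\x00\x00\x00'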
def _parse_float_vec(vec):
"""
Parse a vector of float values representing IBM 8 byte floats into
native 8 byte floats.
"""
dtype = np.dtype(">u4,>u4")
vec1 = vec.view(dtype=dtype)
xport1 = vec1["f0"]
xport2 = vec1["f1"]
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
ieee1 = xport1 & 0x00FFFFFF
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
# places. This will tell us how to adjust the ibm exponent to be a
# power of 2 ieee exponent and how to shift the fraction bits to
# restore the correct magnitude.
shift = np.zeros(len(vec), dtype=np.uint8)
shift[np.where(xport1 & 0x00200000)] = 1
shift[np.where(xport1 & 0x00400000)] = 2
shift[np.where(xport1 & 0x00800000)] = 3
# shift the ieee number down the correct number of places then
# set the second half of the ieee number to be the second half
# of the ibm number shifted appropriately, ored with the bits
# from the first half that would have been shifted in if we
# could shift a double. All we are worried about are the low
# order 3 bits of the first half since we're only shifting by
# 1, 2, or 3.
ieee1 >>= shift
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
ieee1 &= 0xFFEFFFFF
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
# since during conversion to ibm format the exponent is
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
xport1 & 0x80000000
)
ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
ieee["f0"] = ieee1
ieee["f1"] = ieee2
ieee = ieee.view(dtype=">f8")
ieee = ieee.astype("f8")
return ieee
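# Editorial example (not part of the original source): the IBM hex
# representation of 1.0 is 0x4110000000000000 (exponent byte 0x41 gives 16**1,
# fraction 0x10... gives 1/16), which the routine above maps to the IEEE
# double 1.0:
#
#   >>> ibm_one = np.frombuffer(b"\x41\x10\x00\x00\x00\x00\x00\x00", dtype="S8")
#   >>> _parse_float_vec(ibm_one)
#   array([1.])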
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc
def __init__(
self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None
):
self._encoding = encoding
self._lines_read = 0
self._index = index
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
(
filepath_or_buffer,
encoding,
compression,
should_close,
) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding)
if isinstance(filepath_or_buffer, (str, bytes)):
self.filepath_or_buffer = open(filepath_or_buffer, "rb")
else:
# Copy to BytesIO, and ensure no encoding
contents = filepath_or_buffer.read()
try:
contents = contents.encode(self._encoding)
except UnicodeEncodeError:
pass
self.filepath_or_buffer = BytesIO(contents)
self._read_header()
def close(self):
self.filepath_or_buffer.close()
def _get_row(self):
return self.filepath_or_buffer.read(80).decode()
def _read_header(self):
self.filepath_or_buffer.seek(0)
# read file header
line1 = self._get_row()
if line1 != _correct_line1:
self.close()
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
file_info = _split_line(line2, fif)
if file_info["prefix"] != "SAS SAS SASLIB":
self.close()
raise ValueError("Header record has invalid prefix.")
file_info["created"] = _parse_date(file_info["created"])
self.file_info = file_info
line3 = self._get_row()
file_info["modified"] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
headflag2 = header2 == _correct_header2
if not (headflag1 and headflag2):
self.close()
raise ValueError("Member header not found")
# usually 140, could be 135
fieldnamelength = int(header1[-5:-2])
# member info
mem = [
["prefix", 8],
["set_name", 8],
["sasdata", 8],
["version", 8],
["OS", 8],
["_", 24],
["created", 16],
]
member_info = _split_line(self._get_row(), mem)
mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
member_info.update(_split_line(self._get_row(), mem))
member_info["modified"] = _parse_date(member_info["modified"])
member_info["created"] = _parse_date(member_info["created"])
self.member_info = member_info
# read field names
types = {1: "numeric", 2: "char"}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
field, fielddata = (
fielddata[:fieldnamelength],
fielddata[fieldnamelength:],
)
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field)
field = dict(zip(_fieldkeys, fieldstruct))
del field["_"]
field["ntype"] = types[field["ntype"]]
fl = field["field_length"]
if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
self.close()
msg = "Floating field width {0} is not between 2 and 8."
raise TypeError(msg.format(fl))
for k, v in field.items():
try:
field[k] = v.strip()
except AttributeError:
pass
obs_length += field["field_length"]
fields += [field]
header = self._get_row()
if not header == _correct_obs_header:
self.close()
raise ValueError("Observation header not found.")
self.fields = fields
self.record_length = obs_length
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
self.columns = [x["name"].decode() for x in self.fields]
# Setup the dtype.
dtypel = [
("s" + str(i), "S" + str(field["field_length"]))
for i, field in enumerate(self.fields)
]
dtype = np.dtype(dtypel)
self._dtype = dtype
def __next__(self):
return self.read(nrows=self._chunksize or 1)
def _record_count(self):
"""
Get number of records in file.
This is maybe suboptimal because we have to seek to the end of
the file.
Side effect: returns file position to record_start.
"""
self.filepath_or_buffer.seek(0, 2)
total_records_length = self.filepath_or_buffer.tell() - self.record_start
if total_records_length % 80 != 0:
warnings.warn("xport file may be corrupted")
if self.record_length > 80:
self.filepath_or_buffer.seek(self.record_start)
return total_records_length // self.record_length
self.filepath_or_buffer.seek(-80, 2)
last_card = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card, dtype=np.uint64)
# 8 bytes of ASCII blanks: 0x2020202020202020 == 2314885530818453536
ix = np.flatnonzero(last_card == 2314885530818453536)
if len(ix) == 0:
tail_pad = 0
else:
tail_pad = 8 * len(ix)
self.filepath_or_buffer.seek(self.record_start)
return (total_records_length - tail_pad) // self.record_length
def get_chunk(self, size=None):
"""
Reads lines from Xport file and returns as dataframe
Parameters
----------
size : int, defaults to None
Number of lines to read. If None, reads whole file.
Returns
-------
DataFrame
"""
if size is None:
size = self._chunksize
return self.read(nrows=size)
def _missing_double(self, vec):
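# Editorial note: SAS missing numeric values (".", "._", ".A" through ".Z")
# are stored as IBM floats whose first byte is 0x2E, 0x5F, or 0x41-0x5A and
# whose remaining seven bytes are all zero; the checks below test exactly that.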
v = vec.view(dtype="u1,u1,u2,u4")
miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
miss1 = (
((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
| (v["f0"] == 0x5F)
| (v["f0"] == 0x2E)
)
miss &= miss1
return miss
@Appender(_read_method_doc)
def read(self, nrows=None):
if nrows is None:
nrows = self.nobs
read_lines = min(nrows, self.nobs - self._lines_read)
read_len = read_lines * self.record_length
if read_len <= 0:
self.close()
raise StopIteration
raw = self.filepath_or_buffer.read(read_len)
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
df = pd.DataFrame(index=range(read_lines))
for j, x in enumerate(self.columns):
vec = data["s%d" % j]
ntype = self.fields[j]["ntype"]
if ntype == "numeric":
vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
elif self.fields[j]["ntype"] == "char":
v = [y.rstrip() for y in vec]
if self._encoding is not None:
v = [y.decode(self._encoding) for y in v]
df[x] = v
if self._index is None:
df.index = range(self._lines_read, self._lines_read + read_lines)
else:
df = df.set_index(self._index)
self._lines_read += read_lines
return df

View File

@@ -0,0 +1,86 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas.io.common import _stringify_path
def read_sas(
filepath_or_buffer,
format=None,
index=None,
encoding=None,
chunksize=None,
iterator=False,
):
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
Parameters
----------
filepath_or_buffer : str, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.sas``.
If you want to pass in a path object, pandas accepts any
``os.PathLike``.
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function)
or ``StringIO``.
format : string {'xport', 'sas7bdat'} or None
If None, file format is inferred from file extension. If 'xport' or
'sas7bdat', uses the corresponding format.
index : identifier of index column, defaults to None
Identifier of column that should be used as index of the DataFrame.
encoding : string, default is None
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader
"""
if format is None:
buffer_error_msg = (
"If this is a buffer object rather "
"than a string name, you must specify "
"a format string"
)
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
elif fname.endswith(".sas7bdat"):
format = "sas7bdat"
else:
raise ValueError("unable to infer format of SAS file")
if format.lower() == "xport":
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(
filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
)
elif format.lower() == "sas7bdat":
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(
filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
)
else:
raise ValueError("unknown SAS format")
if iterator or chunksize:
return reader
data = reader.read()
reader.close()
return data
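# Editorial usage sketch (file names are hypothetical):
#
#   df = read_sas("airline.sas7bdat", encoding="latin-1")
#
#   # or stream a large XPORT file in chunks:
#   for chunk in read_sas("large.xpt", format="xport", chunksize=100000):
#       process(chunk)  # hypothetical callback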

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Optional, Sequence, Union
from pandas.compat._optional import import_optional_dependency
from pandas.api.types import is_list_like
from pandas.core.api import DataFrame
def read_spss(
path: Union[str, Path],
usecols: Optional[Sequence[str]] = None,
convert_categoricals: bool = True,
) -> DataFrame:
"""
Load an SPSS file from the file path, returning a DataFrame.
.. versionadded:: 0.25.0
Parameters
----------
path : string or Path
File path
usecols : list-like, optional
Return a subset of the columns. If None, return all columns.
convert_categoricals : bool, default is True
Convert categorical columns into pd.Categorical.
Returns
-------
DataFrame
"""
pyreadstat = import_optional_dependency("pyreadstat")
if usecols is not None:
if not is_list_like(usecols):
raise TypeError("usecols must be list-like.")
else:
usecols = list(usecols) # pyreadstat requires a list
df, _ = pyreadstat.read_sav(
path, usecols=usecols, apply_value_formats=convert_categoricals
)
return df
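# Editorial usage sketch (file and column names are hypothetical):
#
#   df = read_spss("survey.sav", usecols=["age", "income"],
#                  convert_categoricals=True)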

File diff suppressed because it is too large

File diff suppressed because it is too large