8th day of Python challenges: 111-117
venv/lib/python3.6/site-packages/pandas/io/api.py (21 lines, Normal file)
@@ -0,0 +1,21 @@
"""
Data IO api
"""

# flake8: noqa

from pandas.io.clipboards import read_clipboard
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
from pandas.io.feather_format import read_feather
from pandas.io.gbq import read_gbq
from pandas.io.html import read_html
from pandas.io.json import read_json
from pandas.io.packers import read_msgpack, to_msgpack
from pandas.io.parquet import read_parquet
from pandas.io.parsers import read_csv, read_fwf, read_table
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.pytables import HDFStore, read_hdf
from pandas.io.sas import read_sas
from pandas.io.spss import read_spss
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
from pandas.io.stata import read_stata
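
For orientation, a minimal sketch of how these re-exported readers are normally reached through the top-level namespace (the file names below are hypothetical):

import pandas as pd

# pandas.io.api only re-exports the readers; user code usually calls them
# from the top-level namespace (hypothetical file paths):
df = pd.read_csv("data.csv")        # pandas.io.parsers.read_csv
sheet = pd.read_excel("book.xlsx")  # pandas.io.excel.read_excel
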
venv/lib/python3.6/site-packages/pandas/io/clipboard/__init__.py (126 lines, Normal file)
@@ -0,0 +1,126 @@
"""
Pyperclip

A cross-platform clipboard module for Python. (only handles plain text for now)
By Al Sweigart al@inventwithpython.com
BSD License

Usage:
  import pyperclip
  pyperclip.copy('The text to be copied to the clipboard.')
  spam = pyperclip.paste()

  if not pyperclip.copy:
    print("Copy functionality unavailable!")

On Windows, no additional modules are needed.
On Mac, the module uses pbcopy and pbpaste, which should come with the os.
On Linux, install xclip or xsel via package manager. For example, in Debian:
sudo apt-get install xclip

Otherwise on Linux, you will need the qtpy or PyQt modules installed.
qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2

This module does not work with PyGObject yet.
"""
__version__ = "1.5.27"

import os
import platform
import subprocess

from .clipboards import (
    init_klipper_clipboard,
    init_no_clipboard,
    init_osx_clipboard,
    init_qt_clipboard,
    init_xclip_clipboard,
    init_xsel_clipboard,
)
from .windows import init_windows_clipboard

# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
# Thus, we need to detect the presence of $DISPLAY manually
# and not load qtpy if it is absent.
HAS_DISPLAY = os.getenv("DISPLAY", False)
CHECK_CMD = "where" if platform.system() == "Windows" else "which"


def _executable_exists(name):
    return (
        subprocess.call(
            [CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        == 0
    )


def determine_clipboard():
    # Determine the OS/platform and set
    # the copy() and paste() functions accordingly.
    if "cygwin" in platform.system().lower():
        # FIXME: pyperclip currently does not support Cygwin,
        # see https://github.com/asweigart/pyperclip/issues/55
        pass
    elif os.name == "nt" or platform.system() == "Windows":
        return init_windows_clipboard()
    if os.name == "mac" or platform.system() == "Darwin":
        return init_osx_clipboard()
    if HAS_DISPLAY:
        # Determine which command/module is installed, if any.
        try:
            # qtpy is a small abstraction layer that lets you write
            # applications using a single api call to either PyQt or PySide
            # https://pypi.org/project/QtPy
            import qtpy  # noqa
        except ImportError:
            # If qtpy isn't installed, fall back on importing PyQt5, or PyQt4
            try:
                import PyQt5  # noqa
            except ImportError:
                try:
                    import PyQt4  # noqa
                except ImportError:
                    pass  # fail fast for all non-ImportError exceptions.
                else:
                    return init_qt_clipboard()
            else:
                return init_qt_clipboard()
            pass
        else:
            return init_qt_clipboard()

        if _executable_exists("xclip"):
            return init_xclip_clipboard()
        if _executable_exists("xsel"):
            return init_xsel_clipboard()
        if _executable_exists("klipper") and _executable_exists("qdbus"):
            return init_klipper_clipboard()

    return init_no_clipboard()


def set_clipboard(clipboard):
    global copy, paste

    clipboard_types = {
        "osx": init_osx_clipboard,
        "qt": init_qt_clipboard,
        "xclip": init_xclip_clipboard,
        "xsel": init_xsel_clipboard,
        "klipper": init_klipper_clipboard,
        "windows": init_windows_clipboard,
        "no": init_no_clipboard,
    }

    copy, paste = clipboard_types[clipboard]()


copy, paste = determine_clipboard()

__all__ = ["copy", "paste"]


# pandas aliases
clipboard_get = paste
clipboard_set = copy
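
A short usage sketch of the backend selection above, via the pandas aliases defined at the bottom of the module (forcing a specific backend with set_clipboard assumes that backend's tool or Qt bindings are installed):

from pandas.io.clipboard import clipboard_get, clipboard_set, set_clipboard

# copy/paste were bound by determine_clipboard() at import time
clipboard_set("copied through whichever backend was detected")
print(clipboard_get())

# Force a named backend instead of auto-detection (assumes xclip exists):
# set_clipboard("xclip")
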
@@ -0,0 +1,129 @@
import subprocess

from .exceptions import PyperclipException

EXCEPT_MSG = """
Pyperclip could not find a copy/paste mechanism for your system.
For more information, please visit https://pyperclip.readthedocs.org """


def init_osx_clipboard():
    def copy_osx(text):
        p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True)
        p.communicate(input=text.encode("utf-8"))

    def paste_osx():
        p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True)
        stdout, stderr = p.communicate()
        return stdout.decode("utf-8")

    return copy_osx, paste_osx


def init_qt_clipboard():
    # $DISPLAY should exist

    # Try to import from qtpy, but if that fails try PyQt5 then PyQt4
    try:
        from qtpy.QtWidgets import QApplication
    except ImportError:
        try:
            from PyQt5.QtWidgets import QApplication
        except ImportError:
            from PyQt4.QtGui import QApplication

    app = QApplication.instance()
    if app is None:
        app = QApplication([])

    def copy_qt(text):
        cb = app.clipboard()
        cb.setText(text)

    def paste_qt():
        cb = app.clipboard()
        return str(cb.text())

    return copy_qt, paste_qt


def init_xclip_clipboard():
    def copy_xclip(text):
        p = subprocess.Popen(
            ["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True
        )
        p.communicate(input=text.encode("utf-8"))

    def paste_xclip():
        p = subprocess.Popen(
            ["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True
        )
        stdout, stderr = p.communicate()
        return stdout.decode("utf-8")

    return copy_xclip, paste_xclip


def init_xsel_clipboard():
    def copy_xsel(text):
        p = subprocess.Popen(
            ["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True
        )
        p.communicate(input=text.encode("utf-8"))

    def paste_xsel():
        p = subprocess.Popen(
            ["xsel", "-b", "-o"], stdout=subprocess.PIPE, close_fds=True
        )
        stdout, stderr = p.communicate()
        return stdout.decode("utf-8")

    return copy_xsel, paste_xsel


def init_klipper_clipboard():
    def copy_klipper(text):
        p = subprocess.Popen(
            [
                "qdbus",
                "org.kde.klipper",
                "/klipper",
                "setClipboardContents",
                text.encode("utf-8"),
            ],
            stdin=subprocess.PIPE,
            close_fds=True,
        )
        p.communicate(input=None)

    def paste_klipper():
        p = subprocess.Popen(
            ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
            stdout=subprocess.PIPE,
            close_fds=True,
        )
        stdout, stderr = p.communicate()

        # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
        # TODO: https://github.com/asweigart/pyperclip/issues/43
        clipboardContents = stdout.decode("utf-8")
        # even if blank, Klipper will append a newline at the end
        assert len(clipboardContents) > 0
        # make sure that newline is there
        assert clipboardContents.endswith("\n")
        if clipboardContents.endswith("\n"):
            clipboardContents = clipboardContents[:-1]
        return clipboardContents

    return copy_klipper, paste_klipper


def init_no_clipboard():
    class ClipboardUnavailable:
        def __call__(self, *args, **kwargs):
            raise PyperclipException(EXCEPT_MSG)

        def __bool__(self):
            return False

    return ClipboardUnavailable(), ClipboardUnavailable()
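
Each init_* factory above returns a (copy, paste) pair of callables rather than doing any work at import time; a minimal sketch of consuming one directly, assuming xclip is actually installed:

copy, paste = init_xclip_clipboard()
copy("hello from the xclip backend")
assert paste() == "hello from the xclip backend"
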
@@ -0,0 +1,11 @@
import ctypes


class PyperclipException(RuntimeError):
    pass


class PyperclipWindowsException(PyperclipException):
    def __init__(self, message):
        message += " ({err})".format(err=ctypes.WinError())
        super().__init__(message)
venv/lib/python3.6/site-packages/pandas/io/clipboard/windows.py (184 lines, Normal file)
@@ -0,0 +1,184 @@
"""
This module implements clipboard handling on Windows using ctypes.
"""
import contextlib
import ctypes
from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
import time

from .exceptions import PyperclipWindowsException


class CheckedCall:
    def __init__(self, f):
        super().__setattr__("f", f)

    def __call__(self, *args):
        ret = self.f(*args)
        if not ret and get_errno():
            raise PyperclipWindowsException("Error calling " + self.f.__name__)
        return ret

    def __setattr__(self, key, value):
        setattr(self.f, key, value)


def init_windows_clipboard():
    from ctypes.wintypes import (
        HGLOBAL,
        LPVOID,
        DWORD,
        LPCSTR,
        INT,
        HWND,
        HINSTANCE,
        HMENU,
        BOOL,
        UINT,
        HANDLE,
    )

    windll = ctypes.windll
    msvcrt = ctypes.CDLL("msvcrt")

    safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
    safeCreateWindowExA.argtypes = [
        DWORD,
        LPCSTR,
        LPCSTR,
        DWORD,
        INT,
        INT,
        INT,
        INT,
        HWND,
        HMENU,
        HINSTANCE,
        LPVOID,
    ]
    safeCreateWindowExA.restype = HWND

    safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
    safeDestroyWindow.argtypes = [HWND]
    safeDestroyWindow.restype = BOOL

    OpenClipboard = windll.user32.OpenClipboard
    OpenClipboard.argtypes = [HWND]
    OpenClipboard.restype = BOOL

    safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
    safeCloseClipboard.argtypes = []
    safeCloseClipboard.restype = BOOL

    safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
    safeEmptyClipboard.argtypes = []
    safeEmptyClipboard.restype = BOOL

    safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
    safeGetClipboardData.argtypes = [UINT]
    safeGetClipboardData.restype = HANDLE

    safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
    safeSetClipboardData.argtypes = [UINT, HANDLE]
    safeSetClipboardData.restype = HANDLE

    safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
    safeGlobalAlloc.argtypes = [UINT, c_size_t]
    safeGlobalAlloc.restype = HGLOBAL

    safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
    safeGlobalLock.argtypes = [HGLOBAL]
    safeGlobalLock.restype = LPVOID

    safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
    safeGlobalUnlock.argtypes = [HGLOBAL]
    safeGlobalUnlock.restype = BOOL

    wcslen = CheckedCall(msvcrt.wcslen)
    wcslen.argtypes = [c_wchar_p]
    wcslen.restype = UINT

    GMEM_MOVEABLE = 0x0002
    CF_UNICODETEXT = 13

    @contextlib.contextmanager
    def window():
        """
        Context that provides a valid Windows hwnd.
        """
        # we really just need the hwnd, so setting "STATIC"
        # as predefined lpClass is just fine.
        hwnd = safeCreateWindowExA(
            0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
        )
        try:
            yield hwnd
        finally:
            safeDestroyWindow(hwnd)

    @contextlib.contextmanager
    def clipboard(hwnd):
        """
        Context manager that opens the clipboard and prevents
        other applications from modifying the clipboard content.
        """
        # We may not get the clipboard handle immediately because
        # some other application is accessing it (?)
        # We try for at least 500ms to get the clipboard.
        t = time.time() + 0.5
        success = False
        while time.time() < t:
            success = OpenClipboard(hwnd)
            if success:
                break
            time.sleep(0.01)
        if not success:
            raise PyperclipWindowsException("Error calling OpenClipboard")

        try:
            yield
        finally:
            safeCloseClipboard()

    def copy_windows(text):
        # This function is heavily based on
        # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
        with window() as hwnd:
            # http://msdn.com/ms649048
            # If an application calls OpenClipboard with hwnd set to NULL,
            # EmptyClipboard sets the clipboard owner to NULL;
            # this causes SetClipboardData to fail.
            # => We need a valid hwnd to copy something.
            with clipboard(hwnd):
                safeEmptyClipboard()

                if text:
                    # http://msdn.com/ms649051
                    # If the hMem parameter identifies a memory object,
                    # the object must have been allocated using the
                    # function with the GMEM_MOVEABLE flag.
                    count = wcslen(text) + 1
                    handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
                    locked_handle = safeGlobalLock(handle)

                    ctypes.memmove(
                        c_wchar_p(locked_handle),
                        c_wchar_p(text),
                        count * sizeof(c_wchar),
                    )

                    safeGlobalUnlock(handle)
                    safeSetClipboardData(CF_UNICODETEXT, handle)

    def paste_windows():
        with clipboard(None):
            handle = safeGetClipboardData(CF_UNICODETEXT)
            if not handle:
                # GetClipboardData may return NULL with errno == NO_ERROR
                # if the clipboard is empty.
                # (Also, it may return a handle to an empty buffer,
                # but technically that's not empty)
                return ""
            return c_wchar_p(handle).value

    return copy_windows, paste_windows
venv/lib/python3.6/site-packages/pandas/io/clipboards.py (135 lines, Normal file)
@@ -0,0 +1,135 @@
""" io on the clipboard """
from io import StringIO
import warnings

from pandas.core.dtypes.generic import ABCDataFrame

from pandas import get_option, option_context


def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv. See read_csv for the
    full argument list.

    Parameters
    ----------
    sep : str, default '\s+'
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise NotImplementedError("reading from clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv

    text = clipboard_get()

    # Try to decode (if needed, as "text" might already be a string here).
    try:
        text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
    except AttributeError:
        pass

    # Excel copies into clipboard with \t separation.
    # Inspect no more than the first 10 lines; if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly.
    lines = text[:10000].split("\n")[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4

    counts = {x.lstrip().count("\t") for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = "\t"

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get("delim_whitespace") is None:
        sep = r"\s+"

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex)
    if len(sep) > 1 and kwargs.get("engine") is None:
        kwargs["engine"] = "python"
    elif len(sep) > 1 and kwargs.get("engine") == "c":
        warnings.warn(
            "read_clipboard with regex separator does not work"
            " properly with c engine"
        )

    return read_csv(StringIO(text), sep=sep, **kwargs)


def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write a text representation of the object to the system
    clipboard. The clipboard can then be pasted into Excel, for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
        if True, use the provided separator, writing in a csv
        format for allowing easy pasting into excel.
        if False, write a string representation of the object
        to the clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop("encoding", "utf-8")

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
        raise ValueError("clipboard only supports utf-8 encoding")

    from pandas.io.clipboard import clipboard_set

    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = "\t"
            buf = StringIO()

            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
            text = buf.getvalue()

            clipboard_set(text)
            return
        except TypeError:
            warnings.warn(
                "to_clipboard in excel mode requires a single character separator."
            )
    elif sep is not None:
        warnings.warn("to_clipboard with excel=False ignores the sep argument")

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context("display.max_colwidth", 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
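
A minimal round-trip sketch of the two entry points above (hypothetical DataFrame; requires a working clipboard backend from pandas.io.clipboard):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# excel=True (the default) writes a tab-separated block via to_csv ...
df.to_clipboard(index=False)

# ... which read_clipboard recognizes through the equal-tab-count heuristic
# above and parses back with sep="\t".
print(pd.read_clipboard())
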
venv/lib/python3.6/site-packages/pandas/io/common.py (515 lines, Normal file)
@@ -0,0 +1,515 @@
|
||||
"""Common IO api utilities"""
|
||||
|
||||
import bz2
|
||||
import codecs
|
||||
import csv
|
||||
import gzip
|
||||
from http.client import HTTPException # noqa
|
||||
from io import BytesIO
|
||||
import lzma
|
||||
import mmap
|
||||
import os
|
||||
import pathlib
|
||||
from urllib.error import URLError # noqa
|
||||
from urllib.parse import ( # noqa
|
||||
urlencode,
|
||||
urljoin,
|
||||
urlparse as parse_url,
|
||||
uses_netloc,
|
||||
uses_params,
|
||||
uses_relative,
|
||||
)
|
||||
from urllib.request import pathname2url, urlopen
|
||||
import zipfile
|
||||
|
||||
from pandas.errors import ( # noqa
|
||||
AbstractMethodError,
|
||||
DtypeWarning,
|
||||
EmptyDataError,
|
||||
ParserError,
|
||||
ParserWarning,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_file_like
|
||||
|
||||
# gh-12665: Alias for now and remove later.
|
||||
CParserError = ParserError
|
||||
|
||||
# common NA values
|
||||
# no longer excluding inf representations
|
||||
# '1.#INF','-1.#INF', '1.#INF000000',
|
||||
_NA_VALUES = {
|
||||
"-1.#IND",
|
||||
"1.#QNAN",
|
||||
"1.#IND",
|
||||
"-1.#QNAN",
|
||||
"#N/A N/A",
|
||||
"#N/A",
|
||||
"N/A",
|
||||
"n/a",
|
||||
"NA",
|
||||
"#NA",
|
||||
"NULL",
|
||||
"null",
|
||||
"NaN",
|
||||
"-NaN",
|
||||
"nan",
|
||||
"-nan",
|
||||
"",
|
||||
}
|
||||
|
||||
|
||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||
_VALID_URLS.discard("")
|
||||
|
||||
|
||||
class BaseIterator:
|
||||
"""Subclass this and provide a "__next__()" method to obtain an iterator.
|
||||
Useful only when the object being iterated is non-reusable (e.g. OK for a
|
||||
parser, not for an in-memory table, yes for its iterator)."""
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
|
||||
def _is_url(url):
|
||||
"""Check to see if a URL has a valid protocol.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str or unicode
|
||||
|
||||
Returns
|
||||
-------
|
||||
isurl : bool
|
||||
If `url` has a valid protocol return True otherwise False.
|
||||
"""
|
||||
try:
|
||||
return parse_url(url).scheme in _VALID_URLS
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _expand_user(filepath_or_buffer):
|
||||
"""Return the argument with an initial component of ~ or ~user
|
||||
replaced by that user's home directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : object to be converted if possible
|
||||
|
||||
Returns
|
||||
-------
|
||||
expanded_filepath_or_buffer : an expanded filepath or the
|
||||
input if not expandable
|
||||
"""
|
||||
if isinstance(filepath_or_buffer, str):
|
||||
return os.path.expanduser(filepath_or_buffer)
|
||||
return filepath_or_buffer
|
||||
|
||||
|
||||
def _validate_header_arg(header):
|
||||
if isinstance(header, bool):
|
||||
raise TypeError(
|
||||
"Passing a bool to header is invalid. "
|
||||
"Use header=None for no header or "
|
||||
"header=int or list-like of ints to specify "
|
||||
"the row(s) making up the column names"
|
||||
)
|
||||
|
||||
|
||||
def _stringify_path(filepath_or_buffer):
|
||||
"""Attempt to convert a path-like object to a string.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : object to be converted
|
||||
|
||||
Returns
|
||||
-------
|
||||
str_filepath_or_buffer : maybe a string version of the object
|
||||
|
||||
Notes
|
||||
-----
|
||||
Objects supporting the fspath protocol (python 3.6+) are coerced
|
||||
according to its __fspath__ method.
|
||||
|
||||
For backwards compatibility with older pythons, pathlib.Path and
|
||||
py.path objects are specially coerced.
|
||||
|
||||
Any other object is passed through unchanged, which includes bytes,
|
||||
strings, buffers, or anything else that's not even path-like.
|
||||
"""
|
||||
if hasattr(filepath_or_buffer, "__fspath__"):
|
||||
return filepath_or_buffer.__fspath__()
|
||||
elif isinstance(filepath_or_buffer, pathlib.Path):
|
||||
return str(filepath_or_buffer)
|
||||
return _expand_user(filepath_or_buffer)
|
||||
|
||||
|
||||
def is_s3_url(url):
|
||||
"""Check for an s3, s3n, or s3a url"""
|
||||
try:
|
||||
return parse_url(url).scheme in ["s3", "s3n", "s3a"]
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def is_gcs_url(url):
|
||||
"""Check for a gcs url"""
|
||||
try:
|
||||
return parse_url(url).scheme in ["gcs", "gs"]
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def get_filepath_or_buffer(
|
||||
filepath_or_buffer, encoding=None, compression=None, mode=None
|
||||
):
|
||||
"""
|
||||
If the filepath_or_buffer is a url, translate and return the buffer.
|
||||
Otherwise passthrough.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
|
||||
or buffer
|
||||
compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
|
||||
encoding : the encoding to use to decode bytes, default is 'utf-8'
|
||||
mode : str, optional
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple of ({a filepath_ or buffer or S3File instance},
|
||||
encoding, str,
|
||||
compression, str,
|
||||
should_close, bool)
|
||||
"""
|
||||
filepath_or_buffer = _stringify_path(filepath_or_buffer)
|
||||
|
||||
if _is_url(filepath_or_buffer):
|
||||
req = urlopen(filepath_or_buffer)
|
||||
content_encoding = req.headers.get("Content-Encoding", None)
|
||||
if content_encoding == "gzip":
|
||||
# Override compression based on Content-Encoding header
|
||||
compression = "gzip"
|
||||
reader = BytesIO(req.read())
|
||||
req.close()
|
||||
return reader, encoding, compression, True
|
||||
|
||||
if is_s3_url(filepath_or_buffer):
|
||||
from pandas.io import s3
|
||||
|
||||
return s3.get_filepath_or_buffer(
|
||||
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
|
||||
)
|
||||
|
||||
if is_gcs_url(filepath_or_buffer):
|
||||
from pandas.io import gcs
|
||||
|
||||
return gcs.get_filepath_or_buffer(
|
||||
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
|
||||
)
|
||||
|
||||
if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
|
||||
return _expand_user(filepath_or_buffer), None, compression, False
|
||||
|
||||
if not is_file_like(filepath_or_buffer):
|
||||
msg = "Invalid file path or buffer object type: {_type}"
|
||||
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
|
||||
|
||||
return filepath_or_buffer, None, compression, False
|
||||
|
||||
|
||||
def file_path_to_url(path):
|
||||
"""
|
||||
converts an absolute native path to a FILE URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : a path in native format
|
||||
|
||||
Returns
|
||||
-------
|
||||
a valid FILE URL
|
||||
"""
|
||||
return urljoin("file:", pathname2url(path))
|
||||
|
||||
|
||||
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
|
||||
|
||||
|
||||
def _infer_compression(filepath_or_buffer, compression):
|
||||
"""
|
||||
Get the compression method for filepath_or_buffer. If compression='infer',
|
||||
the inferred compression method is returned. Otherwise, the input
|
||||
compression method is returned unchanged, unless it's invalid, in which
|
||||
case an error is raised.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer :
|
||||
a path (str) or buffer
|
||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
|
||||
If 'infer' and `filepath_or_buffer` is path-like, then detect
|
||||
compression from the following extensions: '.gz', '.bz2', '.zip',
|
||||
or '.xz' (otherwise no compression).
|
||||
|
||||
Returns
|
||||
-------
|
||||
string or None :
|
||||
compression method
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError on invalid compression specified
|
||||
"""
|
||||
|
||||
# No compression has been explicitly specified
|
||||
if compression is None:
|
||||
return None
|
||||
|
||||
# Infer compression
|
||||
if compression == "infer":
|
||||
# Convert all path types (e.g. pathlib.Path) to strings
|
||||
filepath_or_buffer = _stringify_path(filepath_or_buffer)
|
||||
if not isinstance(filepath_or_buffer, str):
|
||||
# Cannot infer compression of a buffer, assume no compression
|
||||
return None
|
||||
|
||||
# Infer compression from the filename/URL extension
|
||||
for compression, extension in _compression_to_extension.items():
|
||||
if filepath_or_buffer.endswith(extension):
|
||||
return compression
|
||||
return None
|
||||
|
||||
# Compression has been specified. Check that it's valid
|
||||
if compression in _compression_to_extension:
|
||||
return compression
|
||||
|
||||
msg = "Unrecognized compression type: {}".format(compression)
|
||||
valid = ["infer", None] + sorted(_compression_to_extension)
|
||||
msg += "\nValid compression types are {}".format(valid)
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
def _get_handle(
|
||||
path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
|
||||
):
|
||||
"""
|
||||
Get file handle for given path/buffer and mode.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf :
|
||||
a path (str) or buffer
|
||||
mode : str
|
||||
mode to open path_or_buf with
|
||||
encoding : str or None
|
||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
|
||||
If 'infer' and `filepath_or_buffer` is path-like, then detect
|
||||
compression from the following extensions: '.gz', '.bz2', '.zip',
|
||||
or '.xz' (otherwise no compression).
|
||||
memory_map : boolean, default False
|
||||
See parsers._parser_params for more information.
|
||||
is_text : boolean, default True
|
||||
whether file/buffer is in text format (csv, json, etc.), or in binary
|
||||
mode (pickle, etc.)
|
||||
|
||||
Returns
|
||||
-------
|
||||
f : file-like
|
||||
A file-like object
|
||||
handles : list of file-like objects
|
||||
A list of file-like object that were opened in this function.
|
||||
"""
|
||||
try:
|
||||
from s3fs import S3File
|
||||
|
||||
need_text_wrapping = (BytesIO, S3File)
|
||||
except ImportError:
|
||||
need_text_wrapping = (BytesIO,)
|
||||
|
||||
handles = list()
|
||||
f = path_or_buf
|
||||
|
||||
# Convert pathlib.Path/py.path.local or string
|
||||
path_or_buf = _stringify_path(path_or_buf)
|
||||
is_path = isinstance(path_or_buf, str)
|
||||
|
||||
if is_path:
|
||||
compression = _infer_compression(path_or_buf, compression)
|
||||
|
||||
if compression:
|
||||
|
||||
# GZ Compression
|
||||
if compression == "gzip":
|
||||
if is_path:
|
||||
f = gzip.open(path_or_buf, mode)
|
||||
else:
|
||||
f = gzip.GzipFile(fileobj=path_or_buf)
|
||||
|
||||
# BZ Compression
|
||||
elif compression == "bz2":
|
||||
if is_path:
|
||||
f = bz2.BZ2File(path_or_buf, mode)
|
||||
else:
|
||||
f = bz2.BZ2File(path_or_buf)
|
||||
|
||||
# ZIP Compression
|
||||
elif compression == "zip":
|
||||
zf = BytesZipFile(path_or_buf, mode)
|
||||
# Ensure the container is closed as well.
|
||||
handles.append(zf)
|
||||
if zf.mode == "w":
|
||||
f = zf
|
||||
elif zf.mode == "r":
|
||||
zip_names = zf.namelist()
|
||||
if len(zip_names) == 1:
|
||||
f = zf.open(zip_names.pop())
|
||||
elif len(zip_names) == 0:
|
||||
raise ValueError(
|
||||
"Zero files found in ZIP file {}".format(path_or_buf)
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Multiple files found in ZIP file."
|
||||
" Only one file per ZIP: {}".format(zip_names)
|
||||
)
|
||||
|
||||
# XZ Compression
|
||||
elif compression == "xz":
|
||||
f = lzma.LZMAFile(path_or_buf, mode)
|
||||
|
||||
# Unrecognized Compression
|
||||
else:
|
||||
msg = "Unrecognized compression type: {}".format(compression)
|
||||
raise ValueError(msg)
|
||||
|
||||
handles.append(f)
|
||||
|
||||
elif is_path:
|
||||
if encoding:
|
||||
# Encoding
|
||||
f = open(path_or_buf, mode, encoding=encoding, newline="")
|
||||
elif is_text:
|
||||
# No explicit encoding
|
||||
f = open(path_or_buf, mode, errors="replace", newline="")
|
||||
else:
|
||||
# Binary mode
|
||||
f = open(path_or_buf, mode)
|
||||
handles.append(f)
|
||||
|
||||
# Convert BytesIO or file objects passed with an encoding
|
||||
if is_text and (compression or isinstance(f, need_text_wrapping)):
|
||||
from io import TextIOWrapper
|
||||
|
||||
f = TextIOWrapper(f, encoding=encoding, newline="")
|
||||
handles.append(f)
|
||||
|
||||
if memory_map and hasattr(f, "fileno"):
|
||||
try:
|
||||
g = MMapWrapper(f)
|
||||
f.close()
|
||||
f = g
|
||||
except Exception:
|
||||
# we catch any errors that may have occurred
|
||||
# because that is consistent with the lower-level
|
||||
# functionality of the C engine (pd.read_csv), so
|
||||
# leave the file handler as is then
|
||||
pass
|
||||
|
||||
return f, handles
|
||||
|
||||
|
||||
class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore
|
||||
"""
|
||||
Wrapper for standard library class ZipFile and allow the returned file-like
|
||||
handle to accept byte strings via `write` method.
|
||||
|
||||
BytesIO provides attributes of file-like object and ZipFile.writestr writes
|
||||
bytes strings into a member of the archive.
|
||||
"""
|
||||
|
||||
# GH 17778
|
||||
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
|
||||
if mode in ["wb", "rb"]:
|
||||
mode = mode.replace("b", "")
|
||||
super().__init__(file, mode, compression, **kwargs)
|
||||
|
||||
def write(self, data):
|
||||
super().writestr(self.filename, data)
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
return self.fp is None
|
||||
|
||||
|
||||
class MMapWrapper(BaseIterator):
|
||||
"""
|
||||
Wrapper for the Python's mmap class so that it can be properly read in
|
||||
by Python's csv.reader class.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
f : file object
|
||||
File object to be mapped onto memory. Must support the 'fileno'
|
||||
method or have an equivalent attribute
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, f):
|
||||
self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.mmap, name)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
newline = self.mmap.readline()
|
||||
|
||||
# readline returns bytes, not str, but Python's CSV reader
|
||||
# expects str, so convert the output to str before continuing
|
||||
newline = newline.decode("utf-8")
|
||||
|
||||
# mmap doesn't raise if reading past the allocated
|
||||
# data but instead returns an empty string, so raise
|
||||
# if that is returned
|
||||
if newline == "":
|
||||
raise StopIteration
|
||||
return newline
|
||||
|
||||
|
||||
class UTF8Recoder(BaseIterator):
|
||||
|
||||
"""
|
||||
Iterator that reads an encoded stream and re-encodes the input to UTF-8
|
||||
"""
|
||||
|
||||
def __init__(self, f, encoding):
|
||||
self.reader = codecs.getreader(encoding)(f)
|
||||
|
||||
def read(self, bytes=-1):
|
||||
return self.reader.read(bytes).encode("utf-8")
|
||||
|
||||
def readline(self):
|
||||
return self.reader.readline().encode("utf-8")
|
||||
|
||||
def next(self):
|
||||
return next(self.reader).encode("utf-8")
|
||||
|
||||
|
||||
# Keeping these class for now because it provides a necessary convenience
|
||||
# for "dropping" the "encoding" argument from our I/O arguments when
|
||||
# creating a Unicode I/O object.
|
||||
def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
|
||||
return csv.reader(f, dialect=dialect, **kwds)
|
||||
|
||||
|
||||
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
|
||||
return csv.writer(f, dialect=dialect, **kwds)
|
@@ -0,0 +1,64 @@
"""This module is designed for community supported date conversion functions"""
import numpy as np

from pandas._libs.tslibs import parsing


def parse_date_time(date_col, time_col):
    date_col = _maybe_cast(date_col)
    time_col = _maybe_cast(time_col)
    return parsing.try_parse_date_and_time(date_col, time_col)


def parse_date_fields(year_col, month_col, day_col):
    year_col = _maybe_cast(year_col)
    month_col = _maybe_cast(month_col)
    day_col = _maybe_cast(day_col)
    return parsing.try_parse_year_month_day(year_col, month_col, day_col)


def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col):
    year_col = _maybe_cast(year_col)
    month_col = _maybe_cast(month_col)
    day_col = _maybe_cast(day_col)
    hour_col = _maybe_cast(hour_col)
    minute_col = _maybe_cast(minute_col)
    second_col = _maybe_cast(second_col)
    return parsing.try_parse_datetime_components(
        year_col, month_col, day_col, hour_col, minute_col, second_col
    )


def generic_parser(parse_func, *cols):
    N = _check_columns(cols)
    results = np.empty(N, dtype=object)

    for i in range(N):
        args = [c[i] for c in cols]
        results[i] = parse_func(*args)

    return results


def _maybe_cast(arr):
    if not arr.dtype.type == np.object_:
        arr = np.array(arr, dtype=object)
    return arr


def _check_columns(cols):
    if not len(cols):
        raise AssertionError("There must be at least 1 column")

    head, tail = cols[0], cols[1:]

    N = len(head)

    for i, n in enumerate(map(len, tail)):
        if n != N:
            raise AssertionError(
                "All columns must have the same length: {0}; "
                "column {1} has length {2}".format(N, i, n)
            )

    return N
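
A small sketch of how these helpers plug into read_csv through parse_dates/date_parser (hypothetical in-memory CSV with separate year/month/day columns):

import io

import pandas as pd
from pandas.io.date_converters import parse_date_fields

csv = io.StringIO("year,month,day,value\n2019,7,1,10\n2019,7,2,11\n")

df = pd.read_csv(
    csv,
    parse_dates={"date": ["year", "month", "day"]},
    date_parser=parse_date_fields,  # receives the three columns as arrays
)
print(df.dtypes)  # the combined "date" column comes back as datetime64[ns]
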
venv/lib/python3.6/site-packages/pandas/io/excel/__init__.py (16 lines, Normal file)
@@ -0,0 +1,16 @@
from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel
from pandas.io.excel._openpyxl import _OpenpyxlWriter
from pandas.io.excel._util import register_writer
from pandas.io.excel._xlsxwriter import _XlsxWriter
from pandas.io.excel._xlwt import _XlwtWriter

__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]


register_writer(_OpenpyxlWriter)


register_writer(_XlwtWriter)


register_writer(_XlsxWriter)
venv/lib/python3.6/site-packages/pandas/io/excel/_base.py (903 lines, Normal file)
@@ -0,0 +1,903 @@
|
||||
import abc
|
||||
from collections import OrderedDict
|
||||
from datetime import date, datetime, timedelta
|
||||
from io import BytesIO
|
||||
import os
|
||||
from textwrap import fill
|
||||
from urllib.request import urlopen
|
||||
|
||||
from pandas._config import config
|
||||
|
||||
from pandas.errors import EmptyDataError
|
||||
from pandas.util._decorators import Appender, deprecate_kwarg
|
||||
|
||||
from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
|
||||
from pandas.io.common import (
|
||||
_NA_VALUES,
|
||||
_is_url,
|
||||
_stringify_path,
|
||||
_validate_header_arg,
|
||||
get_filepath_or_buffer,
|
||||
)
|
||||
from pandas.io.excel._util import (
|
||||
_fill_mi_header,
|
||||
_get_default_writer,
|
||||
_maybe_convert_usecols,
|
||||
_pop_header_name,
|
||||
get_writer,
|
||||
)
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
from pandas.io.parsers import TextParser
|
||||
|
||||
_read_excel_doc = (
|
||||
"""
|
||||
Read an Excel file into a pandas DataFrame.
|
||||
|
||||
Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
|
||||
Support an option to read a single sheet or a list of sheets.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
io : str, ExcelFile, xlrd.Book, path object or file-like object
|
||||
Any valid string path is acceptable. The string could be a URL. Valid
|
||||
URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
|
||||
|
||||
If you want to pass in a path object, pandas accepts any ``os.PathLike``.
|
||||
|
||||
By file-like object, we refer to objects with a ``read()`` method,
|
||||
such as a file handler (e.g. via builtin ``open`` function)
|
||||
or ``StringIO``.
|
||||
sheet_name : str, int, list, or None, default 0
|
||||
Strings are used for sheet names. Integers are used in zero-indexed
|
||||
sheet positions. Lists of strings/integers are used to request
|
||||
multiple sheets. Specify None to get all sheets.
|
||||
|
||||
Available cases:
|
||||
|
||||
* Defaults to ``0``: 1st sheet as a `DataFrame`
|
||||
* ``1``: 2nd sheet as a `DataFrame`
|
||||
* ``"Sheet1"``: Load sheet with name "Sheet1"
|
||||
* ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
|
||||
as a dict of `DataFrame`
|
||||
* None: All sheets.
|
||||
|
||||
header : int, list of int, default 0
|
||||
Row (0-indexed) to use for the column labels of the parsed
|
||||
DataFrame. If a list of integers is passed those row positions will
|
||||
be combined into a ``MultiIndex``. Use None if there is no header.
|
||||
names : array-like, default None
|
||||
List of column names to use. If file contains no header row,
|
||||
then you should explicitly pass header=None.
|
||||
index_col : int, list of int, default None
|
||||
Column (0-indexed) to use as the row labels of the DataFrame.
|
||||
Pass None if there is no such column. If a list is passed,
|
||||
those columns will be combined into a ``MultiIndex``. If a
|
||||
subset of data is selected with ``usecols``, index_col
|
||||
is based on the subset.
|
||||
usecols : int, str, list-like, or callable default None
|
||||
Return a subset of the columns.
|
||||
|
||||
* If None, then parse all columns.
|
||||
* If int, then indicates last column to be parsed.
|
||||
|
||||
.. deprecated:: 0.24.0
|
||||
Pass in a list of int instead from 0 to `usecols` inclusive.
|
||||
|
||||
* If str, then indicates comma separated list of Excel column letters
|
||||
and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
|
||||
both sides.
|
||||
* If list of int, then indicates list of column numbers to be parsed.
|
||||
* If list of string, then indicates list of column names to be parsed.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
* If callable, then evaluate each column name against it and parse the
|
||||
column if the callable returns ``True``.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
squeeze : bool, default False
|
||||
If the parsed data only contains one column then return a Series.
|
||||
dtype : Type name or dict of column -> type, default None
|
||||
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
|
||||
Use `object` to preserve data as stored in Excel and not interpret dtype.
|
||||
If converters are specified, they will be applied INSTEAD
|
||||
of dtype conversion.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
engine : str, default None
|
||||
If io is not a buffer or path, this must be set to identify io.
|
||||
Acceptable values are None or xlrd.
|
||||
converters : dict, default None
|
||||
Dict of functions for converting values in certain columns. Keys can
|
||||
either be integers or column labels, values are functions that take one
|
||||
input argument, the Excel cell content, and return the transformed
|
||||
content.
|
||||
true_values : list, default None
|
||||
Values to consider as True.
|
||||
|
||||
.. versionadded:: 0.19.0
|
||||
|
||||
false_values : list, default None
|
||||
Values to consider as False.
|
||||
|
||||
.. versionadded:: 0.19.0
|
||||
|
||||
skiprows : list-like
|
||||
Rows to skip at the beginning (0-indexed).
|
||||
nrows : int, default None
|
||||
Number of rows to parse.
|
||||
|
||||
.. versionadded:: 0.23.0
|
||||
|
||||
na_values : scalar, str, list-like, or dict, default None
|
||||
Additional strings to recognize as NA/NaN. If dict passed, specific
|
||||
per-column NA values. By default the following values are interpreted
|
||||
as NaN: '"""
|
||||
+ fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ")
|
||||
+ """'.
|
||||
keep_default_na : bool, default True
|
||||
If na_values are specified and keep_default_na is False the default NaN
|
||||
values are overridden, otherwise they're appended to.
|
||||
verbose : bool, default False
|
||||
Indicate number of NA values placed in non-numeric columns.
|
||||
parse_dates : bool, list-like, or dict, default False
|
||||
The behavior is as follows:
|
||||
|
||||
* bool. If True -> try parsing the index.
|
||||
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
|
||||
each as a separate date column.
|
||||
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
|
||||
a single date column.
|
||||
* dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call
|
||||
result 'foo'
|
||||
|
||||
If a column or index contains an unparseable date, the entire column or
|
||||
index will be returned unaltered as an object data type. For non-standard
|
||||
datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
|
||||
|
||||
Note: A fast-path exists for iso8601-formatted dates.
|
||||
date_parser : function, optional
|
||||
Function to use for converting a sequence of string columns to an array of
|
||||
datetime instances. The default uses ``dateutil.parser.parser`` to do the
|
||||
conversion. Pandas will try to call `date_parser` in three different ways,
|
||||
advancing to the next if an exception occurs: 1) Pass one or more arrays
|
||||
(as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
|
||||
string values from the columns defined by `parse_dates` into a single array
|
||||
and pass that; and 3) call `date_parser` once for each row using one or
|
||||
more strings (corresponding to the columns defined by `parse_dates`) as
|
||||
arguments.
|
||||
thousands : str, default None
|
||||
Thousands separator for parsing string columns to numeric. Note that
|
||||
this parameter is only necessary for columns stored as TEXT in Excel,
|
||||
any numeric columns will automatically be parsed, regardless of display
|
||||
format.
|
||||
comment : str, default None
|
||||
Comments out remainder of line. Pass a character or characters to this
|
||||
argument to indicate comments in the input file. Any data between the
|
||||
comment string and the end of the current line is ignored.
|
||||
skip_footer : int, default 0
|
||||
Alias of `skipfooter`.
|
||||
|
||||
.. deprecated:: 0.23.0
|
||||
Use `skipfooter` instead.
|
||||
skipfooter : int, default 0
|
||||
Rows at the end to skip (0-indexed).
|
||||
convert_float : bool, default True
|
||||
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
|
||||
data will be read in as floats: Excel stores all numbers as floats
|
||||
internally.
|
||||
mangle_dupe_cols : bool, default True
|
||||
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
|
||||
'X'...'X'. Passing in False will cause data to be overwritten if there
|
||||
are duplicate names in the columns.
|
||||
**kwds : optional
|
||||
Optional keyword arguments can be passed to ``TextFileReader``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or dict of DataFrames
|
||||
DataFrame from the passed in Excel file. See notes in sheet_name
|
||||
argument for more information on when a dict of DataFrames is returned.
|
||||
|
||||
See Also
|
||||
--------
|
||||
to_excel : Write DataFrame to an Excel file.
|
||||
to_csv : Write DataFrame to a comma-separated values (csv) file.
|
||||
read_csv : Read a comma-separated values (csv) file into DataFrame.
|
||||
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
|
||||
|
||||
Examples
|
||||
--------
|
||||
The file can be read using the file name as string or an open file object:
|
||||
|
||||
>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
|
||||
Name Value
|
||||
0 string1 1
|
||||
1 string2 2
|
||||
2 #Comment 3
|
||||
|
||||
>>> pd.read_excel(open('tmp.xlsx', 'rb'),
|
||||
... sheet_name='Sheet3') # doctest: +SKIP
|
||||
Unnamed: 0 Name Value
|
||||
0 0 string1 1
|
||||
1 1 string2 2
|
||||
2 2 #Comment 3
|
||||
|
||||
Index and header can be specified via the `index_col` and `header` arguments
|
||||
|
||||
>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
|
||||
0 1 2
|
||||
0 NaN Name Value
|
||||
1 0.0 string1 1
|
||||
2 1.0 string2 2
|
||||
3 2.0 #Comment 3
|
||||
|
||||
Column types are inferred but can be explicitly specified
|
||||
|
||||
>>> pd.read_excel('tmp.xlsx', index_col=0,
|
||||
... dtype={'Name': str, 'Value': float}) # doctest: +SKIP
|
||||
Name Value
|
||||
0 string1 1.0
|
||||
1 string2 2.0
|
||||
2 #Comment 3.0
|
||||
|
||||
True, False, and NA values, and thousands separators have defaults,
|
||||
but can be explicitly specified, too. Supply the values you would like
|
||||
as strings or lists of strings!
|
||||
|
||||
>>> pd.read_excel('tmp.xlsx', index_col=0,
|
||||
... na_values=['string1', 'string2']) # doctest: +SKIP
|
||||
Name Value
|
||||
0 NaN 1
|
||||
1 NaN 2
|
||||
2 #Comment 3
|
||||
|
||||
Comment lines in the excel input file can be skipped using the `comment` kwarg
|
||||
|
||||
>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
|
||||
Name Value
|
||||
0 string1 1.0
|
||||
1 string2 2.0
|
||||
2 None NaN
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
@Appender(_read_excel_doc)
|
||||
@deprecate_kwarg("skip_footer", "skipfooter")
|
||||
def read_excel(
|
||||
io,
|
||||
sheet_name=0,
|
||||
header=0,
|
||||
names=None,
|
||||
index_col=None,
|
||||
usecols=None,
|
||||
squeeze=False,
|
||||
dtype=None,
|
||||
engine=None,
|
||||
converters=None,
|
||||
true_values=None,
|
||||
false_values=None,
|
||||
skiprows=None,
|
||||
nrows=None,
|
||||
na_values=None,
|
||||
keep_default_na=True,
|
||||
verbose=False,
|
||||
parse_dates=False,
|
||||
date_parser=None,
|
||||
thousands=None,
|
||||
comment=None,
|
||||
skip_footer=0,
|
||||
skipfooter=0,
|
||||
convert_float=True,
|
||||
mangle_dupe_cols=True,
|
||||
**kwds
|
||||
):
|
||||
|
||||
for arg in ("sheet", "sheetname", "parse_cols"):
|
||||
if arg in kwds:
|
||||
raise TypeError(
|
||||
"read_excel() got an unexpected keyword argument " "`{}`".format(arg)
|
||||
)
|
||||
|
||||
if not isinstance(io, ExcelFile):
|
||||
io = ExcelFile(io, engine=engine)
|
||||
elif engine and engine != io.engine:
|
||||
raise ValueError(
|
||||
"Engine should not be specified when passing "
|
||||
"an ExcelFile - ExcelFile already has the engine set"
|
||||
)
|
||||
|
||||
return io.parse(
|
||||
sheet_name=sheet_name,
|
||||
header=header,
|
||||
names=names,
|
||||
index_col=index_col,
|
||||
usecols=usecols,
|
||||
squeeze=squeeze,
|
||||
dtype=dtype,
|
||||
converters=converters,
|
||||
true_values=true_values,
|
||||
false_values=false_values,
|
||||
skiprows=skiprows,
|
||||
nrows=nrows,
|
||||
na_values=na_values,
|
||||
keep_default_na=keep_default_na,
|
||||
verbose=verbose,
|
||||
parse_dates=parse_dates,
|
||||
date_parser=date_parser,
|
||||
thousands=thousands,
|
||||
comment=comment,
|
||||
skipfooter=skipfooter,
|
||||
convert_float=convert_float,
|
||||
mangle_dupe_cols=mangle_dupe_cols,
|
||||
**kwds
|
||||
)
|
||||
|
||||
|
||||
class _BaseExcelReader(metaclass=abc.ABCMeta):
|
||||
def __init__(self, filepath_or_buffer):
|
||||
# If filepath_or_buffer is a url, load the data into a BytesIO
|
||||
if _is_url(filepath_or_buffer):
|
||||
filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
|
||||
elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
|
||||
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
|
||||
|
||||
if isinstance(filepath_or_buffer, self._workbook_class):
|
||||
self.book = filepath_or_buffer
|
||||
elif hasattr(filepath_or_buffer, "read"):
|
||||
# N.B. xlrd.Book has a read attribute too
|
||||
filepath_or_buffer.seek(0)
|
||||
self.book = self.load_workbook(filepath_or_buffer)
|
||||
elif isinstance(filepath_or_buffer, str):
|
||||
self.book = self.load_workbook(filepath_or_buffer)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Must explicitly set engine if not passing in" " buffer or path for io."
|
||||
)
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def _workbook_class(self):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def load_workbook(self, filepath_or_buffer):
|
||||
pass
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def sheet_names(self):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_sheet_by_name(self, name):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_sheet_by_index(self, index):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_sheet_data(self, sheet, convert_float):
|
||||
pass
|
||||
|
||||
def parse(
|
||||
self,
|
||||
sheet_name=0,
|
||||
header=0,
|
||||
names=None,
|
||||
index_col=None,
|
||||
usecols=None,
|
||||
squeeze=False,
|
||||
dtype=None,
|
||||
true_values=None,
|
||||
false_values=None,
|
||||
skiprows=None,
|
||||
nrows=None,
|
||||
na_values=None,
|
||||
verbose=False,
|
||||
parse_dates=False,
|
||||
date_parser=None,
|
||||
thousands=None,
|
||||
comment=None,
|
||||
skipfooter=0,
|
||||
convert_float=True,
|
||||
mangle_dupe_cols=True,
|
||||
**kwds
|
||||
):
|
||||
|
||||
_validate_header_arg(header)
|
||||
|
||||
ret_dict = False
|
||||
|
||||
# Keep sheetname to maintain backwards compatibility.
|
||||
if isinstance(sheet_name, list):
|
||||
sheets = sheet_name
|
||||
ret_dict = True
|
||||
elif sheet_name is None:
|
||||
sheets = self.sheet_names
|
||||
ret_dict = True
|
||||
else:
|
||||
sheets = [sheet_name]
|
||||
|
||||
# handle same-type duplicates.
|
||||
sheets = list(OrderedDict.fromkeys(sheets).keys())
|
||||
|
||||
output = OrderedDict()
|
||||
|
||||
for asheetname in sheets:
|
||||
if verbose:
|
||||
print("Reading sheet {sheet}".format(sheet=asheetname))
|
||||
|
||||
if isinstance(asheetname, str):
|
||||
sheet = self.get_sheet_by_name(asheetname)
|
||||
else: # assume an integer if not a string
|
||||
sheet = self.get_sheet_by_index(asheetname)
|
||||
|
||||
data = self.get_sheet_data(sheet, convert_float)
|
||||
usecols = _maybe_convert_usecols(usecols)
|
||||
|
||||
if not data:
|
||||
output[asheetname] = DataFrame()
|
||||
continue
|
||||
|
||||
if is_list_like(header) and len(header) == 1:
|
||||
header = header[0]
|
||||
|
||||
# forward fill and pull out names for MultiIndex column
|
||||
header_names = None
|
||||
if header is not None and is_list_like(header):
|
||||
header_names = []
|
||||
control_row = [True] * len(data[0])
|
||||
|
||||
for row in header:
|
||||
if is_integer(skiprows):
|
||||
row += skiprows
|
||||
|
||||
data[row], control_row = _fill_mi_header(data[row], control_row)
|
||||
|
||||
if index_col is not None:
|
||||
header_name, _ = _pop_header_name(data[row], index_col)
|
||||
header_names.append(header_name)
|
||||
|
||||
if is_list_like(index_col):
|
||||
# Forward fill values for MultiIndex index.
|
||||
if not is_list_like(header):
|
||||
offset = 1 + header
|
||||
else:
|
||||
offset = 1 + max(header)
|
||||
|
||||
# Check if we have an empty dataset
|
||||
# before trying to collect data.
|
||||
if offset < len(data):
|
||||
for col in index_col:
|
||||
last = data[offset][col]
|
||||
|
||||
for row in range(offset + 1, len(data)):
|
||||
if data[row][col] == "" or data[row][col] is None:
|
||||
data[row][col] = last
|
||||
else:
|
||||
last = data[row][col]
|
||||
|
||||
has_index_names = is_list_like(header) and len(header) > 1
|
||||
|
||||
# GH 12292 : error when read one empty column from excel file
|
||||
try:
|
||||
parser = TextParser(
|
||||
data,
|
||||
names=names,
|
||||
header=header,
|
||||
index_col=index_col,
|
||||
has_index_names=has_index_names,
|
||||
squeeze=squeeze,
|
||||
dtype=dtype,
|
||||
true_values=true_values,
|
||||
false_values=false_values,
|
||||
skiprows=skiprows,
|
||||
nrows=nrows,
|
||||
na_values=na_values,
|
||||
parse_dates=parse_dates,
|
||||
date_parser=date_parser,
|
||||
thousands=thousands,
|
||||
comment=comment,
|
||||
skipfooter=skipfooter,
|
||||
usecols=usecols,
|
||||
mangle_dupe_cols=mangle_dupe_cols,
|
||||
**kwds
|
||||
)
|
||||
|
||||
output[asheetname] = parser.read(nrows=nrows)
|
||||
|
||||
if not squeeze or isinstance(output[asheetname], DataFrame):
|
||||
if header_names:
|
||||
output[asheetname].columns = output[
|
||||
asheetname
|
||||
].columns.set_names(header_names)
|
||||
|
||||
except EmptyDataError:
|
||||
# No Data, return an empty DataFrame
|
||||
output[asheetname] = DataFrame()
|
||||
|
||||
if ret_dict:
|
||||
return output
|
||||
else:
|
||||
return output[asheetname]
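# Illustrative usage of the sheet-handling logic above (not part of the vendored
# source; the workbook path is hypothetical). Passing a list or None for
# sheet_name returns an OrderedDict of DataFrames keyed by sheet, while a single
# label returns a bare DataFrame.
import pandas as pd

all_sheets = pd.read_excel("workbook.xlsx", sheet_name=None)     # dict of DataFrames
first_two = pd.read_excel("workbook.xlsx", sheet_name=[0, 1])    # dict keyed by 0 and 1
one_sheet = pd.read_excel("workbook.xlsx", sheet_name="Sheet1")  # single DataFrame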
|
||||
|
||||
|
||||
class ExcelWriter(metaclass=abc.ABCMeta):
|
||||
"""
|
||||
Class for writing DataFrame objects into excel sheets, default is to use
|
||||
xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : string
|
||||
Path to xls or xlsx file.
|
||||
engine : string (optional)
|
||||
Engine to use for writing. If None, defaults to
|
||||
``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
|
||||
argument.
|
||||
date_format : string, default None
|
||||
Format string for dates written into Excel files (e.g. 'YYYY-MM-DD')
|
||||
datetime_format : string, default None
|
||||
Format string for datetime objects written into Excel files
|
||||
(e.g. 'YYYY-MM-DD HH:MM:SS')
|
||||
mode : {'w', 'a'}, default 'w'
|
||||
File mode to use (write or append).
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
Attributes
|
||||
----------
|
||||
None
|
||||
|
||||
Methods
|
||||
-------
|
||||
None
|
||||
|
||||
Notes
|
||||
-----
|
||||
None of the methods and properties are considered public.
|
||||
|
||||
For compatibility with CSV writers, ExcelWriter serializes lists
|
||||
and dicts to strings before writing.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Default usage:
|
||||
|
||||
>>> with ExcelWriter('path_to_file.xlsx') as writer:
|
||||
... df.to_excel(writer)
|
||||
|
||||
To write to separate sheets in a single file:
|
||||
|
||||
>>> with ExcelWriter('path_to_file.xlsx') as writer:
|
||||
... df1.to_excel(writer, sheet_name='Sheet1')
|
||||
... df2.to_excel(writer, sheet_name='Sheet2')
|
||||
|
||||
You can set the date format or datetime format:
|
||||
|
||||
>>> with ExcelWriter('path_to_file.xlsx',
|
||||
date_format='YYYY-MM-DD',
|
||||
datetime_format='YYYY-MM-DD HH:MM:SS') as writer:
|
||||
... df.to_excel(writer)
|
||||
|
||||
You can also append to an existing Excel file:
|
||||
|
||||
>>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
|
||||
... df.to_excel(writer, sheet_name='Sheet3')
|
||||
"""
|
||||
|
||||
# Defining an ExcelWriter implementation (see abstract methods for more...)
|
||||
|
||||
# - Mandatory
|
||||
# - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
|
||||
# --> called to write additional DataFrames to disk
|
||||
# - ``supported_extensions`` (tuple of supported extensions), used to
|
||||
# check that engine supports the given extension.
|
||||
# - ``engine`` - string that gives the engine name. Necessary to
|
||||
# instantiate class directly and bypass ``ExcelWriterMeta`` engine
|
||||
# lookup.
|
||||
# - ``save(self)`` --> called to save file to disk
|
||||
# - Mostly mandatory (i.e. should at least exist)
|
||||
# - book, cur_sheet, path
|
||||
|
||||
# - Optional:
|
||||
# - ``__init__(self, path, engine=None, **kwargs)`` --> always called
|
||||
# with path as first argument.
|
||||
|
||||
# You also need to register the class with ``register_writer()``.
|
||||
# Technically, ExcelWriter implementations don't need to subclass
|
||||
# ExcelWriter.
|
||||
def __new__(cls, path, engine=None, **kwargs):
|
||||
# only switch class if generic(ExcelWriter)
|
||||
|
||||
if cls is ExcelWriter:
|
||||
if engine is None or (isinstance(engine, str) and engine == "auto"):
|
||||
if isinstance(path, str):
|
||||
ext = os.path.splitext(path)[-1][1:]
|
||||
else:
|
||||
ext = "xlsx"
|
||||
|
||||
try:
|
||||
engine = config.get_option("io.excel.{ext}.writer".format(ext=ext))
|
||||
if engine == "auto":
|
||||
engine = _get_default_writer(ext)
|
||||
except KeyError:
|
||||
raise ValueError("No engine for filetype: '{ext}'".format(ext=ext))
|
||||
cls = get_writer(engine)
|
||||
|
||||
return object.__new__(cls)
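# Sketch of the engine lookup above (an illustration, not part of the source):
# when no engine is given, __new__ consults the ``io.excel.<ext>.writer`` option,
# so the default writer can be switched globally. Assumes xlsxwriter is
# installed; the output path is hypothetical.
import pandas as pd

pd.set_option("io.excel.xlsx.writer", "xlsxwriter")
writer = pd.ExcelWriter("out.xlsx")  # dispatches to the registered xlsxwriter class
assert writer.engine == "xlsxwriter"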
|
||||
|
||||
# declare external properties you can count on
|
||||
book = None
|
||||
curr_sheet = None
|
||||
path = None
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def supported_extensions(self):
|
||||
"""Extensions that writer engine supports."""
|
||||
pass
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def engine(self):
|
||||
"""Name of engine."""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def write_cells(
|
||||
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
|
||||
):
|
||||
"""
|
||||
Write given formatted cells into an Excel sheet
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cells : generator
|
||||
cell of formatted data to save to Excel sheet
|
||||
sheet_name : string, default None
|
||||
Name of Excel sheet, if None, then use self.cur_sheet
|
||||
startrow : upper left cell row to dump data frame
|
||||
startcol : upper left cell column to dump data frame
|
||||
freeze_panes : integer tuple of length 2
|
||||
contains the bottom-most row and right-most column to freeze
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def save(self):
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
pass
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path,
|
||||
engine=None,
|
||||
date_format=None,
|
||||
datetime_format=None,
|
||||
mode="w",
|
||||
**engine_kwargs
|
||||
):
|
||||
# validate that this engine can handle the extension
|
||||
if isinstance(path, str):
|
||||
ext = os.path.splitext(path)[-1]
|
||||
else:
|
||||
ext = "xls" if engine == "xlwt" else "xlsx"
|
||||
|
||||
self.check_extension(ext)
|
||||
|
||||
self.path = path
|
||||
self.sheets = {}
|
||||
self.cur_sheet = None
|
||||
|
||||
if date_format is None:
|
||||
self.date_format = "YYYY-MM-DD"
|
||||
else:
|
||||
self.date_format = date_format
|
||||
if datetime_format is None:
|
||||
self.datetime_format = "YYYY-MM-DD HH:MM:SS"
|
||||
else:
|
||||
self.datetime_format = datetime_format
|
||||
|
||||
self.mode = mode
|
||||
|
||||
def __fspath__(self):
|
||||
return _stringify_path(self.path)
|
||||
|
||||
def _get_sheet_name(self, sheet_name):
|
||||
if sheet_name is None:
|
||||
sheet_name = self.cur_sheet
|
||||
if sheet_name is None: # pragma: no cover
|
||||
raise ValueError(
|
||||
"Must pass explicit sheet_name or set " "cur_sheet property"
|
||||
)
|
||||
return sheet_name
|
||||
|
||||
def _value_with_fmt(self, val):
|
||||
"""Convert numpy types to Python types for the Excel writers.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
val : object
|
||||
Value to be written into cells
|
||||
|
||||
Returns
|
||||
-------
|
||||
Tuple with the first element being the converted value and the second
|
||||
being an optional format
|
||||
"""
|
||||
fmt = None
|
||||
|
||||
if is_integer(val):
|
||||
val = int(val)
|
||||
elif is_float(val):
|
||||
val = float(val)
|
||||
elif is_bool(val):
|
||||
val = bool(val)
|
||||
elif isinstance(val, datetime):
|
||||
fmt = self.datetime_format
|
||||
elif isinstance(val, date):
|
||||
fmt = self.date_format
|
||||
elif isinstance(val, timedelta):
|
||||
val = val.total_seconds() / float(86400)
|
||||
fmt = "0"
|
||||
else:
|
||||
val = str(val)
|
||||
|
||||
return val, fmt
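# Worked example of the conversion above (illustrative only; _value_with_fmt is
# a private helper and the path is hypothetical). Timedeltas are written as
# fractions of a day with a "0" number format; datetimes pick up
# self.datetime_format.
from datetime import timedelta

import pandas as pd

writer = pd.ExcelWriter("scratch.xlsx")  # assumes a default xlsx engine is installed
val, fmt = writer._value_with_fmt(timedelta(hours=12))
# val == 0.5 (half a day), fmt == "0"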
|
||||
|
||||
@classmethod
|
||||
def check_extension(cls, ext):
|
||||
"""checks that path's extension against the Writer's supported
|
||||
extensions. If it isn't supported, raises ValueError."""
|
||||
if ext.startswith("."):
|
||||
ext = ext[1:]
|
||||
if not any(ext in extension for extension in cls.supported_extensions):
|
||||
msg = "Invalid extension for engine '{engine}': '{ext}'".format(
|
||||
engine=pprint_thing(cls.engine), ext=pprint_thing(ext)
|
||||
)
|
||||
raise ValueError(msg)
|
||||
else:
|
||||
return True
|
||||
|
||||
# Allow use as a contextmanager
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
"""synonym for save, to make it more file-like"""
|
||||
return self.save()
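# Minimal sketch of the implementation contract spelled out in the comments
# above ("Defining an ExcelWriter implementation"). The engine name and class
# are invented for illustration; a real writer would persist the cells it
# receives instead of discarding them.
from pandas.io.excel._util import register_writer


class _NullWriter(ExcelWriter):
    engine = "null"
    supported_extensions = (".xlsx",)

    def write_cells(
        self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
    ):
        # A real engine would write each formatted cell to its workbook here.
        for cell in cells:
            pass

    def save(self):
        # Nothing to persist for this illustrative writer.
        return None


register_writer(_NullWriter)
# After registration, df.to_excel("out.xlsx", engine="null") would dispatch here.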
|
||||
|
||||
|
||||
class ExcelFile:
|
||||
"""
|
||||
Class for parsing tabular excel sheets into DataFrame objects.
|
||||
Uses xlrd. See read_excel for more documentation
|
||||
|
||||
Parameters
|
||||
----------
|
||||
io : string, path object (pathlib.Path or py._path.local.LocalPath),
|
||||
file-like object or xlrd workbook
|
||||
If a string or path object, expected to be a path to xls or xlsx file.
|
||||
engine : string, default None
|
||||
If io is not a buffer or path, this must be set to identify io.
|
||||
Acceptable values are None or ``xlrd``.
|
||||
"""
|
||||
|
||||
from pandas.io.excel._odfreader import _ODFReader
|
||||
from pandas.io.excel._openpyxl import _OpenpyxlReader
|
||||
from pandas.io.excel._xlrd import _XlrdReader
|
||||
|
||||
_engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader}
|
||||
|
||||
def __init__(self, io, engine=None):
|
||||
if engine is None:
|
||||
engine = "xlrd"
|
||||
if engine not in self._engines:
|
||||
raise ValueError("Unknown engine: {engine}".format(engine=engine))
|
||||
|
||||
self.engine = engine
|
||||
# could be a str, ExcelFile, Book, etc.
|
||||
self.io = io
|
||||
# Always a string
|
||||
self._io = _stringify_path(io)
|
||||
|
||||
self._reader = self._engines[engine](self._io)
|
||||
|
||||
def __fspath__(self):
|
||||
return self._io
|
||||
|
||||
def parse(
|
||||
self,
|
||||
sheet_name=0,
|
||||
header=0,
|
||||
names=None,
|
||||
index_col=None,
|
||||
usecols=None,
|
||||
squeeze=False,
|
||||
converters=None,
|
||||
true_values=None,
|
||||
false_values=None,
|
||||
skiprows=None,
|
||||
nrows=None,
|
||||
na_values=None,
|
||||
parse_dates=False,
|
||||
date_parser=None,
|
||||
thousands=None,
|
||||
comment=None,
|
||||
skipfooter=0,
|
||||
convert_float=True,
|
||||
mangle_dupe_cols=True,
|
||||
**kwds
|
||||
):
|
||||
"""
|
||||
Parse specified sheet(s) into a DataFrame
|
||||
|
||||
Equivalent to read_excel(ExcelFile, ...) See the read_excel
|
||||
docstring for more info on accepted parameters
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or dict of DataFrames
|
||||
DataFrame from the passed in Excel file.
|
||||
"""
|
||||
if "chunksize" in kwds:
|
||||
raise NotImplementedError(
|
||||
"chunksize keyword of read_excel " "is not implemented"
|
||||
)
|
||||
|
||||
return self._reader.parse(
|
||||
sheet_name=sheet_name,
|
||||
header=header,
|
||||
names=names,
|
||||
index_col=index_col,
|
||||
usecols=usecols,
|
||||
squeeze=squeeze,
|
||||
converters=converters,
|
||||
true_values=true_values,
|
||||
false_values=false_values,
|
||||
skiprows=skiprows,
|
||||
nrows=nrows,
|
||||
na_values=na_values,
|
||||
parse_dates=parse_dates,
|
||||
date_parser=date_parser,
|
||||
thousands=thousands,
|
||||
comment=comment,
|
||||
skipfooter=skipfooter,
|
||||
convert_float=convert_float,
|
||||
mangle_dupe_cols=mangle_dupe_cols,
|
||||
**kwds
|
||||
)
|
||||
|
||||
@property
|
||||
def book(self):
|
||||
return self._reader.book
|
||||
|
||||
@property
|
||||
def sheet_names(self):
|
||||
return self._reader.sheet_names
|
||||
|
||||
def close(self):
|
||||
"""close io if necessary"""
|
||||
if hasattr(self.io, "close"):
|
||||
self.io.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.close()
|
180
venv/lib/python3.6/site-packages/pandas/io/excel/_odfreader.py
Normal file
@@ -0,0 +1,180 @@
|
||||
from typing import List
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
import pandas as pd
|
||||
from pandas._typing import FilePathOrBuffer, Scalar
|
||||
|
||||
from pandas.io.excel._base import _BaseExcelReader
|
||||
|
||||
|
||||
class _ODFReader(_BaseExcelReader):
|
||||
"""Read tables out of OpenDocument formatted files
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer: string, path to be parsed or
|
||||
an open readable stream.
|
||||
"""
|
||||
|
||||
def __init__(self, filepath_or_buffer: FilePathOrBuffer):
|
||||
import_optional_dependency("odf")
|
||||
super().__init__(filepath_or_buffer)
|
||||
|
||||
@property
|
||||
def _workbook_class(self):
|
||||
from odf.opendocument import OpenDocument
|
||||
|
||||
return OpenDocument
|
||||
|
||||
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
|
||||
from odf.opendocument import load
|
||||
|
||||
return load(filepath_or_buffer)
|
||||
|
||||
@property
|
||||
def empty_value(self) -> str:
|
||||
"""Property for compat with other readers."""
|
||||
return ""
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> List[str]:
|
||||
"""Return a list of sheet names present in the document"""
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return [t.getAttribute("name") for t in tables]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
return tables[index]
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
from odf.table import Table
|
||||
|
||||
tables = self.book.getElementsByType(Table)
|
||||
|
||||
for table in tables:
|
||||
if table.getAttribute("name") == name:
|
||||
return table
|
||||
|
||||
raise ValueError("sheet {name} not found".format(name))
|
||||
|
||||
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
|
||||
"""Parse an ODF Table into a list of lists
|
||||
"""
|
||||
from odf.table import CoveredTableCell, TableCell, TableRow
|
||||
|
||||
covered_cell_name = CoveredTableCell().qname
|
||||
table_cell_name = TableCell().qname
|
||||
cell_names = {covered_cell_name, table_cell_name}
|
||||
|
||||
sheet_rows = sheet.getElementsByType(TableRow)
|
||||
empty_rows = 0
|
||||
max_row_len = 0
|
||||
|
||||
table = [] # type: List[List[Scalar]]
|
||||
|
||||
for i, sheet_row in enumerate(sheet_rows):
|
||||
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
|
||||
empty_cells = 0
|
||||
table_row = [] # type: List[Scalar]
|
||||
|
||||
for j, sheet_cell in enumerate(sheet_cells):
|
||||
if sheet_cell.qname == table_cell_name:
|
||||
value = self._get_cell_value(sheet_cell, convert_float)
|
||||
else:
|
||||
value = self.empty_value
|
||||
|
||||
column_repeat = self._get_column_repeat(sheet_cell)
|
||||
|
||||
# Queue up empty values, writing only if content succeeds them
|
||||
if value == self.empty_value:
|
||||
empty_cells += column_repeat
|
||||
else:
|
||||
table_row.extend([self.empty_value] * empty_cells)
|
||||
empty_cells = 0
|
||||
table_row.extend([value] * column_repeat)
|
||||
|
||||
if max_row_len < len(table_row):
|
||||
max_row_len = len(table_row)
|
||||
|
||||
row_repeat = self._get_row_repeat(sheet_row)
|
||||
if self._is_empty_row(sheet_row):
|
||||
empty_rows += row_repeat
|
||||
else:
|
||||
# add blank rows to our table
|
||||
table.extend([[self.empty_value]] * empty_rows)
|
||||
empty_rows = 0
|
||||
for _ in range(row_repeat):
|
||||
table.append(table_row)
|
||||
|
||||
# Make our table square
|
||||
for row in table:
|
||||
if len(row) < max_row_len:
|
||||
row.extend([self.empty_value] * (max_row_len - len(row)))
|
||||
|
||||
return table
|
||||
|
||||
def _get_row_repeat(self, row) -> int:
|
||||
"""Return number of times this row was repeated
|
||||
Repeating an empty row appeared to be a common way
|
||||
of representing sparse rows in the table.
|
||||
"""
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
|
||||
|
||||
def _get_column_repeat(self, cell) -> int:
|
||||
from odf.namespaces import TABLENS
|
||||
|
||||
return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
|
||||
|
||||
def _is_empty_row(self, row) -> bool:
|
||||
"""Helper function to find empty rows
|
||||
"""
|
||||
for column in row.childNodes:
|
||||
if len(column.childNodes) > 0:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
|
||||
from odf.namespaces import OFFICENS
|
||||
|
||||
cell_type = cell.attributes.get((OFFICENS, "value-type"))
|
||||
if cell_type == "boolean":
|
||||
if str(cell) == "TRUE":
|
||||
return True
|
||||
return False
|
||||
if cell_type is None:
|
||||
return self.empty_value
|
||||
elif cell_type == "float":
|
||||
# GH5394
|
||||
cell_value = float(cell.attributes.get((OFFICENS, "value")))
|
||||
|
||||
if cell_value == 0.0 and str(cell) != cell_value: # NA handling
|
||||
return str(cell)
|
||||
|
||||
if convert_float:
|
||||
val = int(cell_value)
|
||||
if val == cell_value:
|
||||
return val
|
||||
return cell_value
|
||||
elif cell_type == "percentage":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "string":
|
||||
return str(cell)
|
||||
elif cell_type == "currency":
|
||||
cell_value = cell.attributes.get((OFFICENS, "value"))
|
||||
return float(cell_value)
|
||||
elif cell_type == "date":
|
||||
cell_value = cell.attributes.get((OFFICENS, "date-value"))
|
||||
return pd.to_datetime(cell_value)
|
||||
elif cell_type == "time":
|
||||
return pd.to_datetime(str(cell)).time()
|
||||
else:
|
||||
raise ValueError("Unrecognized type {}".format(cell_type))
|
522
venv/lib/python3.6/site-packages/pandas/io/excel/_openpyxl.py
Normal file
@@ -0,0 +1,522 @@
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas._typing import FilePathOrBuffer, Scalar
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
|
||||
from pandas.io.excel._util import _validate_freeze_panes
|
||||
|
||||
|
||||
class _OpenpyxlWriter(ExcelWriter):
|
||||
engine = "openpyxl"
|
||||
supported_extensions = (".xlsx", ".xlsm")
|
||||
|
||||
def __init__(self, path, engine=None, mode="w", **engine_kwargs):
|
||||
# Use the openpyxl module as the Excel writer.
|
||||
from openpyxl.workbook import Workbook
|
||||
|
||||
super().__init__(path, mode=mode, **engine_kwargs)
|
||||
|
||||
if self.mode == "a": # Load from existing workbook
|
||||
from openpyxl import load_workbook
|
||||
|
||||
book = load_workbook(self.path)
|
||||
self.book = book
|
||||
else:
|
||||
# Create workbook object with default optimized_write=True.
|
||||
self.book = Workbook()
|
||||
|
||||
if self.book.worksheets:
|
||||
try:
|
||||
self.book.remove(self.book.worksheets[0])
|
||||
except AttributeError:
|
||||
|
||||
# compat - for openpyxl <= 2.4
|
||||
self.book.remove_sheet(self.book.worksheets[0])
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
return self.book.save(self.path)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_style(cls, style_dict):
|
||||
"""
|
||||
converts a style_dict to an openpyxl style object
|
||||
Parameters
|
||||
----------
|
||||
style_dict : style dictionary to convert
|
||||
"""
|
||||
|
||||
from openpyxl.style import Style
|
||||
|
||||
xls_style = Style()
|
||||
for key, value in style_dict.items():
|
||||
for nk, nv in value.items():
|
||||
if key == "borders":
|
||||
(
|
||||
xls_style.borders.__getattribute__(nk).__setattr__(
|
||||
"border_style", nv
|
||||
)
|
||||
)
|
||||
else:
|
||||
xls_style.__getattribute__(key).__setattr__(nk, nv)
|
||||
|
||||
return xls_style
|
||||
|
||||
@classmethod
|
||||
def _convert_to_style_kwargs(cls, style_dict):
|
||||
"""
|
||||
Convert a style_dict to a set of kwargs suitable for initializing
|
||||
or updating-on-copy an openpyxl v2 style object
|
||||
Parameters
|
||||
----------
|
||||
style_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'font'
|
||||
'fill'
|
||||
'border' ('borders')
|
||||
'alignment'
|
||||
'number_format'
|
||||
'protection'
|
||||
Returns
|
||||
-------
|
||||
style_kwargs : dict
|
||||
A dict with the same, normalized keys as ``style_dict`` but each
|
||||
value has been replaced with a native openpyxl style object of the
|
||||
appropriate class.
|
||||
"""
|
||||
|
||||
_style_key_map = {"borders": "border"}
|
||||
|
||||
style_kwargs = {}
|
||||
for k, v in style_dict.items():
|
||||
if k in _style_key_map:
|
||||
k = _style_key_map[k]
|
||||
_conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None)
|
||||
new_v = _conv_to_x(v)
|
||||
if new_v:
|
||||
style_kwargs[k] = new_v
|
||||
|
||||
return style_kwargs
|
||||
|
||||
@classmethod
|
||||
def _convert_to_color(cls, color_spec):
|
||||
"""
|
||||
Convert ``color_spec`` to an openpyxl v2 Color object
|
||||
Parameters
|
||||
----------
|
||||
color_spec : str, dict
|
||||
A 32-bit ARGB hex string, or a dict with zero or more of the
|
||||
following keys.
|
||||
'rgb'
|
||||
'indexed'
|
||||
'auto'
|
||||
'theme'
|
||||
'tint'
|
||||
'index'
|
||||
'type'
|
||||
Returns
|
||||
-------
|
||||
color : openpyxl.styles.Color
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Color
|
||||
|
||||
if isinstance(color_spec, str):
|
||||
return Color(color_spec)
|
||||
else:
|
||||
return Color(**color_spec)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_font(cls, font_dict):
|
||||
"""
|
||||
Convert ``font_dict`` to an openpyxl v2 Font object
|
||||
Parameters
|
||||
----------
|
||||
font_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'name'
|
||||
'size' ('sz')
|
||||
'bold' ('b')
|
||||
'italic' ('i')
|
||||
'underline' ('u')
|
||||
'strikethrough' ('strike')
|
||||
'color'
|
||||
'vertAlign' ('vertalign')
|
||||
'charset'
|
||||
'scheme'
|
||||
'family'
|
||||
'outline'
|
||||
'shadow'
|
||||
'condense'
|
||||
Returns
|
||||
-------
|
||||
font : openpyxl.styles.Font
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Font
|
||||
|
||||
_font_key_map = {
|
||||
"sz": "size",
|
||||
"b": "bold",
|
||||
"i": "italic",
|
||||
"u": "underline",
|
||||
"strike": "strikethrough",
|
||||
"vertalign": "vertAlign",
|
||||
}
|
||||
|
||||
font_kwargs = {}
|
||||
for k, v in font_dict.items():
|
||||
if k in _font_key_map:
|
||||
k = _font_key_map[k]
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
font_kwargs[k] = v
|
||||
|
||||
return Font(**font_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_stop(cls, stop_seq):
|
||||
"""
|
||||
Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
|
||||
suitable for initializing the ``GradientFill`` ``stop`` parameter.
|
||||
Parameters
|
||||
----------
|
||||
stop_seq : iterable
|
||||
An iterable that yields objects suitable for consumption by
|
||||
``_convert_to_color``.
|
||||
Returns
|
||||
-------
|
||||
stop : list of openpyxl.styles.Color
|
||||
"""
|
||||
|
||||
return map(cls._convert_to_color, stop_seq)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_fill(cls, fill_dict):
|
||||
"""
|
||||
Convert ``fill_dict`` to an openpyxl v2 Fill object
|
||||
Parameters
|
||||
----------
|
||||
fill_dict : dict
|
||||
A dict with one or more of the following keys (or their synonyms),
|
||||
'fill_type' ('patternType', 'patterntype')
|
||||
'start_color' ('fgColor', 'fgcolor')
|
||||
'end_color' ('bgColor', 'bgcolor')
|
||||
or one or more of the following keys (or their synonyms).
|
||||
'type' ('fill_type')
|
||||
'degree'
|
||||
'left'
|
||||
'right'
|
||||
'top'
|
||||
'bottom'
|
||||
'stop'
|
||||
Returns
|
||||
-------
|
||||
fill : openpyxl.styles.Fill
|
||||
"""
|
||||
|
||||
from openpyxl.styles import PatternFill, GradientFill
|
||||
|
||||
_pattern_fill_key_map = {
|
||||
"patternType": "fill_type",
|
||||
"patterntype": "fill_type",
|
||||
"fgColor": "start_color",
|
||||
"fgcolor": "start_color",
|
||||
"bgColor": "end_color",
|
||||
"bgcolor": "end_color",
|
||||
}
|
||||
|
||||
_gradient_fill_key_map = {"fill_type": "type"}
|
||||
|
||||
pfill_kwargs = {}
|
||||
gfill_kwargs = {}
|
||||
for k, v in fill_dict.items():
|
||||
pk = gk = None
|
||||
if k in _pattern_fill_key_map:
|
||||
pk = _pattern_fill_key_map[k]
|
||||
if k in _gradient_fill_key_map:
|
||||
gk = _gradient_fill_key_map[k]
|
||||
if pk in ["start_color", "end_color"]:
|
||||
v = cls._convert_to_color(v)
|
||||
if gk == "stop":
|
||||
v = cls._convert_to_stop(v)
|
||||
if pk:
|
||||
pfill_kwargs[pk] = v
|
||||
elif gk:
|
||||
gfill_kwargs[gk] = v
|
||||
else:
|
||||
pfill_kwargs[k] = v
|
||||
gfill_kwargs[k] = v
|
||||
|
||||
try:
|
||||
return PatternFill(**pfill_kwargs)
|
||||
except TypeError:
|
||||
return GradientFill(**gfill_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_side(cls, side_spec):
|
||||
"""
|
||||
Convert ``side_spec`` to an openpyxl v2 Side object
|
||||
Parameters
|
||||
----------
|
||||
side_spec : str, dict
|
||||
A string specifying the border style, or a dict with zero or more
|
||||
of the following keys (or their synonyms).
|
||||
'style' ('border_style')
|
||||
'color'
|
||||
Returns
|
||||
-------
|
||||
side : openpyxl.styles.Side
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Side
|
||||
|
||||
_side_key_map = {"border_style": "style"}
|
||||
|
||||
if isinstance(side_spec, str):
|
||||
return Side(style=side_spec)
|
||||
|
||||
side_kwargs = {}
|
||||
for k, v in side_spec.items():
|
||||
if k in _side_key_map:
|
||||
k = _side_key_map[k]
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
side_kwargs[k] = v
|
||||
|
||||
return Side(**side_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_border(cls, border_dict):
|
||||
"""
|
||||
Convert ``border_dict`` to an openpyxl v2 Border object
|
||||
Parameters
|
||||
----------
|
||||
border_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'left'
|
||||
'right'
|
||||
'top'
|
||||
'bottom'
|
||||
'diagonal'
|
||||
'diagonal_direction'
|
||||
'vertical'
|
||||
'horizontal'
|
||||
'diagonalUp' ('diagonalup')
|
||||
'diagonalDown' ('diagonaldown')
|
||||
'outline'
|
||||
Returns
|
||||
-------
|
||||
border : openpyxl.styles.Border
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Border
|
||||
|
||||
_border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
|
||||
|
||||
border_kwargs = {}
|
||||
for k, v in border_dict.items():
|
||||
if k in _border_key_map:
|
||||
k = _border_key_map[k]
|
||||
if k == "color":
|
||||
v = cls._convert_to_color(v)
|
||||
if k in ["left", "right", "top", "bottom", "diagonal"]:
|
||||
v = cls._convert_to_side(v)
|
||||
border_kwargs[k] = v
|
||||
|
||||
return Border(**border_kwargs)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_alignment(cls, alignment_dict):
|
||||
"""
|
||||
Convert ``alignment_dict`` to an openpyxl v2 Alignment object
|
||||
Parameters
|
||||
----------
|
||||
alignment_dict : dict
|
||||
A dict with zero or more of the following keys (or their synonyms).
|
||||
'horizontal'
|
||||
'vertical'
|
||||
'text_rotation'
|
||||
'wrap_text'
|
||||
'shrink_to_fit'
|
||||
'indent'
|
||||
Returns
|
||||
-------
|
||||
alignment : openpyxl.styles.Alignment
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Alignment
|
||||
|
||||
return Alignment(**alignment_dict)
|
||||
|
||||
@classmethod
|
||||
def _convert_to_number_format(cls, number_format_dict):
|
||||
"""
|
||||
Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
|
||||
initializer.
|
||||
Parameters
|
||||
----------
|
||||
number_format_dict : dict
|
||||
A dict with zero or more of the following keys.
|
||||
'format_code' : str
|
||||
Returns
|
||||
-------
|
||||
number_format : str
|
||||
"""
|
||||
return number_format_dict["format_code"]
|
||||
|
||||
@classmethod
|
||||
def _convert_to_protection(cls, protection_dict):
|
||||
"""
|
||||
Convert ``protection_dict`` to an openpyxl v2 Protection object.
|
||||
Parameters
|
||||
----------
|
||||
protection_dict : dict
|
||||
A dict with zero or more of the following keys.
|
||||
'locked'
|
||||
'hidden'
|
||||
Returns
|
||||
-------
|
||||
"""
|
||||
|
||||
from openpyxl.styles import Protection
|
||||
|
||||
return Protection(**protection_dict)
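# Illustrative call tying the converters above together (not part of the
# source; requires openpyxl). A nested style_dict from the Excel formatter
# becomes keyword arguments holding native openpyxl style objects.
style_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(
    {"font": {"bold": True, "color": "FF000000"}, "borders": {"bottom": "thin"}}
)
# style_kwargs == {"font": Font(bold=True, color=Color("FF000000")),
#                  "border": Border(bottom=Side(style="thin"))}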
|
||||
|
||||
def write_cells(
|
||||
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
|
||||
):
|
||||
# Write the frame cells using openpyxl.
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
|
||||
_style_cache = {}
|
||||
|
||||
if sheet_name in self.sheets:
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
wks = self.book.create_sheet()
|
||||
wks.title = sheet_name
|
||||
self.sheets[sheet_name] = wks
|
||||
|
||||
if _validate_freeze_panes(freeze_panes):
|
||||
wks.freeze_panes = wks.cell(
|
||||
row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
|
||||
)
|
||||
|
||||
for cell in cells:
|
||||
xcell = wks.cell(
|
||||
row=startrow + cell.row + 1, column=startcol + cell.col + 1
|
||||
)
|
||||
xcell.value, fmt = self._value_with_fmt(cell.val)
|
||||
if fmt:
|
||||
xcell.number_format = fmt
|
||||
|
||||
style_kwargs = {}
|
||||
if cell.style:
|
||||
key = str(cell.style)
|
||||
style_kwargs = _style_cache.get(key)
|
||||
if style_kwargs is None:
|
||||
style_kwargs = self._convert_to_style_kwargs(cell.style)
|
||||
_style_cache[key] = style_kwargs
|
||||
|
||||
if style_kwargs:
|
||||
for k, v in style_kwargs.items():
|
||||
setattr(xcell, k, v)
|
||||
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
|
||||
wks.merge_cells(
|
||||
start_row=startrow + cell.row + 1,
|
||||
start_column=startcol + cell.col + 1,
|
||||
end_column=startcol + cell.mergeend + 1,
|
||||
end_row=startrow + cell.mergestart + 1,
|
||||
)
|
||||
|
||||
# When cells are merged only the top-left cell is preserved
|
||||
# The behaviour of the other cells in a merged range is
|
||||
# undefined
|
||||
if style_kwargs:
|
||||
first_row = startrow + cell.row + 1
|
||||
last_row = startrow + cell.mergestart + 1
|
||||
first_col = startcol + cell.col + 1
|
||||
last_col = startcol + cell.mergeend + 1
|
||||
|
||||
for row in range(first_row, last_row + 1):
|
||||
for col in range(first_col, last_col + 1):
|
||||
if row == first_row and col == first_col:
|
||||
# Ignore first cell. It is already handled.
|
||||
continue
|
||||
xcell = wks.cell(column=col, row=row)
|
||||
for k, v in style_kwargs.items():
|
||||
setattr(xcell, k, v)
|
||||
|
||||
|
||||
class _OpenpyxlReader(_BaseExcelReader):
|
||||
def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
|
||||
"""Reader using openpyxl engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : string, path object or Workbook
|
||||
Object to be parsed.
|
||||
"""
|
||||
import_optional_dependency("openpyxl")
|
||||
super().__init__(filepath_or_buffer)
|
||||
|
||||
@property
|
||||
def _workbook_class(self):
|
||||
from openpyxl import Workbook
|
||||
|
||||
return Workbook
|
||||
|
||||
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
|
||||
from openpyxl import load_workbook
|
||||
|
||||
return load_workbook(
|
||||
filepath_or_buffer, read_only=True, data_only=True, keep_links=False
|
||||
)
|
||||
|
||||
@property
|
||||
def sheet_names(self) -> List[str]:
|
||||
return self.book.sheetnames
|
||||
|
||||
def get_sheet_by_name(self, name: str):
|
||||
return self.book[name]
|
||||
|
||||
def get_sheet_by_index(self, index: int):
|
||||
return self.book.worksheets[index]
|
||||
|
||||
def _convert_cell(self, cell, convert_float: bool) -> Scalar:
|
||||
|
||||
# TODO: replace with openpyxl constants
|
||||
if cell.is_date:
|
||||
return cell.value
|
||||
elif cell.data_type == "e":
|
||||
return np.nan
|
||||
elif cell.data_type == "b":
|
||||
return bool(cell.value)
|
||||
elif cell.value is None:
|
||||
return "" # compat with xlrd
|
||||
elif cell.data_type == "n":
|
||||
# GH5394
|
||||
if convert_float:
|
||||
val = int(cell.value)
|
||||
if val == cell.value:
|
||||
return val
|
||||
else:
|
||||
return float(cell.value)
|
||||
|
||||
return cell.value
|
||||
|
||||
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
|
||||
data = [] # type: List[List[Scalar]]
|
||||
for row in sheet.rows:
|
||||
data.append([self._convert_cell(cell, convert_float) for cell in row])
|
||||
|
||||
return data
|
236
venv/lib/python3.6/site-packages/pandas/io/excel/_util.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import warnings
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.core.dtypes.common import is_integer, is_list_like
|
||||
|
||||
_writers = {}
|
||||
|
||||
|
||||
def register_writer(klass):
|
||||
"""
|
||||
Add engine to the excel writer registry.
|
||||
|
||||
You must use this method to integrate with ``to_excel``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
klass : ExcelWriter
|
||||
"""
|
||||
if not callable(klass):
|
||||
raise ValueError("Can only register callables as engines")
|
||||
engine_name = klass.engine
|
||||
_writers[engine_name] = klass
|
||||
|
||||
|
||||
def _get_default_writer(ext):
|
||||
"""
|
||||
Return the default writer for the given extension.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ext : str
|
||||
The excel file extension for which to get the default engine.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
The default engine for the extension.
|
||||
"""
|
||||
_default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"}
|
||||
xlsxwriter = import_optional_dependency(
|
||||
"xlsxwriter", raise_on_missing=False, on_version="warn"
|
||||
)
|
||||
if xlsxwriter:
|
||||
_default_writers["xlsx"] = "xlsxwriter"
|
||||
return _default_writers[ext]
|
||||
|
||||
|
||||
def get_writer(engine_name):
|
||||
try:
|
||||
return _writers[engine_name]
|
||||
except KeyError:
|
||||
raise ValueError("No Excel writer '{engine}'".format(engine=engine_name))
|
||||
|
||||
|
||||
def _excel2num(x):
|
||||
"""
|
||||
Convert Excel column name like 'AB' to 0-based column index.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : str
|
||||
The Excel column name to convert to a 0-based column index.
|
||||
|
||||
Returns
|
||||
-------
|
||||
num : int
|
||||
The column index corresponding to the name.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
Part of the Excel column name was invalid.
|
||||
"""
|
||||
index = 0
|
||||
|
||||
for c in x.upper().strip():
|
||||
cp = ord(c)
|
||||
|
||||
if cp < ord("A") or cp > ord("Z"):
|
||||
raise ValueError("Invalid column name: {x}".format(x=x))
|
||||
|
||||
index = index * 26 + cp - ord("A") + 1
|
||||
|
||||
return index - 1
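# Quick check of the conversion above (illustrative, not part of the source):
# column letters act as base-26 digits, so "A" -> 0 and "AB" -> 27.
assert _excel2num("A") == 0
assert _excel2num("AB") == 27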
|
||||
|
||||
|
||||
def _range2cols(areas):
|
||||
"""
|
||||
Convert comma separated list of column names and ranges to indices.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
areas : str
|
||||
A string containing a sequence of column ranges (or areas).
|
||||
|
||||
Returns
|
||||
-------
|
||||
cols : list
|
||||
A list of 0-based column indices.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _range2cols('A:E')
|
||||
[0, 1, 2, 3, 4]
|
||||
>>> _range2cols('A,C,Z:AB')
|
||||
[0, 2, 25, 26, 27]
|
||||
"""
|
||||
cols = []
|
||||
|
||||
for rng in areas.split(","):
|
||||
if ":" in rng:
|
||||
rng = rng.split(":")
|
||||
cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1))
|
||||
else:
|
||||
cols.append(_excel2num(rng))
|
||||
|
||||
return cols
|
||||
|
||||
|
||||
def _maybe_convert_usecols(usecols):
|
||||
"""
|
||||
Convert `usecols` into a compatible format for parsing in `parsers.py`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
usecols : object
|
||||
The use-columns object to potentially convert.
|
||||
|
||||
Returns
|
||||
-------
|
||||
converted : object
|
||||
The compatible format of `usecols`.
|
||||
"""
|
||||
if usecols is None:
|
||||
return usecols
|
||||
|
||||
if is_integer(usecols):
|
||||
warnings.warn(
|
||||
(
|
||||
"Passing in an integer for `usecols` has been "
|
||||
"deprecated. Please pass in a list of int from "
|
||||
"0 to `usecols` inclusive instead."
|
||||
),
|
||||
FutureWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return list(range(usecols + 1))
|
||||
|
||||
if isinstance(usecols, str):
|
||||
return _range2cols(usecols)
|
||||
|
||||
return usecols
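# Illustrative examples of the normalization above (not part of the source):
# strings expand through _range2cols, a deprecated integer pads out to a full
# range with a FutureWarning, and list-likes pass through untouched.
assert _maybe_convert_usecols("A:C") == [0, 1, 2]
assert _maybe_convert_usecols(["foo", "bar"]) == ["foo", "bar"]
# _maybe_convert_usecols(2) warns and returns [0, 1, 2]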
|
||||
|
||||
|
||||
def _validate_freeze_panes(freeze_panes):
|
||||
if freeze_panes is not None:
|
||||
if len(freeze_panes) == 2 and all(
|
||||
isinstance(item, int) for item in freeze_panes
|
||||
):
|
||||
return True
|
||||
|
||||
raise ValueError(
|
||||
"freeze_panes must be of form (row, column)"
|
||||
" where row and column are integers"
|
||||
)
|
||||
|
||||
# freeze_panes wasn't specified, return False so it won't be applied
|
||||
# to output sheet
|
||||
return False
|
||||
|
||||
|
||||
def _trim_excel_header(row):
|
||||
# trim header row so auto-index inference works
|
||||
# xlrd uses '' , openpyxl None
|
||||
while len(row) > 0 and (row[0] == "" or row[0] is None):
|
||||
row = row[1:]
|
||||
return row
|
||||
|
||||
|
||||
def _fill_mi_header(row, control_row):
|
||||
"""Forward fill blank entries in row but only inside the same parent index.
|
||||
|
||||
Used for creating headers in Multiindex.
|
||||
Parameters
|
||||
----------
|
||||
row : list
|
||||
List of items in a single row.
|
||||
control_row : list of bool
|
||||
Helps to determine if particular column is in same parent index as the
|
||||
previous value. Used to stop propagation of empty cells between
|
||||
different indexes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Returns changed row and control_row
|
||||
"""
|
||||
last = row[0]
|
||||
for i in range(1, len(row)):
|
||||
if not control_row[i]:
|
||||
last = row[i]
|
||||
|
||||
if row[i] == "" or row[i] is None:
|
||||
row[i] = last
|
||||
else:
|
||||
control_row[i] = False
|
||||
last = row[i]
|
||||
|
||||
return row, control_row
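# Worked example of the forward fill above (illustrative, not part of the
# source): blanks inherit the label to their left until a new parent label
# resets the fill.
row, control_row = _fill_mi_header(["a", "", "", "b", ""], [True] * 5)
# row == ["a", "a", "a", "b", "b"]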
|
||||
|
||||
|
||||
def _pop_header_name(row, index_col):
|
||||
"""
|
||||
Pop the header name for MultiIndex parsing.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
row : list
|
||||
The data row to parse for the header name.
|
||||
index_col : int, list
|
||||
The index columns for our data. Assumed to be non-null.
|
||||
|
||||
Returns
|
||||
-------
|
||||
header_name : str
|
||||
The extracted header name.
|
||||
trimmed_row : list
|
||||
The original data row with the header name removed.
|
||||
"""
|
||||
# Pop out header name and fill w/blank.
|
||||
i = index_col if not is_list_like(index_col) else max(index_col)
|
||||
|
||||
header_name = row[i]
|
||||
header_name = None if header_name == "" else header_name
|
||||
|
||||
return header_name, row[:i] + [""] + row[i + 1 :]
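# Small illustration of the helper above (not part of the source): the label
# sitting over the last index column is popped out and blanked in the row.
name, trimmed = _pop_header_name(["", "year", "col1", "col2"], index_col=[0, 1])
# name == "year", trimmed == ["", "", "col1", "col2"]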
|
106
venv/lib/python3.6/site-packages/pandas/io/excel/_xlrd.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from datetime import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
from pandas.io.excel._base import _BaseExcelReader
|
||||
|
||||
|
||||
class _XlrdReader(_BaseExcelReader):
|
||||
def __init__(self, filepath_or_buffer):
|
||||
"""Reader using xlrd engine.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : string, path object or Workbook
|
||||
Object to be parsed.
|
||||
"""
|
||||
err_msg = "Install xlrd >= 1.0.0 for Excel support"
|
||||
import_optional_dependency("xlrd", extra=err_msg)
|
||||
super().__init__(filepath_or_buffer)
|
||||
|
||||
@property
|
||||
def _workbook_class(self):
|
||||
from xlrd import Book
|
||||
|
||||
return Book
|
||||
|
||||
def load_workbook(self, filepath_or_buffer):
|
||||
from xlrd import open_workbook
|
||||
|
||||
if hasattr(filepath_or_buffer, "read"):
|
||||
data = filepath_or_buffer.read()
|
||||
return open_workbook(file_contents=data)
|
||||
else:
|
||||
return open_workbook(filepath_or_buffer)
|
||||
|
||||
@property
|
||||
def sheet_names(self):
|
||||
return self.book.sheet_names()
|
||||
|
||||
def get_sheet_by_name(self, name):
|
||||
return self.book.sheet_by_name(name)
|
||||
|
||||
def get_sheet_by_index(self, index):
|
||||
return self.book.sheet_by_index(index)
|
||||
|
||||
def get_sheet_data(self, sheet, convert_float):
|
||||
from xlrd import (
|
||||
xldate,
|
||||
XL_CELL_DATE,
|
||||
XL_CELL_ERROR,
|
||||
XL_CELL_BOOLEAN,
|
||||
XL_CELL_NUMBER,
|
||||
)
|
||||
|
||||
epoch1904 = self.book.datemode
|
||||
|
||||
def _parse_cell(cell_contents, cell_typ):
|
||||
"""converts the contents of the cell into a pandas
|
||||
appropriate object"""
|
||||
|
||||
if cell_typ == XL_CELL_DATE:
|
||||
|
||||
# Use the newer xlrd datetime handling.
|
||||
try:
|
||||
cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
|
||||
except OverflowError:
|
||||
return cell_contents
|
||||
|
||||
# Excel doesn't distinguish between dates and time,
|
||||
# so we treat dates on the epoch as times only.
|
||||
# Also, Excel supports 1900 and 1904 epochs.
|
||||
year = (cell_contents.timetuple())[0:3]
|
||||
if (not epoch1904 and year == (1899, 12, 31)) or (
|
||||
epoch1904 and year == (1904, 1, 1)
|
||||
):
|
||||
cell_contents = time(
|
||||
cell_contents.hour,
|
||||
cell_contents.minute,
|
||||
cell_contents.second,
|
||||
cell_contents.microsecond,
|
||||
)
|
||||
|
||||
elif cell_typ == XL_CELL_ERROR:
|
||||
cell_contents = np.nan
|
||||
elif cell_typ == XL_CELL_BOOLEAN:
|
||||
cell_contents = bool(cell_contents)
|
||||
elif convert_float and cell_typ == XL_CELL_NUMBER:
|
||||
# GH5394 - Excel 'numbers' are always floats
|
||||
# it's a minimal perf hit and less surprising
|
||||
val = int(cell_contents)
|
||||
if val == cell_contents:
|
||||
cell_contents = val
|
||||
return cell_contents
|
||||
|
||||
data = []
|
||||
|
||||
for i in range(sheet.nrows):
|
||||
row = [
|
||||
_parse_cell(value, typ)
|
||||
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
|
||||
]
|
||||
data.append(row)
|
||||
|
||||
return data
|
237
venv/lib/python3.6/site-packages/pandas/io/excel/_xlsxwriter.py
Normal file
@@ -0,0 +1,237 @@
|
||||
import pandas._libs.json as json
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import _validate_freeze_panes
|
||||
|
||||
|
||||
class _XlsxStyler:
|
||||
# Map from openpyxl-oriented styles to flatter xlsxwriter representation
|
||||
# Ordering necessary for both determinism and because some are keyed by
|
||||
# prefixes of others.
|
||||
STYLE_MAPPING = {
|
||||
"font": [
|
||||
(("name",), "font_name"),
|
||||
(("sz",), "font_size"),
|
||||
(("size",), "font_size"),
|
||||
(("color", "rgb"), "font_color"),
|
||||
(("color",), "font_color"),
|
||||
(("b",), "bold"),
|
||||
(("bold",), "bold"),
|
||||
(("i",), "italic"),
|
||||
(("italic",), "italic"),
|
||||
(("u",), "underline"),
|
||||
(("underline",), "underline"),
|
||||
(("strike",), "font_strikeout"),
|
||||
(("vertAlign",), "font_script"),
|
||||
(("vertalign",), "font_script"),
|
||||
],
|
||||
"number_format": [(("format_code",), "num_format"), ((), "num_format")],
|
||||
"protection": [(("locked",), "locked"), (("hidden",), "hidden")],
|
||||
"alignment": [
|
||||
(("horizontal",), "align"),
|
||||
(("vertical",), "valign"),
|
||||
(("text_rotation",), "rotation"),
|
||||
(("wrap_text",), "text_wrap"),
|
||||
(("indent",), "indent"),
|
||||
(("shrink_to_fit",), "shrink"),
|
||||
],
|
||||
"fill": [
|
||||
(("patternType",), "pattern"),
|
||||
(("patterntype",), "pattern"),
|
||||
(("fill_type",), "pattern"),
|
||||
(("start_color", "rgb"), "fg_color"),
|
||||
(("fgColor", "rgb"), "fg_color"),
|
||||
(("fgcolor", "rgb"), "fg_color"),
|
||||
(("start_color",), "fg_color"),
|
||||
(("fgColor",), "fg_color"),
|
||||
(("fgcolor",), "fg_color"),
|
||||
(("end_color", "rgb"), "bg_color"),
|
||||
(("bgColor", "rgb"), "bg_color"),
|
||||
(("bgcolor", "rgb"), "bg_color"),
|
||||
(("end_color",), "bg_color"),
|
||||
(("bgColor",), "bg_color"),
|
||||
(("bgcolor",), "bg_color"),
|
||||
],
|
||||
"border": [
|
||||
(("color", "rgb"), "border_color"),
|
||||
(("color",), "border_color"),
|
||||
(("style",), "border"),
|
||||
(("top", "color", "rgb"), "top_color"),
|
||||
(("top", "color"), "top_color"),
|
||||
(("top", "style"), "top"),
|
||||
(("top",), "top"),
|
||||
(("right", "color", "rgb"), "right_color"),
|
||||
(("right", "color"), "right_color"),
|
||||
(("right", "style"), "right"),
|
||||
(("right",), "right"),
|
||||
(("bottom", "color", "rgb"), "bottom_color"),
|
||||
(("bottom", "color"), "bottom_color"),
|
||||
(("bottom", "style"), "bottom"),
|
||||
(("bottom",), "bottom"),
|
||||
(("left", "color", "rgb"), "left_color"),
|
||||
(("left", "color"), "left_color"),
|
||||
(("left", "style"), "left"),
|
||||
(("left",), "left"),
|
||||
],
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def convert(cls, style_dict, num_format_str=None):
|
||||
"""
|
||||
converts a style_dict to an xlsxwriter format dict
|
||||
|
||||
Parameters
|
||||
----------
|
||||
style_dict : style dictionary to convert
|
||||
num_format_str : optional number format string
|
||||
"""
|
||||
|
||||
# Create a XlsxWriter format object.
|
||||
props = {}
|
||||
|
||||
if num_format_str is not None:
|
||||
props["num_format"] = num_format_str
|
||||
|
||||
if style_dict is None:
|
||||
return props
|
||||
|
||||
if "borders" in style_dict:
|
||||
style_dict = style_dict.copy()
|
||||
style_dict["border"] = style_dict.pop("borders")
|
||||
|
||||
for style_group_key, style_group in style_dict.items():
|
||||
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
|
||||
# src is a sequence of keys into a nested dict
|
||||
# dst is a flat key
|
||||
if dst in props:
|
||||
continue
|
||||
v = style_group
|
||||
for k in src:
|
||||
try:
|
||||
v = v[k]
|
||||
except (KeyError, TypeError):
|
||||
break
|
||||
else:
|
||||
props[dst] = v
|
||||
|
||||
if isinstance(props.get("pattern"), str):
|
||||
# TODO: support other fill patterns
|
||||
props["pattern"] = 0 if props["pattern"] == "none" else 1
|
||||
|
||||
for k in ["border", "top", "right", "bottom", "left"]:
|
||||
if isinstance(props.get(k), str):
|
||||
try:
|
||||
props[k] = [
|
||||
"none",
|
||||
"thin",
|
||||
"medium",
|
||||
"dashed",
|
||||
"dotted",
|
||||
"thick",
|
||||
"double",
|
||||
"hair",
|
||||
"mediumDashed",
|
||||
"dashDot",
|
||||
"mediumDashDot",
|
||||
"dashDotDot",
|
||||
"mediumDashDotDot",
|
||||
"slantDashDot",
|
||||
].index(props[k])
|
||||
except ValueError:
|
||||
props[k] = 2
|
||||
|
||||
if isinstance(props.get("font_script"), str):
|
||||
props["font_script"] = ["baseline", "superscript", "subscript"].index(
|
||||
props["font_script"]
|
||||
)
|
||||
|
||||
if isinstance(props.get("underline"), str):
|
||||
props["underline"] = {
|
||||
"none": 0,
|
||||
"single": 1,
|
||||
"double": 2,
|
||||
"singleAccounting": 33,
|
||||
"doubleAccounting": 34,
|
||||
}[props["underline"]]
|
||||
|
||||
return props
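# Illustrative call (not part of the source): the nested, openpyxl-oriented
# style dict produced by the Excel formatter flattens into the keyword
# arguments expected by xlsxwriter's add_format().
props = _XlsxStyler.convert(
    {"font": {"bold": True, "color": {"rgb": "FFFF0000"}},
     "borders": {"bottom": {"style": "thin"}}},
    num_format_str="0.00",
)
# props == {"num_format": "0.00", "font_color": "FFFF0000", "bold": True, "bottom": 1}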
|
||||
|
||||
|
||||
class _XlsxWriter(ExcelWriter):
|
||||
engine = "xlsxwriter"
|
||||
supported_extensions = (".xlsx",)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path,
|
||||
engine=None,
|
||||
date_format=None,
|
||||
datetime_format=None,
|
||||
mode="w",
|
||||
**engine_kwargs
|
||||
):
|
||||
# Use the xlsxwriter module as the Excel writer.
|
||||
import xlsxwriter
|
||||
|
||||
if mode == "a":
|
||||
raise ValueError("Append mode is not supported with xlsxwriter!")
|
||||
|
||||
super().__init__(
|
||||
path,
|
||||
engine=engine,
|
||||
date_format=date_format,
|
||||
datetime_format=datetime_format,
|
||||
mode=mode,
|
||||
**engine_kwargs
|
||||
)
|
||||
|
||||
self.book = xlsxwriter.Workbook(path, **engine_kwargs)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
|
||||
return self.book.close()
|
||||
|
||||
def write_cells(
|
||||
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
|
||||
):
|
||||
# Write the frame cells using xlsxwriter.
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
|
||||
if sheet_name in self.sheets:
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
wks = self.book.add_worksheet(sheet_name)
|
||||
self.sheets[sheet_name] = wks
|
||||
|
||||
style_dict = {"null": None}
|
||||
|
||||
if _validate_freeze_panes(freeze_panes):
|
||||
wks.freeze_panes(*(freeze_panes))
|
||||
|
||||
for cell in cells:
|
||||
val, fmt = self._value_with_fmt(cell.val)
|
||||
|
||||
stylekey = json.dumps(cell.style)
|
||||
if fmt:
|
||||
stylekey += fmt
|
||||
|
||||
if stylekey in style_dict:
|
||||
style = style_dict[stylekey]
|
||||
else:
|
||||
style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
|
||||
style_dict[stylekey] = style
|
||||
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
wks.merge_range(
|
||||
startrow + cell.row,
|
||||
startcol + cell.col,
|
||||
startrow + cell.mergestart,
|
||||
startcol + cell.mergeend,
|
||||
val,
|
||||
style,
|
||||
)
|
||||
else:
|
||||
wks.write(startrow + cell.row, startcol + cell.col, val, style)
|
135
venv/lib/python3.6/site-packages/pandas/io/excel/_xlwt.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import pandas._libs.json as json
|
||||
|
||||
from pandas.io.excel._base import ExcelWriter
|
||||
from pandas.io.excel._util import _validate_freeze_panes
|
||||
|
||||
|
||||
class _XlwtWriter(ExcelWriter):
|
||||
engine = "xlwt"
|
||||
supported_extensions = (".xls",)
|
||||
|
||||
def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs):
|
||||
# Use the xlwt module as the Excel writer.
|
||||
import xlwt
|
||||
|
||||
engine_kwargs["engine"] = engine
|
||||
|
||||
if mode == "a":
|
||||
raise ValueError("Append mode is not supported with xlwt!")
|
||||
|
||||
super().__init__(path, mode=mode, **engine_kwargs)
|
||||
|
||||
if encoding is None:
|
||||
encoding = "ascii"
|
||||
self.book = xlwt.Workbook(encoding=encoding)
|
||||
self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format)
|
||||
self.fm_date = xlwt.easyxf(num_format_str=self.date_format)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Save workbook to disk.
|
||||
"""
|
||||
return self.book.save(self.path)
|
||||
|
||||
def write_cells(
|
||||
self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
|
||||
):
|
||||
# Write the frame cells using xlwt.
|
||||
|
||||
sheet_name = self._get_sheet_name(sheet_name)
|
||||
|
||||
if sheet_name in self.sheets:
|
||||
wks = self.sheets[sheet_name]
|
||||
else:
|
||||
wks = self.book.add_sheet(sheet_name)
|
||||
self.sheets[sheet_name] = wks
|
||||
|
||||
if _validate_freeze_panes(freeze_panes):
|
||||
wks.set_panes_frozen(True)
|
||||
wks.set_horz_split_pos(freeze_panes[0])
|
||||
wks.set_vert_split_pos(freeze_panes[1])
|
||||
|
||||
style_dict = {}
|
||||
|
||||
for cell in cells:
|
||||
val, fmt = self._value_with_fmt(cell.val)
|
||||
|
||||
stylekey = json.dumps(cell.style)
|
||||
if fmt:
|
||||
stylekey += fmt
|
||||
|
||||
if stylekey in style_dict:
|
||||
style = style_dict[stylekey]
|
||||
else:
|
||||
style = self._convert_to_style(cell.style, fmt)
|
||||
style_dict[stylekey] = style
|
||||
|
||||
if cell.mergestart is not None and cell.mergeend is not None:
|
||||
wks.write_merge(
|
||||
startrow + cell.row,
|
||||
startrow + cell.mergestart,
|
||||
startcol + cell.col,
|
||||
startcol + cell.mergeend,
|
||||
val,
|
||||
style,
|
||||
)
|
||||
else:
|
||||
wks.write(startrow + cell.row, startcol + cell.col, val, style)
|
||||
|
||||
@classmethod
|
||||
def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"):
|
||||
"""helper which recursively generate an xlwt easy style string
|
||||
for example:
|
||||
|
||||
hstyle = {"font": {"bold": True},
|
||||
"border": {"top": "thin",
|
||||
"right": "thin",
|
||||
"bottom": "thin",
|
||||
"left": "thin"},
|
||||
"align": {"horiz": "center"}}
|
||||
will be converted to
|
||||
font: bold on; \
|
||||
border: top thin, right thin, bottom thin, left thin; \
|
||||
align: horiz center;
|
||||
"""
|
||||
if hasattr(item, "items"):
|
||||
if firstlevel:
|
||||
it = [
|
||||
"{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False))
|
||||
for key, value in item.items()
|
||||
]
|
||||
out = "{sep} ".format(sep=(line_sep).join(it))
|
||||
return out
|
||||
else:
|
||||
it = [
|
||||
"{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False))
|
||||
for key, value in item.items()
|
||||
]
|
||||
out = "{sep} ".format(sep=(field_sep).join(it))
|
||||
return out
|
||||
else:
|
||||
item = "{item}".format(item=item)
|
||||
item = item.replace("True", "on")
|
||||
item = item.replace("False", "off")
|
||||
return item
|
||||
|
||||
@classmethod
|
||||
def _convert_to_style(cls, style_dict, num_format_str=None):
|
||||
"""
|
||||
converts a style_dict to an xlwt style object
|
||||
Parameters
|
||||
----------
|
||||
style_dict : style dictionary to convert
|
||||
num_format_str : optional number format string
|
||||
"""
|
||||
import xlwt
|
||||
|
||||
if style_dict:
|
||||
xlwt_stylestr = cls._style_to_xlwt(style_dict)
|
||||
style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";")
|
||||
else:
|
||||
style = xlwt.XFStyle()
|
||||
if num_format_str is not None:
|
||||
style.num_format_str = num_format_str
|
||||
|
||||
return style
|
119
venv/lib/python3.6/site-packages/pandas/io/feather_format.py
Normal file
@@ -0,0 +1,119 @@
|
||||
""" feather-format compat """
|
||||
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.util._decorators import deprecate_kwarg
|
||||
|
||||
from pandas import DataFrame, Int64Index, RangeIndex
|
||||
|
||||
from pandas.io.common import _stringify_path
|
||||
|
||||
|
||||
def to_feather(df, path):
|
||||
"""
|
||||
Write a DataFrame to the feather-format
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
path : string file path, or file-like object
|
||||
|
||||
"""
|
||||
import_optional_dependency("pyarrow")
|
||||
from pyarrow import feather
|
||||
|
||||
path = _stringify_path(path)
|
||||
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("feather only support IO with DataFrames")
|
||||
|
||||
valid_types = {"string", "unicode"}
|
||||
|
||||
# validate index
|
||||
# --------------
|
||||
|
||||
# validate that we have only a default index
|
||||
# raise on anything else as we don't serialize the index
|
||||
|
||||
if not isinstance(df.index, Int64Index):
|
||||
raise ValueError(
|
||||
"feather does not support serializing {} "
|
||||
"for the index; you can .reset_index()"
|
||||
"to make the index into column(s)".format(type(df.index))
|
||||
)
|
||||
|
||||
if not df.index.equals(RangeIndex.from_range(range(len(df)))):
|
||||
raise ValueError(
|
||||
"feather does not support serializing a "
|
||||
"non-default index for the index; you "
|
||||
"can .reset_index() to make the index "
|
||||
"into column(s)"
|
||||
)
|
||||
|
||||
if df.index.name is not None:
|
||||
raise ValueError(
|
||||
"feather does not serialize index meta-data on a " "default index"
|
||||
)
|
||||
|
||||
# validate columns
|
||||
# ----------------
|
||||
|
||||
# must have value column names (strings only)
|
||||
if df.columns.inferred_type not in valid_types:
|
||||
raise ValueError("feather must have string column names")
|
||||
|
||||
feather.write_feather(df, path)
|
||||
|
||||
|
||||
@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads")
|
||||
def read_feather(path, columns=None, use_threads=True):
|
||||
"""
|
||||
Load a feather-format object from the file path.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object or file-like object
|
||||
Any valid string path is acceptable. The string could be a URL. Valid
|
||||
URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be:
|
||||
``file://localhost/path/to/table.feather``.
|
||||
|
||||
If you want to pass in a path object, pandas accepts any
|
||||
``os.PathLike``.
|
||||
|
||||
By file-like object, we refer to objects with a ``read()`` method,
|
||||
such as a file handler (e.g. via builtin ``open`` function)
|
||||
or ``StringIO``.
|
||||
columns : sequence, default None
|
||||
If not provided, all columns are read.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
nthreads : int, default 1
|
||||
Number of CPU threads to use when reading to pandas.DataFrame.
|
||||
|
||||
.. versionadded:: 0.21.0
|
||||
.. deprecated:: 0.24.0
|
||||
use_threads : bool, default True
|
||||
Whether to parallelize reading using multiple threads.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
type of object stored in file
|
||||
"""
|
||||
pyarrow = import_optional_dependency("pyarrow")
|
||||
from pyarrow import feather
|
||||
|
||||
path = _stringify_path(path)
|
||||
|
||||
if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"):
|
||||
int_use_threads = int(use_threads)
|
||||
if int_use_threads < 1:
|
||||
int_use_threads = 1
|
||||
return feather.read_feather(path, columns=columns, nthreads=int_use_threads)
|
||||
|
||||
return feather.read_feather(path, columns=columns, use_threads=bool(use_threads))
|
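# Editor's note (not part of the vendored file): a hedged round-trip sketch for
# the feather helpers above, using the public pandas API. Requires the optional
# `pyarrow` dependency; "example.feather" is an illustrative path.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # default RangeIndex
df.to_feather("example.feather")           # delegates to to_feather() above
same = pd.read_feather("example.feather")  # delegates to read_feather() above
assert df.equals(same)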
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
Internal module for console introspection
|
||||
"""
|
||||
|
||||
from shutil import get_terminal_size
|
||||
|
||||
|
||||
def get_console_size():
|
||||
"""Return console size as tuple = (width, height).
|
||||
|
||||
Returns (None,None) in non-interactive session.
|
||||
"""
|
||||
from pandas import get_option
|
||||
|
||||
display_width = get_option("display.width")
|
||||
# deprecated.
|
||||
display_height = get_option("display.max_rows")
|
||||
|
||||
# Consider
|
||||
# interactive shell terminal, can detect term size
|
||||
# interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term
|
||||
# size
# non-interactive script, should disregard term size
|
||||
|
||||
# in addition
|
||||
# width,height have default values, but setting to 'None' signals
|
||||
# that auto-detection should be used, but only in an interactive shell terminal.
|
||||
# Simple. yeah.
|
||||
|
||||
if in_interactive_session():
|
||||
if in_ipython_frontend():
|
||||
# sane defaults for interactive non-shell terminal
|
||||
# match default for width,height in config_init
|
||||
from pandas._config.config import get_default_val
|
||||
|
||||
terminal_width = get_default_val("display.width")
|
||||
terminal_height = get_default_val("display.max_rows")
|
||||
else:
|
||||
# pure terminal
|
||||
terminal_width, terminal_height = get_terminal_size()
|
||||
else:
|
||||
terminal_width, terminal_height = None, None
|
||||
|
||||
# Note if the User sets width/Height to None (auto-detection)
|
||||
# and we're in a script (non-inter), this will return (None,None)
|
||||
# caller needs to deal.
|
||||
return (display_width or terminal_width, display_height or terminal_height)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Detect our environment
|
||||
|
||||
|
||||
def in_interactive_session():
|
||||
""" check if we're running in an interactive shell
|
||||
|
||||
returns True if running under python/ipython interactive shell
|
||||
"""
|
||||
from pandas import get_option
|
||||
|
||||
def check_main():
|
||||
try:
|
||||
import __main__ as main
|
||||
except ModuleNotFoundError:
|
||||
return get_option("mode.sim_interactive")
|
||||
return not hasattr(main, "__file__") or get_option("mode.sim_interactive")
|
||||
|
||||
try:
|
||||
return __IPYTHON__ or check_main() # noqa
|
||||
except NameError:
|
||||
return check_main()
|
||||
|
||||
|
||||
def in_ipython_frontend():
|
||||
"""
|
||||
Check if we're inside an IPython zmq frontend.
|
||||
"""
|
||||
try:
|
||||
ip = get_ipython() # noqa
|
||||
return "zmq" in str(type(ip)).lower()
|
||||
except NameError:
|
||||
pass
|
||||
|
||||
return False
|
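# Editor's note (not part of the vendored file): an illustrative sketch of the
# fallback logic in get_console_size() above -- an explicit display option wins,
# otherwise the terminal size is auto-detected in an interactive shell.
from shutil import get_terminal_size

import pandas as pd

pd.set_option("display.width", None)  # None requests auto-detection
width = pd.get_option("display.width") or get_terminal_size().columns
print("effective console width:", width)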
257
venv/lib/python3.6/site-packages/pandas/io/formats/css.py
Normal file
257
venv/lib/python3.6/site-packages/pandas/io/formats/css.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs
|
||||
"""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
|
||||
class CSSWarning(UserWarning):
|
||||
"""This CSS syntax cannot currently be parsed"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class CSSResolver:
|
||||
"""A callable for parsing and resolving CSS to atomic properties
|
||||
|
||||
"""
|
||||
|
||||
def __call__(self, declarations_str, inherited=None):
|
||||
""" the given declarations to atomic properties
|
||||
|
||||
Parameters
|
||||
----------
|
||||
declarations_str : str
|
||||
A list of CSS declarations
|
||||
inherited : dict, optional
|
||||
Atomic properties indicating the inherited style context in which
|
||||
declarations_str is to be resolved. ``inherited`` should already
|
||||
be resolved, i.e. valid output of this method.
|
||||
|
||||
Returns
|
||||
-------
|
||||
props : dict
|
||||
Atomic CSS 2.2 properties
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> resolve = CSSResolver()
|
||||
>>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
|
||||
>>> out = resolve('''
|
||||
... border-color: BLUE RED;
|
||||
... font-size: 1em;
|
||||
... font-size: 2em;
|
||||
... font-weight: normal;
|
||||
... font-weight: inherit;
|
||||
... ''', inherited)
|
||||
>>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('border-bottom-color', 'blue'),
|
||||
('border-left-color', 'red'),
|
||||
('border-right-color', 'red'),
|
||||
('border-top-color', 'blue'),
|
||||
('font-family', 'serif'),
|
||||
('font-size', '24pt'),
|
||||
('font-weight', 'bold')]
|
||||
"""
|
||||
|
||||
props = dict(self.atomize(self.parse(declarations_str)))
|
||||
if inherited is None:
|
||||
inherited = {}
|
||||
|
||||
# 1. resolve inherited, initial
|
||||
for prop, val in inherited.items():
|
||||
if prop not in props:
|
||||
props[prop] = val
|
||||
|
||||
for prop, val in list(props.items()):
|
||||
if val == "inherit":
|
||||
val = inherited.get(prop, "initial")
|
||||
if val == "initial":
|
||||
val = None
|
||||
|
||||
if val is None:
|
||||
# we do not define a complete initial stylesheet
|
||||
del props[prop]
|
||||
else:
|
||||
props[prop] = val
|
||||
|
||||
# 2. resolve relative font size
|
||||
if props.get("font-size"):
|
||||
if "font-size" in inherited:
|
||||
em_pt = inherited["font-size"]
|
||||
assert em_pt[-2:] == "pt"
|
||||
em_pt = float(em_pt[:-2])
|
||||
else:
|
||||
em_pt = None
|
||||
props["font-size"] = self.size_to_pt(
|
||||
props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS
|
||||
)
|
||||
|
||||
font_size = float(props["font-size"][:-2])
|
||||
else:
|
||||
font_size = None
|
||||
|
||||
# 3. TODO: resolve other font-relative units
|
||||
for side in self.SIDES:
|
||||
prop = "border-{side}-width".format(side=side)
|
||||
if prop in props:
|
||||
props[prop] = self.size_to_pt(
|
||||
props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS
|
||||
)
|
||||
for prop in [
|
||||
"margin-{side}".format(side=side),
|
||||
"padding-{side}".format(side=side),
|
||||
]:
|
||||
if prop in props:
|
||||
# TODO: support %
|
||||
props[prop] = self.size_to_pt(
|
||||
props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS
|
||||
)
|
||||
|
||||
return props
|
||||
|
||||
UNIT_RATIOS = {
|
||||
"rem": ("pt", 12),
|
||||
"ex": ("em", 0.5),
|
||||
# 'ch':
|
||||
"px": ("pt", 0.75),
|
||||
"pc": ("pt", 12),
|
||||
"in": ("pt", 72),
|
||||
"cm": ("in", 1 / 2.54),
|
||||
"mm": ("in", 1 / 25.4),
|
||||
"q": ("mm", 0.25),
|
||||
"!!default": ("em", 0),
|
||||
}
|
||||
|
||||
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
|
||||
FONT_SIZE_RATIOS.update(
|
||||
{
|
||||
"%": ("em", 0.01),
|
||||
"xx-small": ("rem", 0.5),
|
||||
"x-small": ("rem", 0.625),
|
||||
"small": ("rem", 0.8),
|
||||
"medium": ("rem", 1),
|
||||
"large": ("rem", 1.125),
|
||||
"x-large": ("rem", 1.5),
|
||||
"xx-large": ("rem", 2),
|
||||
"smaller": ("em", 1 / 1.2),
|
||||
"larger": ("em", 1.2),
|
||||
"!!default": ("em", 1),
|
||||
}
|
||||
)
|
||||
|
||||
MARGIN_RATIOS = UNIT_RATIOS.copy()
|
||||
MARGIN_RATIOS.update({"none": ("pt", 0)})
|
||||
|
||||
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
|
||||
BORDER_WIDTH_RATIOS.update(
|
||||
{
|
||||
"none": ("pt", 0),
|
||||
"thick": ("px", 4),
|
||||
"medium": ("px", 2),
|
||||
"thin": ("px", 1),
|
||||
# Default: medium only if solid
|
||||
}
|
||||
)
|
||||
|
||||
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
|
||||
def _error():
|
||||
warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning)
|
||||
return self.size_to_pt("1!!default", conversions=conversions)
|
||||
|
||||
try:
|
||||
val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups()
|
||||
except AttributeError:
|
||||
return _error()
|
||||
if val == "":
|
||||
# hack for 'large' etc.
|
||||
val = 1
|
||||
else:
|
||||
try:
|
||||
val = float(val)
|
||||
except ValueError:
|
||||
return _error()
|
||||
|
||||
while unit != "pt":
|
||||
if unit == "em":
|
||||
if em_pt is None:
|
||||
unit = "rem"
|
||||
else:
|
||||
val *= em_pt
|
||||
unit = "pt"
|
||||
continue
|
||||
|
||||
try:
|
||||
unit, mul = conversions[unit]
|
||||
except KeyError:
|
||||
return _error()
|
||||
val *= mul
|
||||
|
||||
val = round(val, 5)
|
||||
if int(val) == val:
|
||||
size_fmt = "{fmt:d}pt".format(fmt=int(val))
|
||||
else:
|
||||
size_fmt = "{fmt:f}pt".format(fmt=val)
|
||||
return size_fmt
|
||||
|
||||
def atomize(self, declarations):
|
||||
for prop, value in declarations:
|
||||
attr = "expand_" + prop.replace("-", "_")
|
||||
try:
|
||||
expand = getattr(self, attr)
|
||||
except AttributeError:
|
||||
yield prop, value
|
||||
else:
|
||||
for prop, value in expand(prop, value):
|
||||
yield prop, value
|
||||
|
||||
SIDE_SHORTHANDS = {
|
||||
1: [0, 0, 0, 0],
|
||||
2: [0, 1, 0, 1],
|
||||
3: [0, 1, 2, 1],
|
||||
4: [0, 1, 2, 3],
|
||||
}
|
||||
SIDES = ("top", "right", "bottom", "left")
|
||||
|
||||
def _side_expander(prop_fmt):
|
||||
def expand(self, prop, value):
|
||||
tokens = value.split()
|
||||
try:
|
||||
mapping = self.SIDE_SHORTHANDS[len(tokens)]
|
||||
except KeyError:
|
||||
warnings.warn(
|
||||
'Could not expand "{prop}: {val}"'.format(prop=prop, val=value),
|
||||
CSSWarning,
|
||||
)
|
||||
return
|
||||
for key, idx in zip(self.SIDES, mapping):
|
||||
yield prop_fmt.format(key), tokens[idx]
|
||||
|
||||
return expand
|
||||
|
||||
expand_border_color = _side_expander("border-{:s}-color")
|
||||
expand_border_style = _side_expander("border-{:s}-style")
|
||||
expand_border_width = _side_expander("border-{:s}-width")
|
||||
expand_margin = _side_expander("margin-{:s}")
|
||||
expand_padding = _side_expander("padding-{:s}")
|
||||
|
||||
def parse(self, declarations_str):
|
||||
"""Generates (prop, value) pairs from declarations
|
||||
|
||||
In a future version this may generate parsed tokens from tinycss/tinycss2.
|
||||
"""
|
||||
for decl in declarations_str.split(";"):
|
||||
if not decl.strip():
|
||||
continue
|
||||
prop, sep, val = decl.partition(":")
|
||||
prop = prop.strip().lower()
|
||||
# TODO: don't lowercase case sensitive parts of values (strings)
|
||||
val = val.strip().lower()
|
||||
if sep:
|
||||
yield prop, val
|
||||
else:
|
||||
warnings.warn(
|
||||
"Ill-formatted attribute: expected a colon "
|
||||
"in {decl!r}".format(decl=decl),
|
||||
CSSWarning,
|
||||
)
|
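# Editor's note (not part of the vendored file): a small demonstration of the
# CSSResolver defined above. It is internal pandas API, so the import path and
# the exact formatting of the output values may vary between versions.
from pandas.io.formats.css import CSSResolver

resolve = CSSResolver()
props = resolve("margin: 1em 2em; font-size: large", inherited={"font-size": "10pt"})
# shorthand sides are expanded and sizes are normalised to points, so
# props["margin-top"], props["margin-left"] and props["font-size"] all end in "pt"
print(props["margin-top"], props["margin-left"], props["font-size"])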
356
venv/lib/python3.6/site-packages/pandas/io/formats/csvs.py
Normal file
356
venv/lib/python3.6/site-packages/pandas/io/formats/csvs.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Module for formatting output data into CSV files.
|
||||
"""
|
||||
|
||||
import csv as csvlib
|
||||
from io import StringIO
|
||||
import os
|
||||
import warnings
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import writers as libwriters
|
||||
|
||||
from pandas.core.dtypes.generic import (
|
||||
ABCDatetimeIndex,
|
||||
ABCIndexClass,
|
||||
ABCMultiIndex,
|
||||
ABCPeriodIndex,
|
||||
)
|
||||
from pandas.core.dtypes.missing import notna
|
||||
|
||||
from pandas.io.common import (
|
||||
UnicodeWriter,
|
||||
_get_handle,
|
||||
_infer_compression,
|
||||
get_filepath_or_buffer,
|
||||
)
|
||||
|
||||
|
||||
class CSVFormatter:
|
||||
def __init__(
|
||||
self,
|
||||
obj,
|
||||
path_or_buf=None,
|
||||
sep=",",
|
||||
na_rep="",
|
||||
float_format=None,
|
||||
cols=None,
|
||||
header=True,
|
||||
index=True,
|
||||
index_label=None,
|
||||
mode="w",
|
||||
encoding=None,
|
||||
compression="infer",
|
||||
quoting=None,
|
||||
line_terminator="\n",
|
||||
chunksize=None,
|
||||
quotechar='"',
|
||||
date_format=None,
|
||||
doublequote=True,
|
||||
escapechar=None,
|
||||
decimal=".",
|
||||
):
|
||||
|
||||
self.obj = obj
|
||||
|
||||
if path_or_buf is None:
|
||||
path_or_buf = StringIO()
|
||||
|
||||
self.path_or_buf, _, _, _ = get_filepath_or_buffer(
|
||||
path_or_buf, encoding=encoding, compression=compression, mode=mode
|
||||
)
|
||||
self.sep = sep
|
||||
self.na_rep = na_rep
|
||||
self.float_format = float_format
|
||||
self.decimal = decimal
|
||||
|
||||
self.header = header
|
||||
self.index = index
|
||||
self.index_label = index_label
|
||||
self.mode = mode
|
||||
if encoding is None:
|
||||
encoding = "utf-8"
|
||||
self.encoding = encoding
|
||||
self.compression = _infer_compression(self.path_or_buf, compression)
|
||||
|
||||
if quoting is None:
|
||||
quoting = csvlib.QUOTE_MINIMAL
|
||||
self.quoting = quoting
|
||||
|
||||
if quoting == csvlib.QUOTE_NONE:
|
||||
# prevents crash in _csv
|
||||
quotechar = None
|
||||
self.quotechar = quotechar
|
||||
|
||||
self.doublequote = doublequote
|
||||
self.escapechar = escapechar
|
||||
|
||||
self.line_terminator = line_terminator or os.linesep
|
||||
|
||||
self.date_format = date_format
|
||||
|
||||
self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)
|
||||
|
||||
# validate mi options
|
||||
if self.has_mi_columns:
|
||||
if cols is not None:
|
||||
raise TypeError(
|
||||
"cannot specify cols with a MultiIndex on the " "columns"
|
||||
)
|
||||
|
||||
if cols is not None:
|
||||
if isinstance(cols, ABCIndexClass):
|
||||
cols = cols.to_native_types(
|
||||
na_rep=na_rep,
|
||||
float_format=float_format,
|
||||
date_format=date_format,
|
||||
quoting=self.quoting,
|
||||
)
|
||||
else:
|
||||
cols = list(cols)
|
||||
self.obj = self.obj.loc[:, cols]
|
||||
|
||||
# update columns to include possible multiplicity of dupes
|
||||
# and make sure cols is just a list of labels
|
||||
cols = self.obj.columns
|
||||
if isinstance(cols, ABCIndexClass):
|
||||
cols = cols.to_native_types(
|
||||
na_rep=na_rep,
|
||||
float_format=float_format,
|
||||
date_format=date_format,
|
||||
quoting=self.quoting,
|
||||
)
|
||||
else:
|
||||
cols = list(cols)
|
||||
|
||||
# save it
|
||||
self.cols = cols
|
||||
|
||||
# preallocate data 2d list
|
||||
self.blocks = self.obj._data.blocks
|
||||
ncols = sum(b.shape[0] for b in self.blocks)
|
||||
self.data = [None] * ncols
|
||||
|
||||
if chunksize is None:
|
||||
chunksize = (100000 // (len(self.cols) or 1)) or 1
|
||||
self.chunksize = int(chunksize)
|
||||
|
||||
self.data_index = obj.index
|
||||
if (
|
||||
isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex))
|
||||
and date_format is not None
|
||||
):
|
||||
from pandas import Index
|
||||
|
||||
self.data_index = Index(
|
||||
[x.strftime(date_format) if notna(x) else "" for x in self.data_index]
|
||||
)
|
||||
|
||||
self.nlevels = getattr(self.data_index, "nlevels", 1)
|
||||
if not index:
|
||||
self.nlevels = 0
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Create the writer & save
|
||||
"""
|
||||
# GH21227 internal compression is not used when file-like passed.
|
||||
if self.compression and hasattr(self.path_or_buf, "write"):
|
||||
msg = "compression has no effect when passing file-like " "object as input."
|
||||
warnings.warn(msg, RuntimeWarning, stacklevel=2)
|
||||
|
||||
# when zip compression is called.
|
||||
is_zip = isinstance(self.path_or_buf, ZipFile) or (
|
||||
not hasattr(self.path_or_buf, "write") and self.compression == "zip"
|
||||
)
|
||||
|
||||
if is_zip:
|
||||
# zipfile doesn't support writing a string to the archive. Use a string
|
||||
# buffer to receive csv writing and dump into zip compression
|
||||
# file handle. GH21241, GH21118
|
||||
f = StringIO()
|
||||
close = False
|
||||
elif hasattr(self.path_or_buf, "write"):
|
||||
f = self.path_or_buf
|
||||
close = False
|
||||
else:
|
||||
f, handles = _get_handle(
|
||||
self.path_or_buf,
|
||||
self.mode,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
)
|
||||
close = True
|
||||
|
||||
try:
|
||||
writer_kwargs = dict(
|
||||
lineterminator=self.line_terminator,
|
||||
delimiter=self.sep,
|
||||
quoting=self.quoting,
|
||||
doublequote=self.doublequote,
|
||||
escapechar=self.escapechar,
|
||||
quotechar=self.quotechar,
|
||||
)
|
||||
if self.encoding == "ascii":
|
||||
self.writer = csvlib.writer(f, **writer_kwargs)
|
||||
else:
|
||||
writer_kwargs["encoding"] = self.encoding
|
||||
self.writer = UnicodeWriter(f, **writer_kwargs)
|
||||
|
||||
self._save()
|
||||
|
||||
finally:
|
||||
if is_zip:
|
||||
# GH17778 handles zip compression separately.
|
||||
buf = f.getvalue()
|
||||
if hasattr(self.path_or_buf, "write"):
|
||||
self.path_or_buf.write(buf)
|
||||
else:
|
||||
f, handles = _get_handle(
|
||||
self.path_or_buf,
|
||||
self.mode,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
)
|
||||
f.write(buf)
|
||||
close = True
|
||||
if close:
|
||||
f.close()
|
||||
for _fh in handles:
|
||||
_fh.close()
|
||||
|
||||
def _save_header(self):
|
||||
|
||||
writer = self.writer
|
||||
obj = self.obj
|
||||
index_label = self.index_label
|
||||
cols = self.cols
|
||||
has_mi_columns = self.has_mi_columns
|
||||
header = self.header
|
||||
encoded_labels = []
|
||||
|
||||
has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass))
|
||||
if not (has_aliases or self.header):
|
||||
return
|
||||
if has_aliases:
|
||||
if len(header) != len(cols):
|
||||
raise ValueError(
|
||||
(
|
||||
"Writing {ncols} cols but got {nalias} "
|
||||
"aliases".format(ncols=len(cols), nalias=len(header))
|
||||
)
|
||||
)
|
||||
else:
|
||||
write_cols = header
|
||||
else:
|
||||
write_cols = cols
|
||||
|
||||
if self.index:
|
||||
# should write something for index label
|
||||
if index_label is not False:
|
||||
if index_label is None:
|
||||
if isinstance(obj.index, ABCMultiIndex):
|
||||
index_label = []
|
||||
for i, name in enumerate(obj.index.names):
|
||||
if name is None:
|
||||
name = ""
|
||||
index_label.append(name)
|
||||
else:
|
||||
index_label = obj.index.name
|
||||
if index_label is None:
|
||||
index_label = [""]
|
||||
else:
|
||||
index_label = [index_label]
|
||||
elif not isinstance(
|
||||
index_label, (list, tuple, np.ndarray, ABCIndexClass)
|
||||
):
|
||||
# given a string for a DF with Index
|
||||
index_label = [index_label]
|
||||
|
||||
encoded_labels = list(index_label)
|
||||
else:
|
||||
encoded_labels = []
|
||||
|
||||
if not has_mi_columns or has_aliases:
|
||||
encoded_labels += list(write_cols)
|
||||
writer.writerow(encoded_labels)
|
||||
else:
|
||||
# write out the mi
|
||||
columns = obj.columns
|
||||
|
||||
# write out the names for each level, then ALL of the values for
|
||||
# each level
|
||||
for i in range(columns.nlevels):
|
||||
|
||||
# we need at least 1 index column to write our col names
|
||||
col_line = []
|
||||
if self.index:
|
||||
|
||||
# name is the first column
|
||||
col_line.append(columns.names[i])
|
||||
|
||||
if isinstance(index_label, list) and len(index_label) > 1:
|
||||
col_line.extend([""] * (len(index_label) - 1))
|
||||
|
||||
col_line.extend(columns._get_level_values(i))
|
||||
|
||||
writer.writerow(col_line)
|
||||
|
||||
# Write out the index line if it's not empty.
|
||||
# Otherwise, we will print out an extraneous
|
||||
# blank line between the mi and the data rows.
|
||||
if encoded_labels and set(encoded_labels) != {""}:
|
||||
encoded_labels.extend([""] * len(columns))
|
||||
writer.writerow(encoded_labels)
|
||||
|
||||
def _save(self):
|
||||
|
||||
self._save_header()
|
||||
|
||||
nrows = len(self.data_index)
|
||||
|
||||
# write the rows in chunks of size `chunksize`
|
||||
chunksize = self.chunksize
|
||||
chunks = int(nrows / chunksize) + 1
|
||||
|
||||
for i in range(chunks):
|
||||
start_i = i * chunksize
|
||||
end_i = min((i + 1) * chunksize, nrows)
|
||||
if start_i >= end_i:
|
||||
break
|
||||
|
||||
self._save_chunk(start_i, end_i)
|
||||
|
||||
def _save_chunk(self, start_i, end_i):
|
||||
|
||||
data_index = self.data_index
|
||||
|
||||
# create the data for a chunk
|
||||
slicer = slice(start_i, end_i)
|
||||
for i in range(len(self.blocks)):
|
||||
b = self.blocks[i]
|
||||
d = b.to_native_types(
|
||||
slicer=slicer,
|
||||
na_rep=self.na_rep,
|
||||
float_format=self.float_format,
|
||||
decimal=self.decimal,
|
||||
date_format=self.date_format,
|
||||
quoting=self.quoting,
|
||||
)
|
||||
|
||||
for col_loc, col in zip(b.mgr_locs, d):
|
||||
# self.data is a preallocated list
|
||||
self.data[col_loc] = col
|
||||
|
||||
ix = data_index.to_native_types(
|
||||
slicer=slicer,
|
||||
na_rep=self.na_rep,
|
||||
float_format=self.float_format,
|
||||
decimal=self.decimal,
|
||||
date_format=self.date_format,
|
||||
quoting=self.quoting,
|
||||
)
|
||||
|
||||
libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
|
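# Editor's note (not part of the vendored file): a hedged usage sketch.
# DataFrame.to_csv drives the CSVFormatter above; `chunksize` controls how many
# rows go into each write_csv_rows call, and compression="zip" exercises the
# ZipFile branch of save(). "out.csv.zip" is an illustrative path.
import pandas as pd

df = pd.DataFrame({"a": range(100000), "b": ["x"] * 100000})
df.to_csv("out.csv.zip", index=False, chunksize=25000, compression="zip")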
742
venv/lib/python3.6/site-packages/pandas/io/formats/excel.py
Normal file
742
venv/lib/python3.6/site-packages/pandas/io/formats/excel.py
Normal file
@@ -0,0 +1,742 @@
|
||||
"""Utilities for conversion to writer-agnostic Excel representation
|
||||
"""
|
||||
|
||||
from functools import reduce
|
||||
import itertools
|
||||
import re
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes import missing
|
||||
from pandas.core.dtypes.common import is_float, is_scalar
|
||||
from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex
|
||||
|
||||
from pandas import Index
|
||||
import pandas.core.common as com
|
||||
|
||||
from pandas.io.formats.css import CSSResolver, CSSWarning
|
||||
from pandas.io.formats.format import get_level_lengths
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
class ExcelCell:
|
||||
__fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
|
||||
__slots__ = __fields__
|
||||
|
||||
def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None):
|
||||
self.row = row
|
||||
self.col = col
|
||||
self.val = val
|
||||
self.style = style
|
||||
self.mergestart = mergestart
|
||||
self.mergeend = mergeend
|
||||
|
||||
|
||||
class CSSToExcelConverter:
|
||||
"""A callable for converting CSS declarations to ExcelWriter styles
|
||||
|
||||
Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
|
||||
focusing on font styling, backgrounds, borders and alignment.
|
||||
|
||||
Operates by first computing CSS styles in a fairly generic
|
||||
way (see :meth:`compute_css`) then determining Excel style
|
||||
properties from CSS properties (see :meth:`build_xlstyle`).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inherited : str, optional
|
||||
CSS declarations understood to be the containing scope for the
|
||||
CSS processed by :meth:`__call__`.
|
||||
"""
|
||||
|
||||
# NB: Most of the methods here could be classmethods, as only __init__
|
||||
# and __call__ make use of instance attributes. We leave them as
|
||||
# instancemethods so that users can easily experiment with extensions
|
||||
# without monkey-patching.
|
||||
|
||||
def __init__(self, inherited=None):
|
||||
if inherited is not None:
|
||||
inherited = self.compute_css(inherited)
|
||||
|
||||
self.inherited = inherited
|
||||
|
||||
compute_css = CSSResolver()
|
||||
|
||||
def __call__(self, declarations_str):
|
||||
"""Convert CSS declarations to ExcelWriter style
|
||||
|
||||
Parameters
|
||||
----------
|
||||
declarations_str : str
|
||||
List of CSS declarations.
|
||||
e.g. "font-weight: bold; background: blue"
|
||||
|
||||
Returns
|
||||
-------
|
||||
xlstyle : dict
|
||||
A style as interpreted by ExcelWriter when found in
|
||||
ExcelCell.style.
|
||||
"""
|
||||
# TODO: memoize?
|
||||
properties = self.compute_css(declarations_str, self.inherited)
|
||||
return self.build_xlstyle(properties)
|
||||
|
||||
def build_xlstyle(self, props):
|
||||
out = {
|
||||
"alignment": self.build_alignment(props),
|
||||
"border": self.build_border(props),
|
||||
"fill": self.build_fill(props),
|
||||
"font": self.build_font(props),
|
||||
"number_format": self.build_number_format(props),
|
||||
}
|
||||
# TODO: handle cell width and height: needs support in pandas.io.excel
|
||||
|
||||
def remove_none(d):
|
||||
"""Remove key where value is None, through nested dicts"""
|
||||
for k, v in list(d.items()):
|
||||
if v is None:
|
||||
del d[k]
|
||||
elif isinstance(v, dict):
|
||||
remove_none(v)
|
||||
if not v:
|
||||
del d[k]
|
||||
|
||||
remove_none(out)
|
||||
return out
|
||||
|
||||
VERTICAL_MAP = {
|
||||
"top": "top",
|
||||
"text-top": "top",
|
||||
"middle": "center",
|
||||
"baseline": "bottom",
|
||||
"bottom": "bottom",
|
||||
"text-bottom": "bottom",
|
||||
# OpenXML also has 'justify', 'distributed'
|
||||
}
|
||||
|
||||
def build_alignment(self, props):
|
||||
# TODO: text-indent, padding-left -> alignment.indent
|
||||
return {
|
||||
"horizontal": props.get("text-align"),
|
||||
"vertical": self.VERTICAL_MAP.get(props.get("vertical-align")),
|
||||
"wrap_text": (
|
||||
None
|
||||
if props.get("white-space") is None
|
||||
else props["white-space"] not in ("nowrap", "pre", "pre-line")
|
||||
),
|
||||
}
|
||||
|
||||
def build_border(self, props):
|
||||
return {
|
||||
side: {
|
||||
"style": self._border_style(
|
||||
props.get("border-{side}-style".format(side=side)),
|
||||
props.get("border-{side}-width".format(side=side)),
|
||||
),
|
||||
"color": self.color_to_excel(
|
||||
props.get("border-{side}-color".format(side=side))
|
||||
),
|
||||
}
|
||||
for side in ["top", "right", "bottom", "left"]
|
||||
}
|
||||
|
||||
def _border_style(self, style, width):
|
||||
# convert styles and widths to openxml, one of:
|
||||
# 'dashDot'
|
||||
# 'dashDotDot'
|
||||
# 'dashed'
|
||||
# 'dotted'
|
||||
# 'double'
|
||||
# 'hair'
|
||||
# 'medium'
|
||||
# 'mediumDashDot'
|
||||
# 'mediumDashDotDot'
|
||||
# 'mediumDashed'
|
||||
# 'slantDashDot'
|
||||
# 'thick'
|
||||
# 'thin'
|
||||
if width is None and style is None:
|
||||
return None
|
||||
if style == "none" or style == "hidden":
|
||||
return None
|
||||
|
||||
if width is None:
|
||||
width = "2pt"
|
||||
width = float(width[:-2])
|
||||
if width < 1e-5:
|
||||
return None
|
||||
elif width < 1.3:
|
||||
width_name = "thin"
|
||||
elif width < 2.8:
|
||||
width_name = "medium"
|
||||
else:
|
||||
width_name = "thick"
|
||||
|
||||
if style in (None, "groove", "ridge", "inset", "outset"):
|
||||
# not handled
|
||||
style = "solid"
|
||||
|
||||
if style == "double":
|
||||
return "double"
|
||||
if style == "solid":
|
||||
return width_name
|
||||
if style == "dotted":
|
||||
if width_name in ("hair", "thin"):
|
||||
return "dotted"
|
||||
return "mediumDashDotDot"
|
||||
if style == "dashed":
|
||||
if width_name in ("hair", "thin"):
|
||||
return "dashed"
|
||||
return "mediumDashed"
|
||||
|
||||
def build_fill(self, props):
|
||||
# TODO: perhaps allow for special properties
|
||||
# -excel-pattern-bgcolor and -excel-pattern-type
|
||||
fill_color = props.get("background-color")
|
||||
if fill_color not in (None, "transparent", "none"):
|
||||
return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
|
||||
|
||||
BOLD_MAP = {
|
||||
"bold": True,
|
||||
"bolder": True,
|
||||
"600": True,
|
||||
"700": True,
|
||||
"800": True,
|
||||
"900": True,
|
||||
"normal": False,
|
||||
"lighter": False,
|
||||
"100": False,
|
||||
"200": False,
|
||||
"300": False,
|
||||
"400": False,
|
||||
"500": False,
|
||||
}
|
||||
ITALIC_MAP = {"normal": False, "italic": True, "oblique": True}
|
||||
|
||||
def build_font(self, props):
|
||||
size = props.get("font-size")
|
||||
if size is not None:
|
||||
assert size.endswith("pt")
|
||||
size = float(size[:-2])
|
||||
|
||||
font_names_tmp = re.findall(
|
||||
r"""(?x)
|
||||
(
|
||||
"(?:[^"]|\\")+"
|
||||
|
|
||||
'(?:[^']|\\')+'
|
||||
|
|
||||
[^'",]+
|
||||
)(?=,|\s*$)
|
||||
""",
|
||||
props.get("font-family", ""),
|
||||
)
|
||||
font_names = []
|
||||
for name in font_names_tmp:
|
||||
if name[:1] == '"':
|
||||
name = name[1:-1].replace('\\"', '"')
|
||||
elif name[:1] == "'":
|
||||
name = name[1:-1].replace("\\'", "'")
|
||||
else:
|
||||
name = name.strip()
|
||||
if name:
|
||||
font_names.append(name)
|
||||
|
||||
family = None
|
||||
for name in font_names:
|
||||
if name == "serif":
|
||||
family = 1 # roman
|
||||
break
|
||||
elif name == "sans-serif":
|
||||
family = 2 # swiss
|
||||
break
|
||||
elif name == "cursive":
|
||||
family = 4 # script
|
||||
break
|
||||
elif name == "fantasy":
|
||||
family = 5 # decorative
|
||||
break
|
||||
|
||||
decoration = props.get("text-decoration")
|
||||
if decoration is not None:
|
||||
decoration = decoration.split()
|
||||
else:
|
||||
decoration = ()
|
||||
|
||||
return {
|
||||
"name": font_names[0] if font_names else None,
|
||||
"family": family,
|
||||
"size": size,
|
||||
"bold": self.BOLD_MAP.get(props.get("font-weight")),
|
||||
"italic": self.ITALIC_MAP.get(props.get("font-style")),
|
||||
"underline": ("single" if "underline" in decoration else None),
|
||||
"strike": ("line-through" in decoration) or None,
|
||||
"color": self.color_to_excel(props.get("color")),
|
||||
# shadow if nonzero digit before shadow color
|
||||
"shadow": (
|
||||
bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
|
||||
if "text-shadow" in props
|
||||
else None
|
||||
),
|
||||
# 'vertAlign':,
|
||||
# 'charset': ,
|
||||
# 'scheme': ,
|
||||
# 'outline': ,
|
||||
# 'condense': ,
|
||||
}
|
||||
|
||||
NAMED_COLORS = {
|
||||
"maroon": "800000",
|
||||
"brown": "A52A2A",
|
||||
"red": "FF0000",
|
||||
"pink": "FFC0CB",
|
||||
"orange": "FFA500",
|
||||
"yellow": "FFFF00",
|
||||
"olive": "808000",
|
||||
"green": "008000",
|
||||
"purple": "800080",
|
||||
"fuchsia": "FF00FF",
|
||||
"lime": "00FF00",
|
||||
"teal": "008080",
|
||||
"aqua": "00FFFF",
|
||||
"blue": "0000FF",
|
||||
"navy": "000080",
|
||||
"black": "000000",
|
||||
"gray": "808080",
|
||||
"grey": "808080",
|
||||
"silver": "C0C0C0",
|
||||
"white": "FFFFFF",
|
||||
}
|
||||
|
||||
def color_to_excel(self, val):
|
||||
if val is None:
|
||||
return None
|
||||
if val.startswith("#") and len(val) == 7:
|
||||
return val[1:].upper()
|
||||
if val.startswith("#") and len(val) == 4:
|
||||
return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
|
||||
try:
|
||||
return self.NAMED_COLORS[val]
|
||||
except KeyError:
|
||||
warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning)
|
||||
|
||||
def build_number_format(self, props):
|
||||
return {"format_code": props.get("number-format")}
|
||||
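# Editor's note (not part of the vendored file): a minimal illustration of the
# converter class defined above. It is internal API; the exact nested dict it
# returns may differ between pandas versions.
_convert = CSSToExcelConverter()
_example = _convert("font-weight: bold; color: #ff0000; border-bottom-style: solid")
# -> a nested dict along the lines of
#    {"font": {"bold": True, "color": "FF0000"},
#     "border": {"bottom": {"style": "medium"}}}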
|
||||
|
||||
class ExcelFormatter:
|
||||
"""
|
||||
Class for formatting a DataFrame to a list of ExcelCells.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame or Styler
|
||||
na_rep : string, representation of missing values
|
||||
float_format : string, default None
|
||||
Format string for floating point numbers
|
||||
cols : sequence, optional
|
||||
Columns to write
|
||||
header : boolean or list of string, default True
|
||||
Write out column names. If a list of string is given it is
|
||||
assumed to be aliases for the column names
|
||||
index : boolean, default True
|
||||
output row names (index)
|
||||
index_label : string or sequence, default None
|
||||
Column label for index column(s) if desired. If None is given, and
|
||||
`header` and `index` are True, then the index names are used. A
|
||||
sequence should be given if the DataFrame uses MultiIndex.
|
||||
merge_cells : boolean, default False
|
||||
Format MultiIndex and Hierarchical Rows as merged cells.
|
||||
inf_rep : string, default `'inf'`
|
||||
representation for np.inf values (which aren't representable in Excel)
|
||||
A `'-'` sign will be added in front of -inf.
|
||||
style_converter : callable, optional
|
||||
This translates Styler styles (CSS) into ExcelWriter styles.
|
||||
Defaults to ``CSSToExcelConverter()``.
|
||||
It should take a CSS declarations string and return an Excel style dict.
|
||||
This is only called for body cells.
|
||||
"""
|
||||
|
||||
max_rows = 2 ** 20
|
||||
max_cols = 2 ** 14
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
df,
|
||||
na_rep="",
|
||||
float_format=None,
|
||||
cols=None,
|
||||
header=True,
|
||||
index=True,
|
||||
index_label=None,
|
||||
merge_cells=False,
|
||||
inf_rep="inf",
|
||||
style_converter=None,
|
||||
):
|
||||
self.rowcounter = 0
|
||||
self.na_rep = na_rep
|
||||
if hasattr(df, "render"):
|
||||
self.styler = df
|
||||
df = df.data
|
||||
if style_converter is None:
|
||||
style_converter = CSSToExcelConverter()
|
||||
self.style_converter = style_converter
|
||||
else:
|
||||
self.styler = None
|
||||
self.df = df
|
||||
if cols is not None:
|
||||
|
||||
# all missing, raise
|
||||
if not len(Index(cols) & df.columns):
|
||||
raise KeyError("passes columns are not ALL present dataframe")
|
||||
|
||||
# deprecated in gh-17295
|
||||
# 1 missing is ok (for now)
|
||||
if len(Index(cols) & df.columns) != len(cols):
|
||||
warnings.warn(
|
||||
"Not all names specified in 'columns' are found; "
|
||||
"this will raise a KeyError in the future",
|
||||
FutureWarning,
|
||||
)
|
||||
|
||||
self.df = df.reindex(columns=cols)
|
||||
self.columns = self.df.columns
|
||||
self.float_format = float_format
|
||||
self.index = index
|
||||
self.index_label = index_label
|
||||
self.header = header
|
||||
self.merge_cells = merge_cells
|
||||
self.inf_rep = inf_rep
|
||||
|
||||
@property
|
||||
def header_style(self):
|
||||
return {
|
||||
"font": {"bold": True},
|
||||
"borders": {
|
||||
"top": "thin",
|
||||
"right": "thin",
|
||||
"bottom": "thin",
|
||||
"left": "thin",
|
||||
},
|
||||
"alignment": {"horizontal": "center", "vertical": "top"},
|
||||
}
|
||||
|
||||
def _format_value(self, val):
|
||||
if is_scalar(val) and missing.isna(val):
|
||||
val = self.na_rep
|
||||
elif is_float(val):
|
||||
if missing.isposinf_scalar(val):
|
||||
val = self.inf_rep
|
||||
elif missing.isneginf_scalar(val):
|
||||
val = "-{inf}".format(inf=self.inf_rep)
|
||||
elif self.float_format is not None:
|
||||
val = float(self.float_format % val)
|
||||
if getattr(val, "tzinfo", None) is not None:
|
||||
raise ValueError(
|
||||
"Excel does not support datetimes with "
|
||||
"timezones. Please ensure that datetimes "
|
||||
"are timezone unaware before writing to Excel."
|
||||
)
|
||||
return val
|
||||
|
||||
def _format_header_mi(self):
|
||||
if self.columns.nlevels > 1:
|
||||
if not self.index:
|
||||
raise NotImplementedError(
|
||||
"Writing to Excel with MultiIndex"
|
||||
" columns and no index "
|
||||
"('index'=False) is not yet "
|
||||
"implemented."
|
||||
)
|
||||
|
||||
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
|
||||
if not (has_aliases or self.header):
|
||||
return
|
||||
|
||||
columns = self.columns
|
||||
level_strs = columns.format(
|
||||
sparsify=self.merge_cells, adjoin=False, names=False
|
||||
)
|
||||
level_lengths = get_level_lengths(level_strs)
|
||||
coloffset = 0
|
||||
lnum = 0
|
||||
|
||||
if self.index and isinstance(self.df.index, ABCMultiIndex):
|
||||
coloffset = len(self.df.index[0]) - 1
|
||||
|
||||
if self.merge_cells:
|
||||
# Format multi-index as merged cells.
|
||||
for lnum in range(len(level_lengths)):
|
||||
name = columns.names[lnum]
|
||||
yield ExcelCell(lnum, coloffset, name, self.header_style)
|
||||
|
||||
for lnum, (spans, levels, level_codes) in enumerate(
|
||||
zip(level_lengths, columns.levels, columns.codes)
|
||||
):
|
||||
values = levels.take(level_codes)
|
||||
for i in spans:
|
||||
if spans[i] > 1:
|
||||
yield ExcelCell(
|
||||
lnum,
|
||||
coloffset + i + 1,
|
||||
values[i],
|
||||
self.header_style,
|
||||
lnum,
|
||||
coloffset + i + spans[i],
|
||||
)
|
||||
else:
|
||||
yield ExcelCell(
|
||||
lnum, coloffset + i + 1, values[i], self.header_style
|
||||
)
|
||||
else:
|
||||
# Format in legacy format with dots to indicate levels.
|
||||
for i, values in enumerate(zip(*level_strs)):
|
||||
v = ".".join(map(pprint_thing, values))
|
||||
yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)
|
||||
|
||||
self.rowcounter = lnum
|
||||
|
||||
def _format_header_regular(self):
|
||||
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
|
||||
if has_aliases or self.header:
|
||||
coloffset = 0
|
||||
|
||||
if self.index:
|
||||
coloffset = 1
|
||||
if isinstance(self.df.index, ABCMultiIndex):
|
||||
coloffset = len(self.df.index[0])
|
||||
|
||||
colnames = self.columns
|
||||
if has_aliases:
|
||||
if len(self.header) != len(self.columns):
|
||||
raise ValueError(
|
||||
"Writing {cols} cols but got {alias} "
|
||||
"aliases".format(cols=len(self.columns), alias=len(self.header))
|
||||
)
|
||||
else:
|
||||
colnames = self.header
|
||||
|
||||
for colindex, colname in enumerate(colnames):
|
||||
yield ExcelCell(
|
||||
self.rowcounter, colindex + coloffset, colname, self.header_style
|
||||
)
|
||||
|
||||
def _format_header(self):
|
||||
if isinstance(self.columns, ABCMultiIndex):
|
||||
gen = self._format_header_mi()
|
||||
else:
|
||||
gen = self._format_header_regular()
|
||||
|
||||
gen2 = ()
|
||||
if self.df.index.names:
|
||||
row = [x if x is not None else "" for x in self.df.index.names] + [
|
||||
""
|
||||
] * len(self.columns)
|
||||
if reduce(lambda x, y: x and y, map(lambda x: x != "", row)):
|
||||
gen2 = (
|
||||
ExcelCell(self.rowcounter, colindex, val, self.header_style)
|
||||
for colindex, val in enumerate(row)
|
||||
)
|
||||
self.rowcounter += 1
|
||||
return itertools.chain(gen, gen2)
|
||||
|
||||
def _format_body(self):
|
||||
|
||||
if isinstance(self.df.index, ABCMultiIndex):
|
||||
return self._format_hierarchical_rows()
|
||||
else:
|
||||
return self._format_regular_rows()
|
||||
|
||||
def _format_regular_rows(self):
|
||||
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
|
||||
if has_aliases or self.header:
|
||||
self.rowcounter += 1
|
||||
|
||||
# output index and index_label?
|
||||
if self.index:
|
||||
# check aliases
|
||||
# if list only take first as this is not a MultiIndex
|
||||
if self.index_label and isinstance(
|
||||
self.index_label, (list, tuple, np.ndarray, Index)
|
||||
):
|
||||
index_label = self.index_label[0]
|
||||
# if string good to go
|
||||
elif self.index_label and isinstance(self.index_label, str):
|
||||
index_label = self.index_label
|
||||
else:
|
||||
index_label = self.df.index.names[0]
|
||||
|
||||
if isinstance(self.columns, ABCMultiIndex):
|
||||
self.rowcounter += 1
|
||||
|
||||
if index_label and self.header is not False:
|
||||
yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
|
||||
|
||||
# write index_values
|
||||
index_values = self.df.index
|
||||
if isinstance(self.df.index, ABCPeriodIndex):
|
||||
index_values = self.df.index.to_timestamp()
|
||||
|
||||
for idx, idxval in enumerate(index_values):
|
||||
yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style)
|
||||
|
||||
coloffset = 1
|
||||
else:
|
||||
coloffset = 0
|
||||
|
||||
for cell in self._generate_body(coloffset):
|
||||
yield cell
|
||||
|
||||
def _format_hierarchical_rows(self):
|
||||
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
|
||||
if has_aliases or self.header:
|
||||
self.rowcounter += 1
|
||||
|
||||
gcolidx = 0
|
||||
|
||||
if self.index:
|
||||
index_labels = self.df.index.names
|
||||
# check for aliases
|
||||
if self.index_label and isinstance(
|
||||
self.index_label, (list, tuple, np.ndarray, Index)
|
||||
):
|
||||
index_labels = self.index_label
|
||||
|
||||
# MultiIndex columns require an extra row
|
||||
# with index names (blank if None) for
|
||||
# unambiguous round-trip, unless not merging,
|
||||
# in which case the names all go on one row (see issue #11328)
|
||||
if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
|
||||
self.rowcounter += 1
|
||||
|
||||
# if index labels are not empty go ahead and dump
|
||||
if com._any_not_none(*index_labels) and self.header is not False:
|
||||
|
||||
for cidx, name in enumerate(index_labels):
|
||||
yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
|
||||
|
||||
if self.merge_cells:
|
||||
# Format hierarchical rows as merged cells.
|
||||
level_strs = self.df.index.format(
|
||||
sparsify=True, adjoin=False, names=False
|
||||
)
|
||||
level_lengths = get_level_lengths(level_strs)
|
||||
|
||||
for spans, levels, level_codes in zip(
|
||||
level_lengths, self.df.index.levels, self.df.index.codes
|
||||
):
|
||||
|
||||
values = levels.take(
|
||||
level_codes, allow_fill=levels._can_hold_na, fill_value=True
|
||||
)
|
||||
|
||||
for i in spans:
|
||||
if spans[i] > 1:
|
||||
yield ExcelCell(
|
||||
self.rowcounter + i,
|
||||
gcolidx,
|
||||
values[i],
|
||||
self.header_style,
|
||||
self.rowcounter + i + spans[i] - 1,
|
||||
gcolidx,
|
||||
)
|
||||
else:
|
||||
yield ExcelCell(
|
||||
self.rowcounter + i,
|
||||
gcolidx,
|
||||
values[i],
|
||||
self.header_style,
|
||||
)
|
||||
gcolidx += 1
|
||||
|
||||
else:
|
||||
# Format hierarchical rows with non-merged values.
|
||||
for indexcolvals in zip(*self.df.index):
|
||||
for idx, indexcolval in enumerate(indexcolvals):
|
||||
yield ExcelCell(
|
||||
self.rowcounter + idx,
|
||||
gcolidx,
|
||||
indexcolval,
|
||||
self.header_style,
|
||||
)
|
||||
gcolidx += 1
|
||||
|
||||
for cell in self._generate_body(gcolidx):
|
||||
yield cell
|
||||
|
||||
def _generate_body(self, coloffset):
|
||||
if self.styler is None:
|
||||
styles = None
|
||||
else:
|
||||
styles = self.styler._compute().ctx
|
||||
if not styles:
|
||||
styles = None
|
||||
xlstyle = None
|
||||
|
||||
# Write the body of the frame data series by series.
|
||||
for colidx in range(len(self.columns)):
|
||||
series = self.df.iloc[:, colidx]
|
||||
for i, val in enumerate(series):
|
||||
if styles is not None:
|
||||
xlstyle = self.style_converter(";".join(styles[i, colidx]))
|
||||
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)
|
||||
|
||||
def get_formatted_cells(self):
|
||||
for cell in itertools.chain(self._format_header(), self._format_body()):
|
||||
cell.val = self._format_value(cell.val)
|
||||
yield cell
|
||||
|
||||
def write(
|
||||
self,
|
||||
writer,
|
||||
sheet_name="Sheet1",
|
||||
startrow=0,
|
||||
startcol=0,
|
||||
freeze_panes=None,
|
||||
engine=None,
|
||||
):
|
||||
"""
|
||||
writer : string or ExcelWriter object
|
||||
File path or existing ExcelWriter
|
||||
sheet_name : string, default 'Sheet1'
|
||||
Name of sheet which will contain DataFrame
|
||||
startrow : int, default 0
|
||||
upper left cell row to dump data frame
|
||||
startcol : int, default 0
|
||||
upper left cell column to dump data frame
|
||||
freeze_panes : tuple of integer (length 2), default None
|
||||
Specifies the one-based bottommost row and rightmost column that
|
||||
is to be frozen
|
||||
engine : string, default None
|
||||
write engine to use if writer is a path - you can also set this
|
||||
via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
|
||||
and ``io.excel.xlsm.writer``.
|
||||
"""
|
||||
from pandas.io.excel import ExcelWriter
|
||||
from pandas.io.common import _stringify_path
|
||||
|
||||
num_rows, num_cols = self.df.shape
|
||||
if num_rows > self.max_rows or num_cols > self.max_cols:
|
||||
raise ValueError(
|
||||
"This sheet is too large! Your sheet size is: "
|
||||
+ "{}, {} ".format(num_rows, num_cols)
|
||||
+ "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols)
|
||||
)
|
||||
|
||||
if isinstance(writer, ExcelWriter):
|
||||
need_save = False
|
||||
else:
|
||||
writer = ExcelWriter(_stringify_path(writer), engine=engine)
|
||||
need_save = True
|
||||
|
||||
formatted_cells = self.get_formatted_cells()
|
||||
writer.write_cells(
|
||||
formatted_cells,
|
||||
sheet_name,
|
||||
startrow=startrow,
|
||||
startcol=startcol,
|
||||
freeze_panes=freeze_panes,
|
||||
)
|
||||
if need_save:
|
||||
writer.save()
|
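# Editor's note (not part of the vendored file): an end-to-end sketch of the
# path through ExcelFormatter/CSSToExcelConverter above. It assumes an Excel
# engine such as the optional `openpyxl` package; "styled.xlsx" is an example
# path and the styling rule is illustrative.
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.5], "b": [3.0, 4.5]})
styler = df.style.applymap(lambda v: "font-weight: bold" if v > 2 else "")
# Styler.to_excel wraps the Styler in an ExcelFormatter, converts the CSS above
# into writer styles, and hands the formatted cells to an ExcelWriter.
styler.to_excel("styled.xlsx", engine="openpyxl")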
1816
venv/lib/python3.6/site-packages/pandas/io/formats/format.py
Normal file
1816
venv/lib/python3.6/site-packages/pandas/io/formats/format.py
Normal file
File diff suppressed because it is too large
608
venv/lib/python3.6/site-packages/pandas/io/formats/html.py
Normal file
608
venv/lib/python3.6/site-packages/pandas/io/formats/html.py
Normal file
@@ -0,0 +1,608 @@
|
||||
"""
|
||||
Module for formatting output data in HTML.
|
||||
"""
|
||||
|
||||
from collections import OrderedDict
|
||||
from textwrap import dedent
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas.core.dtypes.generic import ABCIndex, ABCMultiIndex
|
||||
|
||||
from pandas import option_context
|
||||
|
||||
from pandas.io.common import _is_url
|
||||
from pandas.io.formats.format import (
|
||||
DataFrameFormatter,
|
||||
TableFormatter,
|
||||
get_level_lengths,
|
||||
)
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
class HTMLFormatter(TableFormatter):
|
||||
"""
|
||||
Internal class for formatting output data in html.
|
||||
This class is intended for shared functionality between
|
||||
DataFrame.to_html() and DataFrame._repr_html_().
|
||||
Any logic in common with other output formatting methods
|
||||
should ideally be inherited from classes in format.py
|
||||
and this class responsible for only producing html markup.
|
||||
"""
|
||||
|
||||
indent_delta = 2
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter: DataFrameFormatter,
|
||||
classes: Optional[Union[str, List, Tuple]] = None,
|
||||
border: Optional[bool] = None,
|
||||
) -> None:
|
||||
self.fmt = formatter
|
||||
self.classes = classes
|
||||
|
||||
self.frame = self.fmt.frame
|
||||
self.columns = self.fmt.tr_frame.columns
|
||||
self.elements = [] # type: List[str]
|
||||
self.bold_rows = self.fmt.kwds.get("bold_rows", False)
|
||||
self.escape = self.fmt.kwds.get("escape", True)
|
||||
self.show_dimensions = self.fmt.show_dimensions
|
||||
if border is None:
|
||||
border = get_option("display.html.border")
|
||||
self.border = border
|
||||
self.table_id = self.fmt.table_id
|
||||
self.render_links = self.fmt.render_links
|
||||
if isinstance(self.fmt.col_space, int):
|
||||
self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space)
|
||||
|
||||
@property
|
||||
def show_row_idx_names(self) -> bool:
|
||||
return self.fmt.show_row_idx_names
|
||||
|
||||
@property
|
||||
def show_col_idx_names(self) -> bool:
|
||||
return self.fmt.show_col_idx_names
|
||||
|
||||
@property
|
||||
def row_levels(self) -> int:
|
||||
if self.fmt.index:
|
||||
# showing (row) index
|
||||
return self.frame.index.nlevels
|
||||
elif self.show_col_idx_names:
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# If the row index is not displayed a column of
|
||||
# blank cells need to be included before the DataFrame values.
|
||||
return 1
|
||||
# not showing (row) index
|
||||
return 0
|
||||
|
||||
def _get_columns_formatted_values(self) -> ABCIndex:
|
||||
return self.columns
|
||||
|
||||
@property
|
||||
def is_truncated(self) -> bool:
|
||||
return self.fmt.is_truncated
|
||||
|
||||
@property
|
||||
def ncols(self) -> int:
|
||||
return len(self.fmt.tr_frame.columns)
|
||||
|
||||
def write(self, s: str, indent: int = 0) -> None:
|
||||
rs = pprint_thing(s)
|
||||
self.elements.append(" " * indent + rs)
|
||||
|
||||
def write_th(
|
||||
self, s: str, header: bool = False, indent: int = 0, tags: Optional[str] = None
|
||||
) -> None:
|
||||
"""
|
||||
Method for writing a formatted <th> cell.
|
||||
|
||||
If col_space is set on the formatter then that is used for
|
||||
the value of min-width.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s : object
|
||||
The data to be written inside the cell.
|
||||
header : boolean, default False
|
||||
Set to True if the <th> is for use inside <thead>. This will
|
||||
cause min-width to be set if there is one.
|
||||
indent : int, default 0
|
||||
The indentation level of the cell.
|
||||
tags : string, default None
|
||||
Tags to include in the cell.
|
||||
|
||||
Returns
|
||||
-------
|
||||
None. The rendered <th> cell is appended to the output buffer.
|
||||
"""
|
||||
if header and self.fmt.col_space is not None:
|
||||
tags = tags or ""
|
||||
tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space)
|
||||
|
||||
self._write_cell(s, kind="th", indent=indent, tags=tags)
|
||||
|
||||
def write_td(self, s: str, indent: int = 0, tags: Optional[str] = None) -> None:
|
||||
self._write_cell(s, kind="td", indent=indent, tags=tags)
|
||||
|
||||
def _write_cell(
|
||||
self, s: str, kind: str = "td", indent: int = 0, tags: Optional[str] = None
|
||||
) -> None:
|
||||
if tags is not None:
|
||||
start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags)
|
||||
else:
|
||||
start_tag = "<{kind}>".format(kind=kind)
|
||||
|
||||
if self.escape:
|
||||
# escape & first to prevent double escaping of &
|
||||
esc = OrderedDict(
|
||||
[("&", r"&"), ("<", r"<"), (">", r">")]
|
||||
) # type: Union[OrderedDict[str, str], Dict]
|
||||
else:
|
||||
esc = {}
|
||||
|
||||
rs = pprint_thing(s, escape_chars=esc).strip()
|
||||
|
||||
if self.render_links and _is_url(rs):
|
||||
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
|
||||
start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped)
|
||||
end_a = "</a>"
|
||||
else:
|
||||
end_a = ""
|
||||
|
||||
self.write(
|
||||
"{start}{rs}{end_a}</{kind}>".format(
|
||||
start=start_tag, rs=rs, end_a=end_a, kind=kind
|
||||
),
|
||||
indent,
|
||||
)
|
||||
|
||||
def write_tr(
|
||||
self,
|
||||
line: List[str],
|
||||
indent: int = 0,
|
||||
indent_delta: int = 0,
|
||||
header: bool = False,
|
||||
align: Optional[str] = None,
|
||||
tags: Optional[Dict[int, str]] = None,
|
||||
nindex_levels: int = 0,
|
||||
) -> None:
|
||||
if tags is None:
|
||||
tags = {}
|
||||
|
||||
if align is None:
|
||||
self.write("<tr>", indent)
|
||||
else:
|
||||
self.write('<tr style="text-align: {align};">'.format(align=align), indent)
|
||||
indent += indent_delta
|
||||
|
||||
for i, s in enumerate(line):
|
||||
val_tag = tags.get(i, None)
|
||||
if header or (self.bold_rows and i < nindex_levels):
|
||||
self.write_th(s, indent=indent, header=header, tags=val_tag)
|
||||
else:
|
||||
self.write_td(s, indent, tags=val_tag)
|
||||
|
||||
indent -= indent_delta
|
||||
self.write("</tr>", indent)
|
||||
|
||||
def render(self) -> List[str]:
|
||||
self._write_table()
|
||||
|
||||
if self.should_show_dimensions:
|
||||
by = chr(215) # ×
|
||||
self.write(
|
||||
"<p>{rows} rows {by} {cols} columns</p>".format(
|
||||
rows=len(self.frame), by=by, cols=len(self.frame.columns)
|
||||
)
|
||||
)
|
||||
|
||||
return self.elements
|
||||
|
||||
def _write_table(self, indent: int = 0) -> None:
|
||||
_classes = ["dataframe"] # Default class.
|
||||
use_mathjax = get_option("display.html.use_mathjax")
|
||||
if not use_mathjax:
|
||||
_classes.append("tex2jax_ignore")
|
||||
if self.classes is not None:
|
||||
if isinstance(self.classes, str):
|
||||
self.classes = self.classes.split()
|
||||
if not isinstance(self.classes, (list, tuple)):
|
||||
raise TypeError(
|
||||
"classes must be a string, list, or tuple, "
|
||||
"not {typ}".format(typ=type(self.classes))
|
||||
)
|
||||
_classes.extend(self.classes)
|
||||
|
||||
if self.table_id is None:
|
||||
id_section = ""
|
||||
else:
|
||||
id_section = ' id="{table_id}"'.format(table_id=self.table_id)
|
||||
|
||||
self.write(
|
||||
'<table border="{border}" class="{cls}"{id_section}>'.format(
|
||||
border=self.border, cls=" ".join(_classes), id_section=id_section
|
||||
),
|
||||
indent,
|
||||
)
|
||||
|
||||
if self.fmt.header or self.show_row_idx_names:
|
||||
self._write_header(indent + self.indent_delta)
|
||||
|
||||
self._write_body(indent + self.indent_delta)
|
||||
|
||||
self.write("</table>", indent)
|
||||
|
||||
def _write_col_header(self, indent: int) -> None:
|
||||
truncate_h = self.fmt.truncate_h
|
||||
if isinstance(self.columns, ABCMultiIndex):
|
||||
template = 'colspan="{span:d}" halign="left"'
|
||||
|
||||
if self.fmt.sparsify:
|
||||
# GH3547
|
||||
sentinel = object()
|
||||
else:
|
||||
sentinel = False
|
||||
levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False)
|
||||
level_lengths = get_level_lengths(levels, sentinel)
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
|
||||
if truncate_h:
|
||||
# modify the header lines
|
||||
ins_col = self.fmt.tr_col_num
|
||||
if self.fmt.sparsify:
|
||||
recs_new = {}
|
||||
# Increment tags after ... col.
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_col:
|
||||
recs_new[tag + 1] = span
|
||||
elif tag + span > ins_col:
|
||||
recs_new[tag] = span + 1
|
||||
if lnum == inner_lvl:
|
||||
values = (
|
||||
values[:ins_col] + ("...",) + values[ins_col:]
|
||||
)
|
||||
else:
|
||||
# sparse col headers do not receive a ...
|
||||
values = (
|
||||
values[:ins_col]
|
||||
+ (values[ins_col - 1],)
|
||||
+ values[ins_col:]
|
||||
)
|
||||
else:
|
||||
recs_new[tag] = span
|
||||
# if ins_col lies between tags, all col headers
|
||||
# get ...
|
||||
if tag + span == ins_col:
|
||||
recs_new[ins_col] = 1
|
||||
values = values[:ins_col] + ("...",) + values[ins_col:]
|
||||
records = recs_new
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
if lnum == inner_lvl:
|
||||
records[ins_col] = 1
|
||||
else:
|
||||
recs_new = {}
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_col:
|
||||
recs_new[tag + 1] = span
|
||||
else:
|
||||
recs_new[tag] = span
|
||||
recs_new[ins_col] = 1
|
||||
records = recs_new
|
||||
values = values[:ins_col] + ["..."] + values[ins_col:]
|
||||
|
||||
# see gh-22579
|
||||
# Column Offset Bug with to_html(index=False) with
|
||||
# MultiIndex Columns and Index.
|
||||
# Initially fill row with blank cells before column names.
|
||||
# TODO: Refactor to remove code duplication with code
|
||||
# block below for standard columns index.
|
||||
row = [""] * (self.row_levels - 1)
|
||||
if self.fmt.index or self.show_col_idx_names:
|
||||
# see gh-22747
|
||||
# If to_html(index_names=False) do not show columns
|
||||
# index names.
|
||||
# TODO: Refactor to use _get_column_name_list from
|
||||
# DataFrameFormatter class and create a
|
||||
# _get_formatted_column_labels function for code
|
||||
# parity with DataFrameFormatter class.
|
||||
if self.fmt.show_index_names:
|
||||
name = self.columns.names[lnum]
|
||||
row.append(pprint_thing(name or ""))
|
||||
else:
|
||||
row.append("")
|
||||
|
||||
tags = {}
|
||||
j = len(row)
|
||||
for i, v in enumerate(values):
|
||||
if i in records:
|
||||
if records[i] > 1:
|
||||
tags[j] = template.format(span=records[i])
|
||||
else:
|
||||
continue
|
||||
j += 1
|
||||
row.append(v)
|
||||
self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
|
||||
else:
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# Initially fill row with blank cells before column names.
|
||||
# TODO: Refactor to remove code duplication with code block
|
||||
# above for columns MultiIndex.
|
||||
row = [""] * (self.row_levels - 1)
|
||||
if self.fmt.index or self.show_col_idx_names:
|
||||
# see gh-22747
|
||||
# If to_html(index_names=False) do not show columns
|
||||
# index names.
|
||||
# TODO: Refactor to use _get_column_name_list from
|
||||
# DataFrameFormatter class.
|
||||
if self.fmt.show_index_names:
|
||||
row.append(self.columns.name or "")
|
||||
else:
|
||||
row.append("")
|
||||
row.extend(self._get_columns_formatted_values())
|
||||
align = self.fmt.justify
|
||||
|
||||
if truncate_h:
|
||||
ins_col = self.row_levels + self.fmt.tr_col_num
|
||||
row.insert(ins_col, "...")
|
||||
|
||||
self.write_tr(row, indent, self.indent_delta, header=True, align=align)
|
||||
|
||||
def _write_row_header(self, indent: int) -> None:
|
||||
truncate_h = self.fmt.truncate_h
|
||||
row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
|
||||
self.ncols + (1 if truncate_h else 0)
|
||||
)
|
||||
self.write_tr(row, indent, self.indent_delta, header=True)
|
||||
|
||||
def _write_header(self, indent: int) -> None:
|
||||
self.write("<thead>", indent)
|
||||
|
||||
if self.fmt.header:
|
||||
self._write_col_header(indent + self.indent_delta)
|
||||
|
||||
if self.show_row_idx_names:
|
||||
self._write_row_header(indent + self.indent_delta)
|
||||
|
||||
self.write("</thead>", indent)
|
||||
|
||||
def _get_formatted_values(self) -> Dict[int, List[str]]:
|
||||
with option_context("display.max_colwidth", 999999):
|
||||
fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
|
||||
return fmt_values
|
||||
|
||||
def _write_body(self, indent: int) -> None:
|
||||
self.write("<tbody>", indent)
|
||||
fmt_values = self._get_formatted_values()
|
||||
|
||||
# write values
|
||||
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
|
||||
self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
|
||||
else:
|
||||
self._write_regular_rows(fmt_values, indent + self.indent_delta)
|
||||
|
||||
self.write("</tbody>", indent)
|
||||
|
||||
def _write_regular_rows(
|
||||
self, fmt_values: Dict[int, List[str]], indent: int
|
||||
) -> None:
|
||||
truncate_h = self.fmt.truncate_h
|
||||
truncate_v = self.fmt.truncate_v
|
||||
|
||||
nrows = len(self.fmt.tr_frame)
|
||||
|
||||
if self.fmt.index:
|
||||
fmt = self.fmt._get_formatter("__index__")
|
||||
if fmt is not None:
|
||||
index_values = self.fmt.tr_frame.index.map(fmt)
|
||||
else:
|
||||
index_values = self.fmt.tr_frame.index.format()
|
||||
|
||||
row = [] # type: List[str]
|
||||
for i in range(nrows):
|
||||
|
||||
if truncate_v and i == (self.fmt.tr_row_num):
|
||||
str_sep_row = ["..."] * len(row)
|
||||
self.write_tr(
|
||||
str_sep_row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=self.row_levels,
|
||||
)
|
||||
|
||||
row = []
|
||||
if self.fmt.index:
|
||||
row.append(index_values[i])
|
||||
# see gh-22579
|
||||
# Column misalignment also occurs for
|
||||
# a standard index when the columns index is named.
|
||||
# Add blank cell before data cells.
|
||||
elif self.show_col_idx_names:
|
||||
row.append("")
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
|
||||
if truncate_h:
|
||||
dot_col_ix = self.fmt.tr_col_num + self.row_levels
|
||||
row.insert(dot_col_ix, "...")
|
||||
self.write_tr(
|
||||
row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
|
||||
)
|
||||
|
||||
def _write_hierarchical_rows(
|
||||
self, fmt_values: Dict[int, List[str]], indent: int
|
||||
) -> None:
|
||||
template = 'rowspan="{span}" valign="top"'
|
||||
|
||||
truncate_h = self.fmt.truncate_h
|
||||
truncate_v = self.fmt.truncate_v
|
||||
frame = self.fmt.tr_frame
|
||||
nrows = len(frame)
|
||||
|
||||
idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
|
||||
idx_values = list(zip(*idx_values))
|
||||
|
||||
if self.fmt.sparsify:
|
||||
# GH3547
|
||||
sentinel = object()
|
||||
levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
|
||||
|
||||
level_lengths = get_level_lengths(levels, sentinel)
|
||||
inner_lvl = len(level_lengths) - 1
|
||||
if truncate_v:
|
||||
# Insert ... row and adjust idx_values and
|
||||
# level_lengths to take this into account.
|
||||
ins_row = self.fmt.tr_row_num
|
||||
inserted = False
|
||||
for lnum, records in enumerate(level_lengths):
|
||||
rec_new = {}
|
||||
for tag, span in list(records.items()):
|
||||
if tag >= ins_row:
|
||||
rec_new[tag + 1] = span
|
||||
elif tag + span > ins_row:
|
||||
rec_new[tag] = span + 1
|
||||
|
||||
# GH 14882 - Make sure insertion done once
|
||||
if not inserted:
|
||||
dot_row = list(idx_values[ins_row - 1])
|
||||
dot_row[-1] = "..."
|
||||
idx_values.insert(ins_row, tuple(dot_row))
|
||||
inserted = True
|
||||
else:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
else:
|
||||
rec_new[tag] = span
|
||||
# If ins_row lies between tags, all cols idx cols
|
||||
# receive ...
|
||||
if tag + span == ins_row:
|
||||
rec_new[ins_row] = 1
|
||||
if lnum == 0:
|
||||
idx_values.insert(
|
||||
ins_row, tuple(["..."] * len(level_lengths))
|
||||
)
|
||||
|
||||
# GH 14882 - Place ... in correct level
|
||||
elif inserted:
|
||||
dot_row = list(idx_values[ins_row])
|
||||
dot_row[inner_lvl - lnum] = "..."
|
||||
idx_values[ins_row] = tuple(dot_row)
|
||||
level_lengths[lnum] = rec_new
|
||||
|
||||
level_lengths[inner_lvl][ins_row] = 1
|
||||
for ix_col in range(len(fmt_values)):
|
||||
fmt_values[ix_col].insert(ins_row, "...")
|
||||
nrows += 1
|
||||
|
||||
for i in range(nrows):
|
||||
row = []
|
||||
tags = {}
|
||||
|
||||
sparse_offset = 0
|
||||
j = 0
|
||||
for records, v in zip(level_lengths, idx_values[i]):
|
||||
if i in records:
|
||||
if records[i] > 1:
|
||||
tags[j] = template.format(span=records[i])
|
||||
else:
|
||||
sparse_offset += 1
|
||||
continue
|
||||
|
||||
j += 1
|
||||
row.append(v)
|
||||
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if truncate_h:
|
||||
row.insert(
|
||||
self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
|
||||
)
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=tags,
|
||||
nindex_levels=len(levels) - sparse_offset,
|
||||
)
|
||||
else:
|
||||
row = []
|
||||
for i in range(len(frame)):
|
||||
if truncate_v and i == (self.fmt.tr_row_num):
|
||||
str_sep_row = ["..."] * len(row)
|
||||
self.write_tr(
|
||||
str_sep_row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=self.row_levels,
|
||||
)
|
||||
|
||||
idx_values = list(
|
||||
zip(*frame.index.format(sparsify=False, adjoin=False, names=False))
|
||||
)
|
||||
row = []
|
||||
row.extend(idx_values[i])
|
||||
row.extend(fmt_values[j][i] for j in range(self.ncols))
|
||||
if truncate_h:
|
||||
row.insert(self.row_levels + self.fmt.tr_col_num, "...")
|
||||
self.write_tr(
|
||||
row,
|
||||
indent,
|
||||
self.indent_delta,
|
||||
tags=None,
|
||||
nindex_levels=frame.index.nlevels,
|
||||
)
|
||||
|
||||
|
||||
class NotebookFormatter(HTMLFormatter):
|
||||
"""
|
||||
Internal class for formatting output data in html for display in Jupyter
|
||||
Notebooks. This class is intended for functionality specific to
|
||||
DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
|
||||
"""
|
||||
|
||||
def _get_formatted_values(self) -> Dict[int, List[str]]:
|
||||
return {i: self.fmt._format_col(i) for i in range(self.ncols)}
|
||||
|
||||
def _get_columns_formatted_values(self) -> List[str]:
|
||||
return self.columns.format()
|
||||
|
||||
def write_style(self) -> None:
|
||||
# We use the "scoped" attribute here so that the desired
|
||||
# style properties for the data frame are not then applied
|
||||
# throughout the entire notebook.
|
||||
template_first = """\
|
||||
<style scoped>"""
|
||||
template_last = """\
|
||||
</style>"""
|
||||
template_select = """\
|
||||
.dataframe %s {
|
||||
%s: %s;
|
||||
}"""
|
||||
element_props = [
|
||||
("tbody tr th:only-of-type", "vertical-align", "middle"),
|
||||
("tbody tr th", "vertical-align", "top"),
|
||||
]
|
||||
if isinstance(self.columns, ABCMultiIndex):
|
||||
element_props.append(("thead tr th", "text-align", "left"))
|
||||
if self.show_row_idx_names:
|
||||
element_props.append(
|
||||
("thead tr:last-of-type th", "text-align", "right")
|
||||
)
|
||||
else:
|
||||
element_props.append(("thead th", "text-align", "right"))
|
||||
template_mid = "\n\n".join(map(lambda t: template_select % t, element_props))
|
||||
template = dedent("\n".join((template_first, template_mid, template_last)))
|
||||
self.write(template)
|
||||
|
||||
def render(self) -> List[str]:
|
||||
self.write("<div>")
|
||||
self.write_style()
|
||||
super().render()
|
||||
self.write("</div>")
|
||||
return self.elements
|
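The formatters above are internal helpers; in practice they are driven by DataFrame.to_html and the notebook repr rather than instantiated directly. A minimal sketch of that public entry point, assuming nothing beyond pandas itself (max_rows simply exercises the "..." truncation paths handled by the _write_*_rows methods above):

import pandas as pd

df = pd.DataFrame(
    {"a": range(6), "b": list("uvwxyz")},
    index=pd.MultiIndex.from_product([["x", "y"], [1, 2, 3]]),
)

# to_html builds an HTMLFormatter under the hood; max_rows triggers the
# truncation rows written by _write_regular_rows/_write_hierarchical_rows.
html = df.to_html(max_rows=4)
print(html[:200])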
265
venv/lib/python3.6/site-packages/pandas/io/formats/latex.py
Normal file
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
Module for formatting output data in Latex.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
from pandas.core.dtypes.generic import ABCMultiIndex
|
||||
|
||||
from pandas.io.formats.format import TableFormatter
|
||||
|
||||
|
||||
class LatexFormatter(TableFormatter):
|
||||
""" Used to render a DataFrame to a LaTeX tabular/longtable environment
|
||||
output.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
formatter : `DataFrameFormatter`
|
||||
column_format : str, default None
|
||||
The columns format as specified in `LaTeX table format
|
||||
<https://en.wikibooks.org/wiki/LaTeX/Tables>`__, e.g. 'rcl' for 3 columns
|
||||
longtable : boolean, default False
|
||||
Use a longtable environment instead of tabular.
|
||||
|
||||
See Also
|
||||
--------
|
||||
HTMLFormatter
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formatter,
|
||||
column_format=None,
|
||||
longtable=False,
|
||||
multicolumn=False,
|
||||
multicolumn_format=None,
|
||||
multirow=False,
|
||||
):
|
||||
self.fmt = formatter
|
||||
self.frame = self.fmt.frame
|
||||
self.bold_rows = self.fmt.kwds.get("bold_rows", False)
|
||||
self.column_format = column_format
|
||||
self.longtable = longtable
|
||||
self.multicolumn = multicolumn
|
||||
self.multicolumn_format = multicolumn_format
|
||||
self.multirow = multirow
|
||||
|
||||
def write_result(self, buf):
|
||||
"""
|
||||
Render a DataFrame to a LaTeX tabular/longtable environment output.
|
||||
"""
|
||||
|
||||
# string representation of the columns
|
||||
if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
|
||||
info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format(
|
||||
name=type(self.frame).__name__,
|
||||
col=self.frame.columns,
|
||||
idx=self.frame.index,
|
||||
)
|
||||
strcols = [[info_line]]
|
||||
else:
|
||||
strcols = self.fmt._to_str_columns()
|
||||
|
||||
def get_col_type(dtype):
|
||||
if issubclass(dtype.type, np.number):
|
||||
return "r"
|
||||
else:
|
||||
return "l"
|
||||
|
||||
# reestablish the MultiIndex that has been joined by _to_str_columns
|
||||
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
|
||||
out = self.frame.index.format(
|
||||
adjoin=False,
|
||||
sparsify=self.fmt.sparsify,
|
||||
names=self.fmt.has_index_names,
|
||||
na_rep=self.fmt.na_rep,
|
||||
)
|
||||
|
||||
# index.format will sparsify repeated entries with empty strings
|
||||
# so pad these with some empty space
|
||||
def pad_empties(x):
|
||||
for pad in reversed(x):
|
||||
if pad:
|
||||
break
|
||||
return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
|
||||
|
||||
out = (pad_empties(i) for i in out)
|
||||
|
||||
# Add empty spaces for each column level
|
||||
clevels = self.frame.columns.nlevels
|
||||
out = [[" " * len(i[-1])] * clevels + i for i in out]
|
||||
|
||||
# Add the column names to the last index column
|
||||
cnames = self.frame.columns.names
|
||||
if any(cnames):
|
||||
new_names = [i if i else "{}" for i in cnames]
|
||||
out[self.frame.index.nlevels - 1][:clevels] = new_names
|
||||
|
||||
# Get rid of old multiindex column and add new ones
|
||||
strcols = out + strcols[1:]
|
||||
|
||||
column_format = self.column_format
|
||||
if column_format is None:
|
||||
dtypes = self.frame.dtypes._values
|
||||
column_format = "".join(map(get_col_type, dtypes))
|
||||
if self.fmt.index:
|
||||
index_format = "l" * self.frame.index.nlevels
|
||||
column_format = index_format + column_format
|
||||
elif not isinstance(column_format, str): # pragma: no cover
|
||||
raise AssertionError(
|
||||
"column_format must be str or unicode, "
|
||||
"not {typ}".format(typ=type(column_format))
|
||||
)
|
||||
|
||||
if not self.longtable:
|
||||
buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format))
|
||||
buf.write("\\toprule\n")
|
||||
else:
|
||||
buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format))
|
||||
buf.write("\\toprule\n")
|
||||
|
||||
ilevels = self.frame.index.nlevels
|
||||
clevels = self.frame.columns.nlevels
|
||||
nlevels = clevels
|
||||
if self.fmt.has_index_names and self.fmt.show_index_names:
|
||||
nlevels += 1
|
||||
strrows = list(zip(*strcols))
|
||||
self.clinebuf = []
|
||||
|
||||
for i, row in enumerate(strrows):
|
||||
if i == nlevels and self.fmt.header:
|
||||
buf.write("\\midrule\n") # End of header
|
||||
if self.longtable:
|
||||
buf.write("\\endhead\n")
|
||||
buf.write("\\midrule\n")
|
||||
buf.write(
|
||||
"\\multicolumn{{{n}}}{{r}}{{{{Continued on next "
|
||||
"page}}}} \\\\\n".format(n=len(row))
|
||||
)
|
||||
buf.write("\\midrule\n")
|
||||
buf.write("\\endfoot\n\n")
|
||||
buf.write("\\bottomrule\n")
|
||||
buf.write("\\endlastfoot\n")
|
||||
if self.fmt.kwds.get("escape", True):
|
||||
# escape backslashes first
|
||||
crow = [
|
||||
(
|
||||
x.replace("\\", "\\textbackslash ")
|
||||
.replace("_", "\\_")
|
||||
.replace("%", "\\%")
|
||||
.replace("$", "\\$")
|
||||
.replace("#", "\\#")
|
||||
.replace("{", "\\{")
|
||||
.replace("}", "\\}")
|
||||
.replace("~", "\\textasciitilde ")
|
||||
.replace("^", "\\textasciicircum ")
|
||||
.replace("&", "\\&")
|
||||
if (x and x != "{}")
|
||||
else "{}"
|
||||
)
|
||||
for x in row
|
||||
]
|
||||
else:
|
||||
crow = [x if x else "{}" for x in row]
|
||||
if self.bold_rows and self.fmt.index:
|
||||
# bold row labels
|
||||
crow = [
|
||||
"\\textbf{{{x}}}".format(x=x)
|
||||
if j < ilevels and x.strip() not in ["", "{}"]
|
||||
else x
|
||||
for j, x in enumerate(crow)
|
||||
]
|
||||
if i < clevels and self.fmt.header and self.multicolumn:
|
||||
# sum up columns to multicolumns
|
||||
crow = self._format_multicolumn(crow, ilevels)
|
||||
if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1:
|
||||
# sum up rows to multirows
|
||||
crow = self._format_multirow(crow, ilevels, i, strrows)
|
||||
buf.write(" & ".join(crow))
|
||||
buf.write(" \\\\\n")
|
||||
if self.multirow and i < len(strrows) - 1:
|
||||
self._print_cline(buf, i, len(strcols))
|
||||
|
||||
if not self.longtable:
|
||||
buf.write("\\bottomrule\n")
|
||||
buf.write("\\end{tabular}\n")
|
||||
else:
|
||||
buf.write("\\end{longtable}\n")
|
||||
|
||||
def _format_multicolumn(self, row, ilevels):
|
||||
r"""
|
||||
Combine columns belonging to a group to a single multicolumn entry
|
||||
according to self.multicolumn_format
|
||||
|
||||
e.g.:
|
||||
a & & & b & c &
|
||||
will become
|
||||
\multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
|
||||
"""
|
||||
row2 = list(row[:ilevels])
|
||||
ncol = 1
|
||||
coltext = ""
|
||||
|
||||
def append_col():
|
||||
# write multicolumn if needed
|
||||
if ncol > 1:
|
||||
row2.append(
|
||||
"\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format(
|
||||
ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip()
|
||||
)
|
||||
)
|
||||
# don't modify where not needed
|
||||
else:
|
||||
row2.append(coltext)
|
||||
|
||||
for c in row[ilevels:]:
|
||||
# if next col has text, write the previous
|
||||
if c.strip():
|
||||
if coltext:
|
||||
append_col()
|
||||
coltext = c
|
||||
ncol = 1
|
||||
# if not, add it to the previous multicolumn
|
||||
else:
|
||||
ncol += 1
|
||||
# write last column name
|
||||
if coltext:
|
||||
append_col()
|
||||
return row2
|
||||
|
||||
def _format_multirow(self, row, ilevels, i, rows):
|
||||
r"""
|
||||
Check following rows to determine whether row should be a multirow
|
||||
|
||||
e.g.: becomes:
|
||||
a & 0 & \multirow{2}{*}{a} & 0 &
|
||||
& 1 & & 1 &
|
||||
b & 0 & \cline{1-2}
|
||||
b & 0 &
|
||||
"""
|
||||
for j in range(ilevels):
|
||||
if row[j].strip():
|
||||
nrow = 1
|
||||
for r in rows[i + 1 :]:
|
||||
if not r[j].strip():
|
||||
nrow += 1
|
||||
else:
|
||||
break
|
||||
if nrow > 1:
|
||||
# overwrite non-multirow entry
|
||||
row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format(
|
||||
nrow=nrow, row=row[j].strip()
|
||||
)
|
||||
# save when to end the current block with \cline
|
||||
self.clinebuf.append([i + nrow - 1, j + 1])
|
||||
return row
|
||||
|
||||
def _print_cline(self, buf, i, icol):
|
||||
"""
|
||||
Print clines after multirow-blocks are finished
|
||||
"""
|
||||
for cl in self.clinebuf:
|
||||
if cl[0] == i:
|
||||
buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol))
|
||||
# remove entries that have been written to buffer
|
||||
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
|
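LatexFormatter, likewise, is constructed by DataFrame.to_latex. A small sketch showing how the constructor arguments above surface in the public API (pandas only; the frame is a toy example):

import pandas as pd

df = pd.DataFrame(
    [[1, 2], [3, 4]],
    index=pd.MultiIndex.from_tuples([("a", 0), ("a", 1)]),
    columns=pd.MultiIndex.from_tuples([("x", "l"), ("x", "r")]),
)

# multicolumn/multirow map onto _format_multicolumn/_format_multirow above;
# escape=True (the default) applies the replacements in write_result.
tex = df.to_latex(multicolumn=True, multicolumn_format="c", multirow=True)
print(tex)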
517
venv/lib/python3.6/site-packages/pandas/io/formats/printing.py
Normal file
@@ -0,0 +1,517 @@
|
||||
"""
|
||||
printing tools
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from pandas._config import get_option
|
||||
|
||||
from pandas.core.dtypes.inference import is_sequence
|
||||
|
||||
|
||||
def adjoin(space, *lists, **kwargs):
|
||||
"""
|
||||
Glues together two sets of strings using the amount of space requested.
|
||||
The idea is to prettify.
|
||||
|
||||
Parameters
----------
|
||||
space : int
|
||||
number of spaces for padding
|
||||
lists : str
|
||||
list of str to be joined
|
||||
strlen : callable
|
||||
function used to calculate the length of each str. Needed for unicode
|
||||
handling.
|
||||
justfunc : callable
|
||||
function used to justify str. Needed for unicode handling.
|
||||
"""
|
||||
strlen = kwargs.pop("strlen", len)
|
||||
justfunc = kwargs.pop("justfunc", justify)
|
||||
|
||||
out_lines = []
|
||||
newLists = []
|
||||
lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
|
||||
# not the last one
|
||||
lengths.append(max(map(len, lists[-1])))
|
||||
maxLen = max(map(len, lists))
|
||||
for i, lst in enumerate(lists):
|
||||
nl = justfunc(lst, lengths[i], mode="left")
|
||||
nl.extend([" " * lengths[i]] * (maxLen - len(lst)))
|
||||
newLists.append(nl)
|
||||
toJoin = zip(*newLists)
|
||||
for lines in toJoin:
|
||||
out_lines.append(_join_unicode(lines))
|
||||
return _join_unicode(out_lines, sep="\n")
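A quick example of what adjoin produces: each list becomes a left-justified column padded to its own width plus the requested spacing (pandas only; trailing spaces omitted in the expected output):

from pandas.io.formats.printing import adjoin

print(adjoin(2, ["a", "bb", "ccc"], ["x", "yy"]))
# a    x
# bb   yy
# ccc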
|
||||
|
||||
|
||||
def justify(texts, max_len, mode="right"):
|
||||
"""
|
||||
Perform ljust, center, rjust against string or list-like
|
||||
"""
|
||||
if mode == "left":
|
||||
return [x.ljust(max_len) for x in texts]
|
||||
elif mode == "center":
|
||||
return [x.center(max_len) for x in texts]
|
||||
else:
|
||||
return [x.rjust(max_len) for x in texts]
|
||||
|
||||
|
||||
def _join_unicode(lines, sep=""):
|
||||
try:
|
||||
return sep.join(lines)
|
||||
except UnicodeDecodeError:
|
||||
sep = str(sep)
|
||||
return sep.join([x.decode("utf-8") if isinstance(x, str) else x for x in lines])
|
||||
|
||||
|
||||
# Unicode consolidation
|
||||
# ---------------------
|
||||
#
|
||||
# pprinting utility functions for generating Unicode text or
|
||||
# bytes(3.x)/str(2.x) representations of objects.
|
||||
# Try to use these as much as possible rather than rolling your own.
|
||||
#
|
||||
# When to use
|
||||
# -----------
|
||||
#
|
||||
# 1) If you're writing code internal to pandas (no I/O directly involved),
|
||||
# use pprint_thing().
|
||||
#
|
||||
# It will always return unicode text which can be handled by other
|
||||
# parts of the package without breakage.
|
||||
#
|
||||
# 2) if you need to write something out to file, use
|
||||
# pprint_thing_encoded(encoding).
|
||||
#
|
||||
# If no encoding is specified, it defaults to utf-8. Since encoding pure
|
||||
# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
|
||||
# working with straight ascii.
|
||||
|
||||
|
||||
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
|
||||
bounds length of printed sequence, depending on options
|
||||
"""
|
||||
if isinstance(seq, set):
|
||||
fmt = "{{{body}}}"
|
||||
else:
|
||||
fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
s = iter(seq)
|
||||
# handle sets, no slicing
|
||||
r = [
|
||||
pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
|
||||
for i in range(min(nitems, len(seq)))
|
||||
]
|
||||
body = ", ".join(r)
|
||||
|
||||
if nitems < len(seq):
|
||||
body += ", ..."
|
||||
elif isinstance(seq, tuple) and len(seq) == 1:
|
||||
body += ","
|
||||
|
||||
return fmt.format(body=body)
|
||||
|
||||
|
||||
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
|
||||
"""
|
||||
internal. pprinter for iterables. you should probably use pprint_thing()
|
||||
rather than calling this directly.
|
||||
"""
|
||||
fmt = "{{{things}}}"
|
||||
pairs = []
|
||||
|
||||
pfmt = "{key}: {val}"
|
||||
|
||||
if max_seq_items is False:
|
||||
nitems = len(seq)
|
||||
else:
|
||||
nitems = max_seq_items or get_option("max_seq_items") or len(seq)
|
||||
|
||||
for k, v in list(seq.items())[:nitems]:
|
||||
pairs.append(
|
||||
pfmt.format(
|
||||
key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
|
||||
)
|
||||
)
|
||||
|
||||
if nitems < len(seq):
|
||||
return fmt.format(things=", ".join(pairs) + ", ...")
|
||||
else:
|
||||
return fmt.format(things=", ".join(pairs))
|
||||
|
||||
|
||||
def pprint_thing(
|
||||
thing,
|
||||
_nest_lvl=0,
|
||||
escape_chars=None,
|
||||
default_escapes=False,
|
||||
quote_strings=False,
|
||||
max_seq_items=None,
|
||||
):
|
||||
"""
|
||||
This function is the sanctioned way of converting objects
|
||||
to a unicode representation.
|
||||
|
||||
properly handles nested sequences containing unicode strings
|
||||
(unicode(object) does not)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
thing : anything to be formatted
|
||||
_nest_lvl : internal use only. pprint_thing() is mutually-recursive
|
||||
with pprint_sequence, this argument is used to keep track of the
|
||||
current nesting level, and limit it.
|
||||
escape_chars : list or dict, optional
|
||||
Characters to escape. If a dict is passed the values are the
|
||||
replacements
|
||||
default_escapes : bool, default False
|
||||
Whether the input escape characters replaces or adds to the defaults
|
||||
max_seq_items : False, int, default None
|
||||
Pass through to other pretty printers to limit sequence printing
|
||||
|
||||
Returns
|
||||
-------
|
||||
result - unicode str
|
||||
|
||||
"""
|
||||
|
||||
def as_escaped_unicode(thing, escape_chars=escape_chars):
|
||||
# Unicode is fine, else we try to decode using utf-8 and 'replace'
|
||||
# if that's not it either, we have no way of knowing and the user
|
||||
# should deal with it himself.
|
||||
|
||||
try:
|
||||
result = str(thing) # we should try this first
|
||||
except UnicodeDecodeError:
|
||||
# either utf-8 or we replace errors
|
||||
result = str(thing).decode("utf-8", "replace")
|
||||
|
||||
translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
|
||||
if isinstance(escape_chars, dict):
|
||||
if default_escapes:
|
||||
translate.update(escape_chars)
|
||||
else:
|
||||
translate = escape_chars
|
||||
escape_chars = list(escape_chars.keys())
|
||||
else:
|
||||
escape_chars = escape_chars or tuple()
|
||||
for c in escape_chars:
|
||||
result = result.replace(c, translate[c])
|
||||
|
||||
return str(result)
|
||||
|
||||
if hasattr(thing, "__next__"):
|
||||
return str(thing)
|
||||
elif isinstance(thing, dict) and _nest_lvl < get_option(
|
||||
"display.pprint_nest_depth"
|
||||
):
|
||||
result = _pprint_dict(
|
||||
thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
|
||||
)
|
||||
elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
|
||||
result = _pprint_seq(
|
||||
thing,
|
||||
_nest_lvl,
|
||||
escape_chars=escape_chars,
|
||||
quote_strings=quote_strings,
|
||||
max_seq_items=max_seq_items,
|
||||
)
|
||||
elif isinstance(thing, str) and quote_strings:
|
||||
result = "'{thing}'".format(thing=as_escaped_unicode(thing))
|
||||
else:
|
||||
result = as_escaped_unicode(thing)
|
||||
|
||||
return str(result) # always unicode
|
||||
|
||||
|
||||
def pprint_thing_encoded(object, encoding="utf-8", errors="replace", **kwds):
|
||||
value = pprint_thing(object) # get unicode representation of object
|
||||
return value.encode(encoding, errors, **kwds)
|
||||
|
||||
|
||||
def _enable_data_resource_formatter(enable):
|
||||
if "IPython" not in sys.modules:
|
||||
# definitely not in IPython
|
||||
return
|
||||
from IPython import get_ipython
|
||||
|
||||
ip = get_ipython()
|
||||
if ip is None:
|
||||
# still not in IPython
|
||||
return
|
||||
|
||||
formatters = ip.display_formatter.formatters
|
||||
mimetype = "application/vnd.dataresource+json"
|
||||
|
||||
if enable:
|
||||
if mimetype not in formatters:
|
||||
# define tableschema formatter
|
||||
from IPython.core.formatters import BaseFormatter
|
||||
|
||||
class TableSchemaFormatter(BaseFormatter):
|
||||
print_method = "_repr_data_resource_"
|
||||
_return_type = (dict,)
|
||||
|
||||
# register it:
|
||||
formatters[mimetype] = TableSchemaFormatter()
|
||||
# enable it if it's been disabled:
|
||||
formatters[mimetype].enabled = True
|
||||
else:
|
||||
# unregister tableschema mime-type
|
||||
if mimetype in formatters:
|
||||
formatters[mimetype].enabled = False
|
||||
|
||||
|
||||
default_pprint = lambda x, max_seq_items=None: pprint_thing(
|
||||
x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items
|
||||
)
|
||||
|
||||
|
||||
def format_object_summary(
|
||||
obj,
|
||||
formatter,
|
||||
is_justify=True,
|
||||
name=None,
|
||||
indent_for_name=True,
|
||||
line_break_each_value=False,
|
||||
):
|
||||
"""
|
||||
Return the formatted obj as a unicode string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : object
|
||||
must be iterable and support __getitem__
|
||||
formatter : callable
|
||||
string formatter for an element
|
||||
is_justify : boolean
|
||||
should justify the display
|
||||
name : name, optional
|
||||
defaults to the class name of the obj
|
||||
indent_for_name : bool, default True
|
||||
Whether subsequent lines should be indented to
|
||||
align with the name.
|
||||
line_break_each_value : bool, default False
|
||||
If True, inserts a line break for each value of ``obj``.
|
||||
If False, only break lines when a line of values gets wider
|
||||
than the display width.
|
||||
|
||||
.. versionadded:: 0.25.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
summary string
|
||||
|
||||
"""
|
||||
from pandas.io.formats.console import get_console_size
|
||||
from pandas.io.formats.format import _get_adjustment
|
||||
|
||||
display_width, _ = get_console_size()
|
||||
if display_width is None:
|
||||
display_width = get_option("display.width") or 80
|
||||
if name is None:
|
||||
name = obj.__class__.__name__
|
||||
|
||||
if indent_for_name:
|
||||
name_len = len(name)
|
||||
space1 = "\n%s" % (" " * (name_len + 1))
|
||||
space2 = "\n%s" % (" " * (name_len + 2))
|
||||
else:
|
||||
space1 = "\n"
|
||||
space2 = "\n " # space for the opening '['
|
||||
|
||||
n = len(obj)
|
||||
if line_break_each_value:
|
||||
# If we want to vertically align on each value of obj, we need to
|
||||
# separate values by a line break and indent the values
|
||||
sep = ",\n " + " " * len(name)
|
||||
else:
|
||||
sep = ","
|
||||
max_seq_items = get_option("display.max_seq_items") or n
|
||||
|
||||
# are we a truncated display
|
||||
is_truncated = n > max_seq_items
|
||||
|
||||
# adj can optionally handle unicode eastern asian width
|
||||
adj = _get_adjustment()
|
||||
|
||||
def _extend_line(s, line, value, display_width, next_line_prefix):
|
||||
|
||||
if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
|
||||
s += line.rstrip()
|
||||
line = next_line_prefix
|
||||
line += value
|
||||
return s, line
|
||||
|
||||
def best_len(values):
|
||||
if values:
|
||||
return max(adj.len(x) for x in values)
|
||||
else:
|
||||
return 0
|
||||
|
||||
close = ", "
|
||||
|
||||
if n == 0:
|
||||
summary = "[]{}".format(close)
|
||||
elif n == 1 and not line_break_each_value:
|
||||
first = formatter(obj[0])
|
||||
summary = "[{}]{}".format(first, close)
|
||||
elif n == 2 and not line_break_each_value:
|
||||
first = formatter(obj[0])
|
||||
last = formatter(obj[-1])
|
||||
summary = "[{}, {}]{}".format(first, last, close)
|
||||
else:
|
||||
|
||||
if n > max_seq_items:
|
||||
n = min(max_seq_items // 2, 10)
|
||||
head = [formatter(x) for x in obj[:n]]
|
||||
tail = [formatter(x) for x in obj[-n:]]
|
||||
else:
|
||||
head = []
|
||||
tail = [formatter(x) for x in obj]
|
||||
|
||||
# adjust all values to max length if needed
|
||||
if is_justify:
|
||||
if line_break_each_value:
|
||||
# Justify each string in the values of head and tail, so the
|
||||
# strings will right align when head and tail are stacked
|
||||
# vertically.
|
||||
head, tail = _justify(head, tail)
|
||||
elif is_truncated or not (
|
||||
len(", ".join(head)) < display_width
|
||||
and len(", ".join(tail)) < display_width
|
||||
):
|
||||
# Each string in head and tail should align with each other
|
||||
max_length = max(best_len(head), best_len(tail))
|
||||
head = [x.rjust(max_length) for x in head]
|
||||
tail = [x.rjust(max_length) for x in tail]
|
||||
# If we are not truncated and we are only a single
|
||||
# line, then don't justify
|
||||
|
||||
if line_break_each_value:
|
||||
# Now head and tail are of type List[Tuple[str]]. Below we
|
||||
# convert them into List[str], so there will be one string per
|
||||
# value. Also truncate items horizontally if wider than
|
||||
# max_space
|
||||
max_space = display_width - len(space2)
|
||||
value = tail[0]
|
||||
for max_items in reversed(range(1, len(value) + 1)):
|
||||
pprinted_seq = _pprint_seq(value, max_seq_items=max_items)
|
||||
if len(pprinted_seq) < max_space:
|
||||
break
|
||||
head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
|
||||
tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
|
||||
|
||||
summary = ""
|
||||
line = space2
|
||||
|
||||
for max_items in range(len(head)):
|
||||
word = head[max_items] + sep + " "
|
||||
summary, line = _extend_line(summary, line, word, display_width, space2)
|
||||
|
||||
if is_truncated:
|
||||
# remove trailing space of last line
|
||||
summary += line.rstrip() + space2 + "..."
|
||||
line = space2
|
||||
|
||||
for max_items in range(len(tail) - 1):
|
||||
word = tail[max_items] + sep + " "
|
||||
summary, line = _extend_line(summary, line, word, display_width, space2)
|
||||
|
||||
# last value: no sep added + 1 space of width used for trailing ','
|
||||
summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
|
||||
summary += line
|
||||
|
||||
# right now close is either '' or ', '
|
||||
# Now we want to include the ']', but not the maybe space.
|
||||
close = "]" + close.rstrip(" ")
|
||||
summary += close
|
||||
|
||||
if len(summary) > (display_width) or line_break_each_value:
|
||||
summary += space1
|
||||
else: # one row
|
||||
summary += " "
|
||||
|
||||
# remove initial space
|
||||
summary = "[" + summary[len(space2) :]
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def _justify(head, tail):
|
||||
"""
|
||||
Justify items in head and tail, so they are right-aligned when stacked.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
head : list-like of list-likes of strings
|
||||
tail : list-like of list-likes of strings
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple of list of tuples of strings
|
||||
Same as head and tail, but items are right aligned when stacked
|
||||
vertically.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> _justify([['a', 'b']], [['abc', 'abcd']])
|
||||
([(' a', ' b')], [('abc', 'abcd')])
|
||||
"""
|
||||
combined = head + tail
|
||||
|
||||
# For each position for the sequences in ``combined``,
|
||||
# find the length of the largest string.
|
||||
max_length = [0] * len(combined[0])
|
||||
for inner_seq in combined:
|
||||
length = [len(item) for item in inner_seq]
|
||||
max_length = [max(x, y) for x, y in zip(max_length, length)]
|
||||
|
||||
# justify each item in each list-like in head and tail using max_length
|
||||
head = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
|
||||
]
|
||||
tail = [
|
||||
tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
|
||||
]
|
||||
return head, tail
|
||||
|
||||
|
||||
def format_object_attrs(obj, include_dtype=True):
|
||||
"""
|
||||
Return a list of tuples of the (attr, formatted_value)
|
||||
for common attrs, including dtype, name, length
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : object
|
||||
must be iterable
|
||||
include_dtype : bool
|
||||
If False, dtype won't be in the returned list
|
||||
|
||||
Returns
|
||||
-------
|
||||
list
|
||||
|
||||
"""
|
||||
attrs = []
|
||||
if hasattr(obj, "dtype") and include_dtype:
|
||||
attrs.append(("dtype", "'{}'".format(obj.dtype)))
|
||||
if getattr(obj, "name", None) is not None:
|
||||
attrs.append(("name", default_pprint(obj.name)))
|
||||
elif getattr(obj, "names", None) is not None and any(obj.names):
|
||||
attrs.append(("names", default_pprint(obj.names)))
|
||||
max_seq_items = get_option("display.max_seq_items") or len(obj)
|
||||
if len(obj) > max_seq_items:
|
||||
attrs.append(("length", len(obj)))
|
||||
return attrs
|
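pprint_thing and the helpers above are what pandas objects use to build their reprs. A short sketch of the escaping and truncation behavior (everything imported here is defined in this module):

from pandas.io.formats.printing import pprint_thing

# Control characters are escaped only when listed in escape_chars.
print(pprint_thing("a\tb", escape_chars=("\t",)))        # a\tb
# Sequences longer than max_seq_items are truncated with "...".
print(pprint_thing(list(range(10)), max_seq_items=3))    # [0, 1, 2, ...]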
1474
venv/lib/python3.6/site-packages/pandas/io/formats/style.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,70 @@
|
||||
{# Update the template_structure.html document too #}
|
||||
{%- block before_style -%}{%- endblock before_style -%}
|
||||
{% block style %}
|
||||
<style type="text/css" >
|
||||
{% block table_styles %}
|
||||
{% for s in table_styles %}
|
||||
#T_{{uuid}} {{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor -%}
|
||||
}
|
||||
{%- endfor -%}
|
||||
{% endblock table_styles %}
|
||||
{% block before_cellstyle %}{% endblock before_cellstyle %}
|
||||
{% block cellstyle %}
|
||||
{%- for s in cellstyle %}
|
||||
#T_{{uuid}}{{s.selector}} {
|
||||
{% for p,val in s.props %}
|
||||
{{p}}: {{val}};
|
||||
{% endfor %}
|
||||
}
|
||||
{%- endfor -%}
|
||||
{%- endblock cellstyle %}
|
||||
</style>
|
||||
{%- endblock style %}
|
||||
{%- block before_table %}{% endblock before_table %}
|
||||
{%- block table %}
|
||||
<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
|
||||
{%- block caption %}
|
||||
{%- if caption -%}
|
||||
<caption>{{caption}}</caption>
|
||||
{%- endif -%}
|
||||
{%- endblock caption %}
|
||||
{%- block thead %}
|
||||
<thead>
|
||||
{%- block before_head_rows %}{% endblock %}
|
||||
{%- for r in head %}
|
||||
{%- block head_tr scoped %}
|
||||
<tr>
|
||||
{%- for c in r %}
|
||||
{%- if c.is_visible != False %}
|
||||
<{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
</tr>
|
||||
{%- endblock head_tr %}
|
||||
{%- endfor %}
|
||||
{%- block after_head_rows %}{% endblock %}
|
||||
</thead>
|
||||
{%- endblock thead %}
|
||||
{%- block tbody %}
|
||||
<tbody>
|
||||
{% block before_rows %}{% endblock before_rows %}
|
||||
{% for r in body %}
|
||||
{% block tr scoped %}
|
||||
<tr>
|
||||
{% for c in r %}
|
||||
{% if c.is_visible != False %}
|
||||
<{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
|
||||
{% endif %}
|
||||
{%- endfor %}
|
||||
</tr>
|
||||
{% endblock tr %}
|
||||
{%- endfor %}
|
||||
{%- block after_rows %}{%- endblock after_rows %}
|
||||
</tbody>
|
||||
{%- endblock tbody %}
|
||||
</table>
|
||||
{%- endblock table %}
|
||||
{%- block after_table %}{% endblock after_table %}
|
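This Jinja2 template is consumed by the Styler defined in style.py (whose diff is suppressed above). A hedged sketch of overriding one of its blocks via Styler.from_custom_template; the templates directory and myhtml.tpl file are hypothetical, and jinja2 must be installed:

import pandas as pd
from pandas.io.formats.style import Styler

# Assumes ./templates/myhtml.tpl exists and contains, e.g.:
#   {% extends "html.tpl" %}
#   {% block caption %}<caption>my caption</caption>{% endblock %}
MyStyler = Styler.from_custom_template("templates", "myhtml.tpl")
html = MyStyler(pd.DataFrame({"a": [1, 2]})).render()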
210
venv/lib/python3.6/site-packages/pandas/io/gbq.py
Normal file
@@ -0,0 +1,210 @@
|
||||
""" Google BigQuery support """
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
|
||||
|
||||
def _try_import():
|
||||
# since pandas is a dependency of pandas-gbq
|
||||
# we need to import on first use
|
||||
msg = (
|
||||
"pandas-gbq is required to load data from Google BigQuery. "
|
||||
"See the docs: https://pandas-gbq.readthedocs.io."
|
||||
)
|
||||
pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
|
||||
return pandas_gbq
|
||||
|
||||
|
||||
def read_gbq(
|
||||
query,
|
||||
project_id=None,
|
||||
index_col=None,
|
||||
col_order=None,
|
||||
reauth=False,
|
||||
auth_local_webserver=False,
|
||||
dialect=None,
|
||||
location=None,
|
||||
configuration=None,
|
||||
credentials=None,
|
||||
use_bqstorage_api=None,
|
||||
private_key=None,
|
||||
verbose=None,
|
||||
):
|
||||
"""
|
||||
Load data from Google BigQuery.
|
||||
|
||||
This function requires the `pandas-gbq package
|
||||
<https://pandas-gbq.readthedocs.io>`__.
|
||||
|
||||
See the `How to authenticate with Google BigQuery
|
||||
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
|
||||
guide for authentication instructions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : str
|
||||
SQL-Like Query to return data values.
|
||||
project_id : str, optional
|
||||
Google BigQuery Account project ID. Optional when available from
|
||||
the environment.
|
||||
index_col : str, optional
|
||||
Name of result column to use for index in results DataFrame.
|
||||
col_order : list(str), optional
|
||||
List of BigQuery column names in the desired order for results
|
||||
DataFrame.
|
||||
reauth : boolean, default False
|
||||
Force Google BigQuery to re-authenticate the user. This is useful
|
||||
if multiple accounts are used.
|
||||
auth_local_webserver : boolean, default False
|
||||
Use the `local webserver flow`_ instead of the `console flow`_
|
||||
when getting user credentials.
|
||||
|
||||
.. _local webserver flow:
|
||||
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
|
||||
.. _console flow:
|
||||
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
|
||||
|
||||
*New in version 0.2.0 of pandas-gbq*.
|
||||
dialect : str, default 'legacy'
|
||||
Note: The default value is changing to 'standard' in a future version.
|
||||
|
||||
SQL syntax dialect to use. Value can be one of:
|
||||
|
||||
``'legacy'``
|
||||
Use BigQuery's legacy SQL dialect. For more information see
|
||||
`BigQuery Legacy SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
|
||||
``'standard'``
|
||||
Use BigQuery's standard SQL, which is
|
||||
compliant with the SQL 2011 standard. For more information
|
||||
see `BigQuery Standard SQL Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
|
||||
|
||||
.. versionchanged:: 0.24.0
|
||||
location : str, optional
|
||||
Location where the query job should run. See the `BigQuery locations
|
||||
documentation
|
||||
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
|
||||
list of available locations. The location must match that of any
|
||||
datasets used in the query.
|
||||
|
||||
*New in version 0.5.0 of pandas-gbq*.
|
||||
configuration : dict, optional
|
||||
Query config parameters for job processing.
|
||||
For example:
|
||||
|
||||
configuration = {'query': {'useQueryCache': False}}
|
||||
|
||||
For more information see `BigQuery REST API Reference
|
||||
<https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
|
||||
credentials : google.auth.credentials.Credentials, optional
|
||||
Credentials for accessing Google APIs. Use this parameter to override
|
||||
default credentials, such as to use Compute Engine
|
||||
:class:`google.auth.compute_engine.Credentials` or Service Account
|
||||
:class:`google.oauth2.service_account.Credentials` directly.
|
||||
|
||||
*New in version 0.8.0 of pandas-gbq*.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
use_bqstorage_api : bool, default False
|
||||
Use the `BigQuery Storage API
|
||||
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
|
||||
download query results quickly, but at an increased cost. To use this
|
||||
API, first `enable it in the Cloud Console
|
||||
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
|
||||
You must also have the `bigquery.readsessions.create
|
||||
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
|
||||
permission on the project you are billing queries to.
|
||||
|
||||
This feature requires version 0.10.0 or later of the ``pandas-gbq``
|
||||
package. It also requires the ``google-cloud-bigquery-storage`` and
|
||||
``fastavro`` packages.
|
||||
|
||||
.. versionadded:: 0.25.0
|
||||
private_key : str, deprecated
|
||||
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
|
||||
parameter and
|
||||
:func:`google.oauth2.service_account.Credentials.from_service_account_info`
|
||||
or
|
||||
:func:`google.oauth2.service_account.Credentials.from_service_account_file`
|
||||
instead.
|
||||
|
||||
Service account private key in JSON format. Can be file path
|
||||
or string contents. This is useful for remote server
|
||||
authentication (e.g. Jupyter/IPython notebook on a remote host).
|
||||
verbose : None, deprecated
|
||||
Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
|
||||
adjust verbosity instead
|
||||
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df: DataFrame
|
||||
DataFrame representing results of query.
|
||||
|
||||
See Also
|
||||
--------
|
||||
pandas_gbq.read_gbq : This function in the pandas-gbq library.
|
||||
DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
|
||||
"""
|
||||
pandas_gbq = _try_import()
|
||||
|
||||
kwargs = {}
|
||||
|
||||
# START: new kwargs. Don't populate unless explicitly set.
|
||||
if use_bqstorage_api is not None:
|
||||
kwargs["use_bqstorage_api"] = use_bqstorage_api
|
||||
# END: new kwargs
|
||||
|
||||
# START: deprecated kwargs. Don't populate unless explicitly set.
|
||||
if verbose is not None:
|
||||
kwargs["verbose"] = verbose
|
||||
|
||||
if private_key is not None:
|
||||
kwargs["private_key"] = private_key
|
||||
# END: deprecated kwargs
|
||||
|
||||
return pandas_gbq.read_gbq(
|
||||
query,
|
||||
project_id=project_id,
|
||||
index_col=index_col,
|
||||
col_order=col_order,
|
||||
reauth=reauth,
|
||||
auth_local_webserver=auth_local_webserver,
|
||||
dialect=dialect,
|
||||
location=location,
|
||||
configuration=configuration,
|
||||
credentials=credentials,
|
||||
**kwargs
|
||||
)
|
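read_gbq itself only forwards to pandas_gbq after filtering the new and deprecated keywords. A hedged usage sketch; the project id is a placeholder, the public-dataset query is purely illustrative, and pandas-gbq plus valid Google credentials are assumed:

import pandas as pd

df = pd.read_gbq(
    "SELECT name, SUM(number) AS total "
    "FROM `bigquery-public-data.usa_names.usa_1910_2013` "
    "GROUP BY name ORDER BY total DESC LIMIT 10",
    project_id="my-project",   # placeholder
    dialect="standard",        # avoid the legacy-dialect default warning
)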
||||
|
||||
|
||||
def to_gbq(
|
||||
dataframe,
|
||||
destination_table,
|
||||
project_id=None,
|
||||
chunksize=None,
|
||||
reauth=False,
|
||||
if_exists="fail",
|
||||
auth_local_webserver=False,
|
||||
table_schema=None,
|
||||
location=None,
|
||||
progress_bar=True,
|
||||
credentials=None,
|
||||
verbose=None,
|
||||
private_key=None,
|
||||
):
|
||||
pandas_gbq = _try_import()
|
||||
pandas_gbq.to_gbq(
|
||||
dataframe,
|
||||
destination_table,
|
||||
project_id=project_id,
|
||||
chunksize=chunksize,
|
||||
reauth=reauth,
|
||||
if_exists=if_exists,
|
||||
auth_local_webserver=auth_local_webserver,
|
||||
table_schema=table_schema,
|
||||
location=location,
|
||||
progress_bar=progress_bar,
|
||||
credentials=credentials,
|
||||
verbose=verbose,
|
||||
private_key=private_key,
|
||||
)
|
18
venv/lib/python3.6/site-packages/pandas/io/gcs.py
Normal file
@@ -0,0 +1,18 @@
""" GCS support for remote file interactivity """
from pandas.compat._optional import import_optional_dependency

gcsfs = import_optional_dependency(
    "gcsfs", extra="The gcsfs library is required to handle GCS files"
)


def get_filepath_or_buffer(
    filepath_or_buffer, encoding=None, compression=None, mode=None
):

    if mode is None:
        mode = "rb"

    fs = gcsfs.GCSFileSystem()
    filepath_or_buffer = fs.open(filepath_or_buffer, mode)
    return filepath_or_buffer, None, compression, True
|
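This helper is not called directly; pandas' generic get_filepath_or_buffer dispatches to it for gs:// paths. A hedged sketch (bucket and object names are placeholders; gcsfs and credentials are assumed):

import pandas as pd

df = pd.read_csv("gs://my-bucket/data.csv")  # routed through pandas.io.gcs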
1106
venv/lib/python3.6/site-packages/pandas/io/html.py
Normal file
File diff suppressed because it is too large
12
venv/lib/python3.6/site-packages/pandas/io/json/__init__.py
Normal file
@@ -0,0 +1,12 @@
from pandas.io.json._json import dumps, loads, read_json, to_json
from pandas.io.json._normalize import json_normalize
from pandas.io.json._table_schema import build_table_schema

__all__ = [
    "dumps",
    "loads",
    "read_json",
    "to_json",
    "json_normalize",
    "build_table_schema",
]
1188
venv/lib/python3.6/site-packages/pandas/io/json/_json.py
Normal file
File diff suppressed because it is too large
343
venv/lib/python3.6/site-packages/pandas/io/json/_normalize.py
Normal file
@@ -0,0 +1,343 @@
|
||||
# ---------------------------------------------------------------------
|
||||
# JSON normalization routines
|
||||
|
||||
from collections import defaultdict
|
||||
import copy
|
||||
from typing import DefaultDict, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs.writers import convert_json_to_lines
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
def convert_to_line_delimits(s):
|
||||
"""
|
||||
Helper function that converts JSON lists to line delimited JSON.
|
||||
"""
|
||||
|
||||
# Determine whether we have a JSON list to turn into lines; otherwise just
# return the JSON object as-is (only lists can be converted).
|
||||
if not s[0] == "[" and s[-1] == "]":
|
||||
return s
|
||||
s = s[1:-1]
|
||||
|
||||
return convert_json_to_lines(s)
|
||||
|
||||
|
||||
def nested_to_record(
|
||||
ds,
|
||||
prefix: str = "",
|
||||
sep: str = ".",
|
||||
level: int = 0,
|
||||
max_level: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
A simplified json_normalize
|
||||
|
||||
Converts a nested dict into a flat dict ("record"); unlike json_normalize,
it does not attempt to extract a subset of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ds : dict or list of dicts
|
||||
prefix: the prefix, optional, default: ""
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep,
|
||||
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
level: int, optional, default: 0
|
||||
The number of levels in the json string.
|
||||
|
||||
max_level: int, optional, default: None
|
||||
The max depth to normalize.
|
||||
|
||||
.. versionadded:: 0.25.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
d - dict or list of dicts, matching `ds`
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
|
||||
nested=dict(e=dict(c=1,d=2),d=2)))
|
||||
Out[52]:
|
||||
{'dict1.c': 1,
|
||||
'dict1.d': 2,
|
||||
'flat1': 1,
|
||||
'nested.d': 2,
|
||||
'nested.e.c': 1,
|
||||
'nested.e.d': 2}
|
||||
"""
|
||||
singleton = False
|
||||
if isinstance(ds, dict):
|
||||
ds = [ds]
|
||||
singleton = True
|
||||
new_ds = []
|
||||
for d in ds:
|
||||
new_d = copy.deepcopy(d)
|
||||
for k, v in d.items():
|
||||
# each key gets renamed with prefix
|
||||
if not isinstance(k, str):
|
||||
k = str(k)
|
||||
if level == 0:
|
||||
newkey = k
|
||||
else:
|
||||
newkey = prefix + sep + k
|
||||
|
||||
# flatten if type is dict and
|
||||
# current dict level < maximum level provided and
|
||||
# only dicts gets recurse-flattened
|
||||
# only at level>1 do we rename the rest of the keys
|
||||
if not isinstance(v, dict) or (
|
||||
max_level is not None and level >= max_level
|
||||
):
|
||||
if level != 0: # so we skip copying for top level, common case
|
||||
v = new_d.pop(k)
|
||||
new_d[newkey] = v
|
||||
continue
|
||||
else:
|
||||
v = new_d.pop(k)
|
||||
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
|
||||
new_ds.append(new_d)
|
||||
|
||||
if singleton:
|
||||
return new_ds[0]
|
||||
return new_ds
|
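A small example of what nested_to_record produces on its own, matching the docstring above (pandas only):

from pandas.io.json._normalize import nested_to_record

record = nested_to_record({"flat1": 1, "dict1": {"c": 1, "d": 2}})
print(record)  # {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2}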
||||
|
||||
|
||||
def json_normalize(
|
||||
data: Union[Dict, List[Dict]],
|
||||
record_path: Optional[Union[str, List]] = None,
|
||||
meta: Optional[Union[str, List]] = None,
|
||||
meta_prefix: Optional[str] = None,
|
||||
record_prefix: Optional[str] = None,
|
||||
errors: Optional[str] = "raise",
|
||||
sep: str = ".",
|
||||
max_level: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
Normalize semi-structured JSON data into a flat table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : dict or list of dicts
|
||||
Unserialized JSON objects.
|
||||
record_path : str or list of str, default None
|
||||
Path in each object to list of records. If not passed, data will be
|
||||
assumed to be an array of records.
|
||||
meta : list of paths (str or list of str), default None
|
||||
Fields to use as metadata for each record in resulting table.
|
||||
meta_prefix : str, default None
|
||||
If not None, prefix records with dotted path, e.g. foo.bar.field if
|
||||
meta is ['foo', 'bar'].
|
||||
record_prefix : str, default None
|
||||
If not None, prefix records with dotted path, e.g. foo.bar.field if
|
||||
path to records is ['foo', 'bar'].
|
||||
errors : {'raise', 'ignore'}, default 'raise'
|
||||
Configures error handling.
|
||||
|
||||
* 'ignore' : will ignore KeyError if keys listed in meta are not
|
||||
always present.
|
||||
* 'raise' : will raise KeyError if keys listed in meta are not
|
||||
always present.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
sep : str, default '.'
|
||||
Nested records will generate names separated by sep.
|
||||
e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
max_level : int, default None
|
||||
Max number of levels (depth of dict) to normalize.
If None, normalizes all levels.
|
||||
|
||||
.. versionadded:: 0.25.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
frame : DataFrame
|
||||
Normalize semi-structured JSON data into a flat table.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from pandas.io.json import json_normalize
|
||||
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
|
||||
... {'name': {'given': 'Mose', 'family': 'Regner'}},
|
||||
... {'id': 2, 'name': 'Faye Raker'}]
|
||||
>>> json_normalize(data)
|
||||
id name name.family name.first name.given name.last
|
||||
0 1.0 NaN NaN Coleen NaN Volk
|
||||
1 NaN NaN Regner NaN Mose NaN
|
||||
2 2.0 Faye Raker NaN NaN NaN NaN
|
||||
|
||||
>>> data = [{'id': 1,
|
||||
... 'name': "Cole Volk",
|
||||
... 'fitness': {'height': 130, 'weight': 60}},
|
||||
... {'name': "Mose Reg",
|
||||
... 'fitness': {'height': 130, 'weight': 60}},
|
||||
... {'id': 2, 'name': 'Faye Raker',
|
||||
... 'fitness': {'height': 130, 'weight': 60}}]
|
||||
>>> json_normalize(data, max_level=0)
|
||||
fitness id name
|
||||
0 {'height': 130, 'weight': 60} 1.0 Cole Volk
|
||||
1 {'height': 130, 'weight': 60} NaN Mose Reg
|
||||
2 {'height': 130, 'weight': 60} 2.0 Faye Raker
|
||||
|
||||
Normalizes nested data up to level 1.
|
||||
|
||||
>>> data = [{'id': 1,
|
||||
... 'name': "Cole Volk",
|
||||
... 'fitness': {'height': 130, 'weight': 60}},
|
||||
... {'name': "Mose Reg",
|
||||
... 'fitness': {'height': 130, 'weight': 60}},
|
||||
... {'id': 2, 'name': 'Faye Raker',
|
||||
... 'fitness': {'height': 130, 'weight': 60}}]
|
||||
>>> json_normalize(data, max_level=1)
|
||||
fitness.height fitness.weight id name
|
||||
0 130 60 1.0 Cole Volk
|
||||
1 130 60 NaN Mose Reg
|
||||
2 130 60 2.0 Faye Raker
|
||||
|
||||
>>> data = [{'state': 'Florida',
|
||||
... 'shortname': 'FL',
|
||||
... 'info': {'governor': 'Rick Scott'},
|
||||
... 'counties': [{'name': 'Dade', 'population': 12345},
|
||||
... {'name': 'Broward', 'population': 40000},
|
||||
... {'name': 'Palm Beach', 'population': 60000}]},
|
||||
... {'state': 'Ohio',
|
||||
... 'shortname': 'OH',
|
||||
... 'info': {'governor': 'John Kasich'},
|
||||
... 'counties': [{'name': 'Summit', 'population': 1234},
|
||||
... {'name': 'Cuyahoga', 'population': 1337}]}]
|
||||
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
|
||||
... ['info', 'governor']])
|
||||
>>> result
|
||||
name population state shortname info.governor
|
||||
0 Dade 12345 Florida FL Rick Scott
|
||||
1 Broward 40000 Florida FL Rick Scott
|
||||
2 Palm Beach 60000 Florida FL Rick Scott
|
||||
3 Summit 1234 Ohio OH John Kasich
|
||||
4 Cuyahoga 1337 Ohio OH John Kasich
|
||||
|
||||
>>> data = {'A': [1, 2]}
|
||||
>>> json_normalize(data, 'A', record_prefix='Prefix.')
|
||||
Prefix.0
|
||||
0 1
|
||||
1 2
|
||||
|
||||
Returns normalized data with columns prefixed with the given string.
|
||||
"""
|
||||
|
||||
def _pull_field(js, spec):
|
||||
result = js
|
||||
if isinstance(spec, list):
|
||||
for field in spec:
|
||||
result = result[field]
|
||||
else:
|
||||
result = result[spec]
|
||||
|
||||
return result
|
||||
|
||||
if isinstance(data, list) and not data:
|
||||
return DataFrame()
|
||||
|
||||
# A bit of a hackjob
|
||||
if isinstance(data, dict):
|
||||
data = [data]
|
||||
|
||||
if record_path is None:
|
||||
if any([isinstance(x, dict) for x in y.values()] for y in data):
|
||||
# naive normalization, this is idempotent for flat records
|
||||
# and potentially will inflate the data considerably for
|
||||
# deeply nested structures:
|
||||
# {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
|
||||
#
|
||||
# TODO: handle record value which are lists, at least error
|
||||
# reasonably
|
||||
data = nested_to_record(data, sep=sep, max_level=max_level)
|
||||
return DataFrame(data)
|
||||
elif not isinstance(record_path, list):
|
||||
record_path = [record_path]
|
||||
|
||||
if meta is None:
|
||||
meta = []
|
||||
elif not isinstance(meta, list):
|
||||
meta = [meta]
|
||||
|
||||
meta = [m if isinstance(m, list) else [m] for m in meta]
|
||||
|
||||
# Disastrously inefficient for now
|
||||
records = [] # type: List
|
||||
lengths = []
|
||||
|
||||
meta_vals = defaultdict(list) # type: DefaultDict
|
||||
meta_keys = [sep.join(val) for val in meta]
|
||||
|
||||
def _recursive_extract(data, path, seen_meta, level=0):
|
||||
if isinstance(data, dict):
|
||||
data = [data]
|
||||
if len(path) > 1:
|
||||
for obj in data:
|
||||
for val, key in zip(meta, meta_keys):
|
||||
if level + 1 == len(val):
|
||||
seen_meta[key] = _pull_field(obj, val[-1])
|
||||
|
||||
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
|
||||
else:
|
||||
for obj in data:
|
||||
recs = _pull_field(obj, path[0])
|
||||
recs = [
|
||||
nested_to_record(r, sep=sep, max_level=max_level)
|
||||
if isinstance(r, dict)
|
||||
else r
|
||||
for r in recs
|
||||
]
|
||||
|
||||
# For repeating the metadata later
|
||||
lengths.append(len(recs))
|
||||
for val, key in zip(meta, meta_keys):
|
||||
if level + 1 > len(val):
|
||||
meta_val = seen_meta[key]
|
||||
else:
|
||||
try:
|
||||
meta_val = _pull_field(obj, val[level:])
|
||||
except KeyError as e:
|
||||
if errors == "ignore":
|
||||
meta_val = np.nan
|
||||
else:
|
||||
raise KeyError(
|
||||
"Try running with "
|
||||
"errors='ignore' as key "
|
||||
"{err} is not always present".format(err=e)
|
||||
)
|
||||
meta_vals[key].append(meta_val)
|
||||
records.extend(recs)
|
||||
|
||||
_recursive_extract(data, record_path, {}, level=0)
|
||||
|
||||
result = DataFrame(records)
|
||||
|
||||
if record_prefix is not None:
|
||||
result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
|
||||
|
||||
# Data types, a problem
|
||||
for k, v in meta_vals.items():
|
||||
if meta_prefix is not None:
|
||||
k = meta_prefix + k
|
||||
|
||||
if k in result:
|
||||
raise ValueError(
|
||||
"Conflicting metadata name {name}, "
|
||||
"need distinguishing prefix ".format(name=k)
|
||||
)
|
||||
result[k] = np.array(v, dtype=object).repeat(lengths)
|
||||
return result
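A minimal usage sketch for the normalizer defined above. The sample records and field names are invented for illustration, and the import path assumes this pandas layout (pandas.io.json.json_normalize).

# Illustrative sketch only -- not part of the pandas source; the records
# and field names below are invented for demonstration.
from pandas.io.json import json_normalize

data = [
    {"state": "Florida",
     "shortname": "FL",
     "info": {"governor": "Rick Scott"},
     "counties": [{"name": "Dade", "population": 12345},
                  {"name": "Broward", "population": 40000}]},
    {"state": "Ohio",
     "shortname": "OH",
     "info": {"governor": "John Kasich"},
     "counties": [{"name": "Summit", "population": 1234}]},
]

# record_path selects the nested list that becomes the rows; meta pulls
# per-record fields (including the nested info.governor) alongside them.
result = json_normalize(data, record_path="counties",
                        meta=["state", "shortname", ["info", "governor"]])
print(result)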
|
338
venv/lib/python3.6/site-packages/pandas/io/json/_table_schema.py
Normal file
@@ -0,0 +1,338 @@
|
||||
"""
|
||||
Table Schema builders
|
||||
|
||||
http://specs.frictionlessdata.io/json-table-schema/
|
||||
"""
|
||||
import warnings
|
||||
|
||||
import pandas._libs.json as json
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_bool_dtype,
|
||||
is_categorical_dtype,
|
||||
is_datetime64_dtype,
|
||||
is_datetime64tz_dtype,
|
||||
is_integer_dtype,
|
||||
is_numeric_dtype,
|
||||
is_period_dtype,
|
||||
is_string_dtype,
|
||||
is_timedelta64_dtype,
|
||||
)
|
||||
|
||||
from pandas import DataFrame
|
||||
from pandas.api.types import CategoricalDtype
|
||||
import pandas.core.common as com
|
||||
|
||||
loads = json.loads
|
||||
|
||||
|
||||
def as_json_table_type(x):
|
||||
"""
|
||||
Convert a NumPy / pandas type to its corresponding json_table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array or dtype
|
||||
|
||||
Returns
|
||||
-------
|
||||
t : str
|
||||
the Table Schema data types
|
||||
|
||||
Notes
|
||||
-----
|
||||
This table shows the relationship between NumPy / pandas dtypes,
|
||||
and Table Schema dtypes.
|
||||
|
||||
===============  =================
Pandas type      Table Schema type
===============  =================
int64            integer
float64          number
bool             boolean
datetime64[ns]   datetime
timedelta64[ns]  duration
object           str
categorical      any
===============  =================
"""
|
||||
if is_integer_dtype(x):
|
||||
return "integer"
|
||||
elif is_bool_dtype(x):
|
||||
return "boolean"
|
||||
elif is_numeric_dtype(x):
|
||||
return "number"
|
||||
elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
|
||||
return "datetime"
|
||||
elif is_timedelta64_dtype(x):
|
||||
return "duration"
|
||||
elif is_categorical_dtype(x):
|
||||
return "any"
|
||||
elif is_string_dtype(x):
|
||||
return "string"
|
||||
else:
|
||||
return "any"
|
||||
|
||||
|
||||
def set_default_names(data):
|
||||
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
|
||||
if com._all_not_none(*data.index.names):
|
||||
nms = data.index.names
|
||||
if len(nms) == 1 and data.index.name == "index":
|
||||
warnings.warn("Index name of 'index' is not round-trippable")
|
||||
elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
|
||||
warnings.warn(
|
||||
"Index names beginning with 'level_' are not " "round-trippable"
|
||||
)
|
||||
return data
|
||||
|
||||
data = data.copy()
|
||||
if data.index.nlevels > 1:
|
||||
names = [
|
||||
name if name is not None else "level_{}".format(i)
|
||||
for i, name in enumerate(data.index.names)
|
||||
]
|
||||
data.index.names = names
|
||||
else:
|
||||
data.index.name = data.index.name or "index"
|
||||
return data
|
||||
|
||||
|
||||
def convert_pandas_type_to_json_field(arr, dtype=None):
|
||||
dtype = dtype or arr.dtype
|
||||
if arr.name is None:
|
||||
name = "values"
|
||||
else:
|
||||
name = arr.name
|
||||
field = {"name": name, "type": as_json_table_type(dtype)}
|
||||
|
||||
if is_categorical_dtype(arr):
|
||||
if hasattr(arr, "categories"):
|
||||
cats = arr.categories
|
||||
ordered = arr.ordered
|
||||
else:
|
||||
cats = arr.cat.categories
|
||||
ordered = arr.cat.ordered
|
||||
field["constraints"] = {"enum": list(cats)}
|
||||
field["ordered"] = ordered
|
||||
elif is_period_dtype(arr):
|
||||
field["freq"] = arr.freqstr
|
||||
elif is_datetime64tz_dtype(arr):
|
||||
if hasattr(arr, "dt"):
|
||||
field["tz"] = arr.dt.tz.zone
|
||||
else:
|
||||
field["tz"] = arr.tz.zone
|
||||
return field
|
||||
|
||||
|
||||
def convert_json_field_to_pandas_type(field):
|
||||
"""
|
||||
Converts a JSON field descriptor into its corresponding NumPy / pandas type
|
||||
|
||||
Parameters
|
||||
----------
|
||||
field
|
||||
A JSON field descriptor
|
||||
|
||||
Returns
|
||||
-------
|
||||
dtype
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the type of the provided field is unknown or currently unsupported
|
||||
|
||||
Examples
--------
>>> convert_json_field_to_pandas_type({'name': 'an_int',
                                       'type': 'integer'})
'int64'
>>> convert_json_field_to_pandas_type({'name': 'a_categorical',
                                       'type': 'any',
                                       'constraints': {'enum': [
                                                      'a', 'b', 'c']},
                                       'ordered': True})
'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime',
                                       'type': 'datetime'})
'datetime64[ns]'
>>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
                                       'type': 'datetime',
                                       'tz': 'US/Central'})
'datetime64[ns, US/Central]'
"""
|
||||
typ = field["type"]
|
||||
if typ == "string":
|
||||
return "object"
|
||||
elif typ == "integer":
|
||||
return "int64"
|
||||
elif typ == "number":
|
||||
return "float64"
|
||||
elif typ == "boolean":
|
||||
return "bool"
|
||||
elif typ == "duration":
|
||||
return "timedelta64"
|
||||
elif typ == "datetime":
|
||||
if field.get("tz"):
|
||||
return "datetime64[ns, {tz}]".format(tz=field["tz"])
|
||||
else:
|
||||
return "datetime64[ns]"
|
||||
elif typ == "any":
|
||||
if "constraints" in field and "ordered" in field:
|
||||
return CategoricalDtype(
|
||||
categories=field["constraints"]["enum"], ordered=field["ordered"]
|
||||
)
|
||||
else:
|
||||
return "object"
|
||||
|
||||
raise ValueError("Unsupported or invalid field type: {}".format(typ))
|
||||
|
||||
|
||||
def build_table_schema(data, index=True, primary_key=None, version=True):
|
||||
"""
|
||||
Create a Table schema from ``data``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Series, DataFrame
|
||||
index : bool, default True
|
||||
Whether to include ``data.index`` in the schema.
|
||||
primary_key : bool or None, default None
|
||||
column names to designate as the primary key.
|
||||
The default `None` will set `'primaryKey'` to the index
|
||||
level or levels if the index is unique.
|
||||
version : bool, default True
|
||||
Whether to include a field `pandas_version` with the version
|
||||
of pandas that generated the schema.
|
||||
|
||||
Returns
|
||||
-------
|
||||
schema : dict
|
||||
|
||||
Notes
|
||||
-----
|
||||
See `as_json_table_type` for conversion types.
|
||||
Timedeltas are converted to ISO 8601 duration format with
|
||||
9 decimal places after the seconds field for nanosecond precision.
|
||||
|
||||
Categoricals are converted to the `any` dtype, and use the `enum` field
|
||||
constraint to list the allowed values. The `ordered` attribute is included
|
||||
in an `ordered` field.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = pd.DataFrame(
|
||||
... {'A': [1, 2, 3],
|
||||
... 'B': ['a', 'b', 'c'],
|
||||
... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
|
||||
... }, index=pd.Index(range(3), name='idx'))
|
||||
>>> build_table_schema(df)
|
||||
{'fields': [{'name': 'idx', 'type': 'integer'},
|
||||
{'name': 'A', 'type': 'integer'},
|
||||
{'name': 'B', 'type': 'string'},
|
||||
{'name': 'C', 'type': 'datetime'}],
|
||||
'pandas_version': '0.20.0',
|
||||
'primaryKey': ['idx']}
|
||||
"""
|
||||
if index is True:
|
||||
data = set_default_names(data)
|
||||
|
||||
schema = {}
|
||||
fields = []
|
||||
|
||||
if index:
|
||||
if data.index.nlevels > 1:
|
||||
for level in data.index.levels:
|
||||
fields.append(convert_pandas_type_to_json_field(level))
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data.index))
|
||||
|
||||
if data.ndim > 1:
|
||||
for column, s in data.items():
|
||||
fields.append(convert_pandas_type_to_json_field(s))
|
||||
else:
|
||||
fields.append(convert_pandas_type_to_json_field(data))
|
||||
|
||||
schema["fields"] = fields
|
||||
if index and data.index.is_unique and primary_key is None:
|
||||
if data.index.nlevels == 1:
|
||||
schema["primaryKey"] = [data.index.name]
|
||||
else:
|
||||
schema["primaryKey"] = data.index.names
|
||||
elif primary_key is not None:
|
||||
schema["primaryKey"] = primary_key
|
||||
|
||||
if version:
|
||||
schema["pandas_version"] = "0.20.0"
|
||||
return schema
|
||||
|
||||
|
||||
def parse_table_schema(json, precise_float):
|
||||
"""
|
||||
Builds a DataFrame from a given schema
|
||||
|
||||
Parameters
|
||||
----------
|
||||
json : str
|
||||
A JSON table schema
|
||||
precise_float : boolean
|
||||
Flag controlling precision when decoding string to double values, as
|
||||
dictated by ``read_json``
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : DataFrame
|
||||
|
||||
Raises
|
||||
------
|
||||
NotImplementedError
|
||||
If the JSON table schema contains either timezone or timedelta data
|
||||
|
||||
Notes
|
||||
-----
|
||||
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
|
||||
name-less :class:`Index`, this function sets the name of the returned
|
||||
:class:`DataFrame` to ``None`` when said string is encountered with a
|
||||
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
|
||||
applies to any strings beginning with 'level_'. Therefore, an
|
||||
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
|
||||
with 'level_' are not supported.
|
||||
|
||||
See Also
|
||||
--------
|
||||
build_table_schema : Inverse function.
|
||||
pandas.read_json
|
||||
"""
|
||||
table = loads(json, precise_float=precise_float)
|
||||
col_order = [field["name"] for field in table["schema"]["fields"]]
|
||||
df = DataFrame(table["data"], columns=col_order)[col_order]
|
||||
|
||||
dtypes = {
|
||||
field["name"]: convert_json_field_to_pandas_type(field)
|
||||
for field in table["schema"]["fields"]
|
||||
}
|
||||
|
||||
# Cannot directly use as_type with timezone data on object; raise for now
|
||||
if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()):
|
||||
raise NotImplementedError('orient="table" cannot yet read timezone data')
|
||||
|
||||
# No ISO constructor for Timedelta as of yet, so need to raise
|
||||
if "timedelta64" in dtypes.values():
|
||||
raise NotImplementedError(
    'orient="table" cannot yet read ISO-formatted Timedelta data'
)
|
||||
|
||||
df = df.astype(dtypes)
|
||||
|
||||
if "primaryKey" in table["schema"]:
|
||||
df = df.set_index(table["schema"]["primaryKey"])
|
||||
if len(df.index.names) == 1:
|
||||
if df.index.name == "index":
|
||||
df.index.name = None
|
||||
else:
|
||||
df.index.names = [
|
||||
None if x.startswith("level_") else x for x in df.index.names
|
||||
]
|
||||
|
||||
return df
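The usual entry point for this module is the table orient on the public JSON API. A hedged round-trip sketch (the frame contents and index name are invented):

# Illustrative sketch only -- demonstrates the public orient='table' round trip
# that build_table_schema/parse_table_schema support.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]},
                  index=pd.Index([10, 20, 30], name="id"))

payload = df.to_json(orient="table")          # embeds the Table Schema plus the data
roundtripped = pd.read_json(payload, orient="table")
print(roundtripped.equals(df))                # True for this simple frame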
|
@@ -0,0 +1,52 @@
|
||||
# coding: utf-8
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from pandas.io.msgpack.exceptions import * # noqa
|
||||
from pandas.io.msgpack._version import version # noqa
|
||||
|
||||
|
||||
class ExtType(namedtuple("ExtType", "code data")):
|
||||
"""ExtType represents ext type in msgpack."""
|
||||
|
||||
def __new__(cls, code, data):
|
||||
if not isinstance(code, int):
|
||||
raise TypeError("code must be int")
|
||||
if not isinstance(data, bytes):
|
||||
raise TypeError("data must be bytes")
|
||||
if not 0 <= code <= 127:
|
||||
raise ValueError("code must be 0~127")
|
||||
return super().__new__(cls, code, data)
|
||||
|
||||
|
||||
import os # noqa
|
||||
|
||||
from pandas.io.msgpack._packer import Packer # noqa
|
||||
from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa
|
||||
|
||||
|
||||
def pack(o, stream, **kwargs):
|
||||
"""
|
||||
Pack object `o` and write it to `stream`
|
||||
|
||||
See :class:`Packer` for options.
|
||||
"""
|
||||
packer = Packer(**kwargs)
|
||||
stream.write(packer.pack(o))
|
||||
|
||||
|
||||
def packb(o, **kwargs):
|
||||
"""
|
||||
Pack object `o` and return packed bytes
|
||||
|
||||
See :class:`Packer` for options.
|
||||
"""
|
||||
return Packer(**kwargs).pack(o)
|
||||
|
||||
|
||||
# alias for compatibility to simplejson/marshal/pickle.
|
||||
load = unpack
|
||||
loads = unpackb
|
||||
|
||||
dump = pack
|
||||
dumps = packb
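A minimal sketch of the packb/unpackb pair exposed above, assuming the vendored packer accepts an explicit encoding keyword as the wrappers in pandas.io.packers suggest.

# Illustrative sketch only -- exercises the vendored msgpack layer directly;
# encoding is passed explicitly so str values round-trip as str on Python 3.
from pandas.io.msgpack import packb, unpackb

payload = packb({"answer": 42, "tags": ["a", "b"]}, encoding="utf-8")
print(type(payload))                       # <class 'bytes'>
print(unpackb(payload, encoding="utf-8"))  # nested lists may come back as tuples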
|
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
||||
version = (0, 4, 6)
|
@@ -0,0 +1,31 @@
|
||||
class UnpackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class BufferFull(UnpackException):
|
||||
pass
|
||||
|
||||
|
||||
class OutOfData(UnpackException):
|
||||
pass
|
||||
|
||||
|
||||
class UnpackValueError(UnpackException, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
class ExtraData(ValueError):
|
||||
def __init__(self, unpacked, extra):
|
||||
self.unpacked = unpacked
|
||||
self.extra = extra
|
||||
|
||||
def __str__(self):
|
||||
return "unpack(b) received extra data."
|
||||
|
||||
|
||||
class PackException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class PackValueError(PackException, ValueError):
|
||||
pass
|
891
venv/lib/python3.6/site-packages/pandas/io/packers.py
Normal file
@@ -0,0 +1,891 @@
|
||||
"""
|
||||
Msgpack serializer support for reading and writing pandas data structures
|
||||
to disk
|
||||
|
||||
portions of msgpack_numpy package, by Lev Givon were incorporated
|
||||
into this module (and tests_packers.py)
|
||||
|
||||
License
|
||||
=======
|
||||
|
||||
Copyright (c) 2013, Lev Givon.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials provided
|
||||
with the distribution.
|
||||
* Neither the name of Lev Givon nor the names of any
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"""
|
||||
|
||||
from datetime import date, datetime, timedelta
|
||||
from io import BytesIO
|
||||
import os
|
||||
import warnings
|
||||
|
||||
from dateutil.parser import parse
|
||||
import numpy as np
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import PerformanceWarning
|
||||
from pandas.util._move import (
|
||||
BadMove as _BadMove,
|
||||
move_into_mutable_buffer as _move_into_mutable_buffer,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_categorical_dtype,
|
||||
is_datetime64tz_dtype,
|
||||
is_object_dtype,
|
||||
needs_i8_conversion,
|
||||
pandas_dtype,
|
||||
)
|
||||
|
||||
from pandas import ( # noqa:F401
|
||||
Categorical,
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Float64Index,
|
||||
Index,
|
||||
Int64Index,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
NaT,
|
||||
Period,
|
||||
PeriodIndex,
|
||||
RangeIndex,
|
||||
Series,
|
||||
TimedeltaIndex,
|
||||
Timestamp,
|
||||
)
|
||||
from pandas.core import internals
|
||||
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
|
||||
from pandas.core.arrays.sparse import BlockIndex, IntIndex
|
||||
from pandas.core.generic import NDFrame
|
||||
from pandas.core.internals import BlockManager, _safe_reshape, make_block
|
||||
from pandas.core.sparse.api import SparseDataFrame, SparseSeries
|
||||
|
||||
from pandas.io.common import _stringify_path, get_filepath_or_buffer
|
||||
from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker
|
||||
|
||||
# until we can pass this into our conversion functions,
|
||||
# this is pretty hacky
|
||||
compressor = None
|
||||
|
||||
|
||||
def to_msgpack(path_or_buf, *args, **kwargs):
|
||||
"""
|
||||
msgpack (serialize) object to input file path
|
||||
|
||||
.. deprecated:: 0.25.0
|
||||
|
||||
to_msgpack is deprecated and will be removed in a future version.
|
||||
It is recommended to use pyarrow for on-the-wire transmission of
|
||||
pandas objects.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : str file path, buffer-like, or None
    If None, the packed bytes are returned.
|
||||
args : an object or objects to serialize
|
||||
encoding : encoding for unicode objects
|
||||
append : boolean whether to append to an existing msgpack
|
||||
(default is False)
|
||||
compress : type of compressor (zlib or blosc), default to None (no
|
||||
compression)
|
||||
"""
|
||||
warnings.warn(
|
||||
"to_msgpack is deprecated and will be removed in a "
|
||||
"future version.\n"
|
||||
"It is recommended to use pyarrow for on-the-wire "
|
||||
"transmission of pandas objects.",
|
||||
FutureWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
global compressor
|
||||
compressor = kwargs.pop("compress", None)
|
||||
append = kwargs.pop("append", None)
|
||||
if append:
|
||||
mode = "a+b"
|
||||
else:
|
||||
mode = "wb"
|
||||
|
||||
def writer(fh):
|
||||
for a in args:
|
||||
fh.write(pack(a, **kwargs))
|
||||
|
||||
path_or_buf = _stringify_path(path_or_buf)
|
||||
if isinstance(path_or_buf, str):
|
||||
try:
|
||||
with open(path_or_buf, mode) as fh:
|
||||
writer(fh)
|
||||
except FileNotFoundError:
|
||||
msg = "File b'{}' does not exist".format(path_or_buf)
|
||||
raise FileNotFoundError(msg)
|
||||
elif path_or_buf is None:
|
||||
buf = BytesIO()
|
||||
writer(buf)
|
||||
return buf.getvalue()
|
||||
else:
|
||||
writer(path_or_buf)
|
||||
|
||||
|
||||
def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
|
||||
"""
|
||||
Load msgpack pandas object from the specified
|
||||
file path.
|
||||
|
||||
.. deprecated:: 0.25.0
|
||||
|
||||
read_msgpack is deprecated and will be removed in a future version.
|
||||
It is recommended to use pyarrow for on-the-wire transmission of
|
||||
pandas objects.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : str, path object or file-like object
|
||||
Any valid string path is acceptable. The string could be a URL. Valid
|
||||
URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected.
|
||||
|
||||
If you want to pass in a path object, pandas accepts any
|
||||
``os.PathLike``.
|
||||
|
||||
By file-like object, we refer to objects with a ``read()`` method,
|
||||
such as a file handler (e.g. via builtin ``open`` function) or
|
||||
``StringIO``.
|
||||
encoding : Encoding for decoding msgpack str type
|
||||
iterator : boolean, if True, return an iterator to the unpacker
|
||||
(default is False)
|
||||
|
||||
Returns
|
||||
-------
|
||||
obj : same type as object stored in file
|
||||
|
||||
Notes
|
||||
-----
|
||||
read_msgpack is only guaranteed to be backwards compatible to pandas
|
||||
0.20.3.
|
||||
"""
|
||||
warnings.warn(
|
||||
"The read_msgpack is deprecated and will be removed in a "
|
||||
"future version.\n"
|
||||
"It is recommended to use pyarrow for on-the-wire "
|
||||
"transmission of pandas objects.",
|
||||
FutureWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
|
||||
if iterator:
|
||||
return Iterator(path_or_buf)
|
||||
|
||||
def read(fh):
|
||||
unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
|
||||
if len(unpacked_obj) == 1:
|
||||
return unpacked_obj[0]
|
||||
|
||||
if should_close:
|
||||
try:
|
||||
path_or_buf.close()
|
||||
except IOError:
|
||||
pass
|
||||
return unpacked_obj
|
||||
|
||||
# see if we have an actual file
|
||||
if isinstance(path_or_buf, str):
|
||||
try:
|
||||
with open(path_or_buf, "rb") as fh:
|
||||
return read(fh)
|
||||
except FileNotFoundError:
|
||||
msg = "File b'{}' does not exist".format(path_or_buf)
|
||||
raise FileNotFoundError(msg)
|
||||
|
||||
if isinstance(path_or_buf, bytes):
|
||||
# treat as a binary-like
|
||||
fh = None
|
||||
try:
|
||||
fh = BytesIO(path_or_buf)
|
||||
return read(fh)
|
||||
finally:
|
||||
if fh is not None:
|
||||
fh.close()
|
||||
elif hasattr(path_or_buf, "read") and callable(path_or_buf.read):
|
||||
# treat as a buffer like
|
||||
return read(path_or_buf)
|
||||
|
||||
raise ValueError("path_or_buf needs to be a string file path or file-like")
|
||||
|
||||
|
||||
dtype_dict = {
|
||||
21: np.dtype("M8[ns]"),
|
||||
"datetime64[ns]": np.dtype("M8[ns]"),
|
||||
"datetime64[us]": np.dtype("M8[us]"),
|
||||
22: np.dtype("m8[ns]"),
|
||||
"timedelta64[ns]": np.dtype("m8[ns]"),
|
||||
"timedelta64[us]": np.dtype("m8[us]"),
|
||||
# this is platform int, which we need to remap to np.int64
|
||||
# for compat on windows platforms
|
||||
7: np.dtype("int64"),
|
||||
"category": "category",
|
||||
}
|
||||
|
||||
|
||||
def dtype_for(t):
|
||||
""" return my dtype mapping, whether number or name """
|
||||
if t in dtype_dict:
|
||||
return dtype_dict[t]
|
||||
return np.typeDict.get(t, t)
|
||||
|
||||
|
||||
c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32}
|
||||
|
||||
# windows (32 bit) compat
|
||||
if hasattr(np, "float128"):
|
||||
c2f_dict["complex256"] = np.float128
|
||||
|
||||
|
||||
def c2f(r, i, ctype_name):
|
||||
"""
|
||||
Convert strings to complex number instance with specified numpy type.
|
||||
"""
|
||||
|
||||
ftype = c2f_dict[ctype_name]
|
||||
return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
|
||||
|
||||
|
||||
def convert(values):
|
||||
""" convert the numpy values to a list """
|
||||
|
||||
dtype = values.dtype
|
||||
|
||||
if is_categorical_dtype(values):
|
||||
return values
|
||||
|
||||
elif is_object_dtype(dtype):
|
||||
return values.ravel().tolist()
|
||||
|
||||
if needs_i8_conversion(dtype):
|
||||
values = values.view("i8")
|
||||
v = values.ravel()
|
||||
|
||||
if compressor == "zlib":
|
||||
zlib = import_optional_dependency(
|
||||
"zlib", extra="zlib is required when `compress='zlib'`."
|
||||
)
|
||||
|
||||
# return string arrays like they are
|
||||
if dtype == np.object_:
|
||||
return v.tolist()
|
||||
|
||||
# convert to a bytes array
|
||||
v = v.tostring()
|
||||
return ExtType(0, zlib.compress(v))
|
||||
|
||||
elif compressor == "blosc":
|
||||
blosc = import_optional_dependency(
|
||||
"blosc", extra="zlib is required when `compress='blosc'`."
|
||||
)
|
||||
|
||||
# return string arrays like they are
|
||||
if dtype == np.object_:
|
||||
return v.tolist()
|
||||
|
||||
# convert to a bytes array
|
||||
v = v.tostring()
|
||||
return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))
|
||||
|
||||
# ndarray (on original dtype)
|
||||
return ExtType(0, v.tostring())
|
||||
|
||||
|
||||
def unconvert(values, dtype, compress=None):
|
||||
|
||||
as_is_ext = isinstance(values, ExtType) and values.code == 0
|
||||
|
||||
if as_is_ext:
|
||||
values = values.data
|
||||
|
||||
if is_categorical_dtype(dtype):
|
||||
return values
|
||||
|
||||
elif is_object_dtype(dtype):
|
||||
return np.array(values, dtype=object)
|
||||
|
||||
dtype = pandas_dtype(dtype).base
|
||||
|
||||
if not as_is_ext:
|
||||
values = values.encode("latin1")
|
||||
|
||||
if compress:
|
||||
if compress == "zlib":
|
||||
zlib = import_optional_dependency(
|
||||
"zlib", extra="zlib is required when `compress='zlib'`."
|
||||
)
|
||||
decompress = zlib.decompress
|
||||
elif compress == "blosc":
|
||||
blosc = import_optional_dependency(
|
||||
"blosc", extra="zlib is required when `compress='blosc'`."
|
||||
)
|
||||
decompress = blosc.decompress
|
||||
else:
|
||||
raise ValueError("compress must be one of 'zlib' or 'blosc'")
|
||||
|
||||
try:
|
||||
return np.frombuffer(
|
||||
_move_into_mutable_buffer(decompress(values)), dtype=dtype
|
||||
)
|
||||
except _BadMove as e:
|
||||
# Pull the decompressed data off of the `_BadMove` exception.
|
||||
# We don't just store this in the locals because we want to
|
||||
# minimize the risk of giving users access to a `bytes` object
|
||||
# whose data is also given to a mutable buffer.
|
||||
values = e.args[0]
|
||||
if len(values) > 1:
|
||||
# The empty string and single characters are memoized in many
|
||||
# string creating functions in the capi. This case should not
|
||||
# warn even though we need to make a copy because we are only
|
||||
# copying at most 1 byte.
|
||||
warnings.warn(
|
||||
"copying data after decompressing; this may mean that"
|
||||
" decompress is caching its result",
|
||||
PerformanceWarning,
|
||||
)
|
||||
# fall through to copying `np.fromstring`
|
||||
|
||||
# Copy the bytes into a numpy array.
|
||||
buf = np.frombuffer(values, dtype=dtype)
|
||||
buf = buf.copy() # required to not mutate the original data
|
||||
buf.flags.writeable = True
|
||||
return buf
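A small sketch of the convert/unconvert round trip used for ndarray payloads; with no compressor configured the values travel as an ExtType of raw bytes.

# Illustrative sketch only -- exercises the module-level helpers directly.
import numpy as np
from pandas.io.packers import convert, unconvert

arr = np.arange(5, dtype="int64")
ext = convert(arr)                  # ExtType(code=0, data=<raw bytes>)
back = unconvert(ext, arr.dtype)    # decoded with no compression
print(np.array_equal(back, arr))    # True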
|
||||
|
||||
|
||||
def encode(obj):
|
||||
"""
|
||||
Data encoder
|
||||
"""
|
||||
tobj = type(obj)
|
||||
if isinstance(obj, Index):
|
||||
if isinstance(obj, RangeIndex):
|
||||
return {
|
||||
"typ": "range_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"start": obj._range.start,
|
||||
"stop": obj._range.stop,
|
||||
"step": obj._range.step,
|
||||
}
|
||||
elif isinstance(obj, PeriodIndex):
|
||||
return {
|
||||
"typ": "period_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"freq": getattr(obj, "freqstr", None),
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj.asi8),
|
||||
"compress": compressor,
|
||||
}
|
||||
elif isinstance(obj, DatetimeIndex):
|
||||
tz = getattr(obj, "tz", None)
|
||||
|
||||
# store tz info and data as UTC
|
||||
if tz is not None:
|
||||
tz = tz.zone
|
||||
obj = obj.tz_convert("UTC")
|
||||
return {
|
||||
"typ": "datetime_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj.asi8),
|
||||
"freq": getattr(obj, "freqstr", None),
|
||||
"tz": tz,
|
||||
"compress": compressor,
|
||||
}
|
||||
elif isinstance(obj, (IntervalIndex, IntervalArray)):
|
||||
if isinstance(obj, IntervalIndex):
|
||||
typ = "interval_index"
|
||||
else:
|
||||
typ = "interval_array"
|
||||
return {
|
||||
"typ": typ,
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"left": getattr(obj, "left", None),
|
||||
"right": getattr(obj, "right", None),
|
||||
"closed": getattr(obj, "closed", None),
|
||||
}
|
||||
elif isinstance(obj, MultiIndex):
|
||||
return {
|
||||
"typ": "multi_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"names": getattr(obj, "names", None),
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj.values),
|
||||
"compress": compressor,
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"typ": "index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj.values),
|
||||
"compress": compressor,
|
||||
}
|
||||
|
||||
elif isinstance(obj, Categorical):
|
||||
return {
|
||||
"typ": "category",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"codes": obj.codes,
|
||||
"categories": obj.categories,
|
||||
"ordered": obj.ordered,
|
||||
"compress": compressor,
|
||||
}
|
||||
|
||||
elif isinstance(obj, Series):
|
||||
if isinstance(obj, SparseSeries):
|
||||
raise NotImplementedError("msgpack sparse series is not implemented")
|
||||
# d = {'typ': 'sparse_series',
|
||||
# 'klass': obj.__class__.__name__,
|
||||
# 'dtype': obj.dtype.name,
|
||||
# 'index': obj.index,
|
||||
# 'sp_index': obj.sp_index,
|
||||
# 'sp_values': convert(obj.sp_values),
|
||||
# 'compress': compressor}
|
||||
# for f in ['name', 'fill_value', 'kind']:
|
||||
# d[f] = getattr(obj, f, None)
|
||||
# return d
|
||||
else:
|
||||
return {
|
||||
"typ": "series",
|
||||
"klass": obj.__class__.__name__,
|
||||
"name": getattr(obj, "name", None),
|
||||
"index": obj.index,
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj.values),
|
||||
"compress": compressor,
|
||||
}
|
||||
elif issubclass(tobj, NDFrame):
|
||||
if isinstance(obj, SparseDataFrame):
|
||||
raise NotImplementedError("msgpack sparse frame is not implemented")
|
||||
# d = {'typ': 'sparse_dataframe',
|
||||
# 'klass': obj.__class__.__name__,
|
||||
# 'columns': obj.columns}
|
||||
# for f in ['default_fill_value', 'default_kind']:
|
||||
# d[f] = getattr(obj, f, None)
|
||||
# d['data'] = dict([(name, ss)
|
||||
# for name, ss in obj.items()])
|
||||
# return d
|
||||
else:
|
||||
|
||||
data = obj._data
|
||||
if not data.is_consolidated():
|
||||
data = data.consolidate()
|
||||
|
||||
# the block manager
|
||||
return {
|
||||
"typ": "block_manager",
|
||||
"klass": obj.__class__.__name__,
|
||||
"axes": data.axes,
|
||||
"blocks": [
|
||||
{
|
||||
"locs": b.mgr_locs.as_array,
|
||||
"values": convert(b.values),
|
||||
"shape": b.values.shape,
|
||||
"dtype": b.dtype.name,
|
||||
"klass": b.__class__.__name__,
|
||||
"compress": compressor,
|
||||
}
|
||||
for b in data.blocks
|
||||
],
|
||||
}
|
||||
|
||||
elif (
|
||||
isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64))
|
||||
or obj is NaT
|
||||
):
|
||||
if isinstance(obj, Timestamp):
|
||||
tz = obj.tzinfo
|
||||
if tz is not None:
|
||||
tz = tz.zone
|
||||
freq = obj.freq
|
||||
if freq is not None:
|
||||
freq = freq.freqstr
|
||||
return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz}
|
||||
if obj is NaT:
|
||||
return {"typ": "nat"}
|
||||
elif isinstance(obj, np.timedelta64):
|
||||
return {"typ": "timedelta64", "data": obj.view("i8")}
|
||||
elif isinstance(obj, timedelta):
|
||||
return {
|
||||
"typ": "timedelta",
|
||||
"data": (obj.days, obj.seconds, obj.microseconds),
|
||||
}
|
||||
elif isinstance(obj, np.datetime64):
|
||||
return {"typ": "datetime64", "data": str(obj)}
|
||||
elif isinstance(obj, datetime):
|
||||
return {"typ": "datetime", "data": obj.isoformat()}
|
||||
elif isinstance(obj, date):
|
||||
return {"typ": "date", "data": obj.isoformat()}
|
||||
raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj))
|
||||
elif isinstance(obj, Period):
|
||||
return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr}
|
||||
elif isinstance(obj, Interval):
|
||||
return {
|
||||
"typ": "interval",
|
||||
"left": obj.left,
|
||||
"right": obj.right,
|
||||
"closed": obj.closed,
|
||||
}
|
||||
elif isinstance(obj, BlockIndex):
|
||||
return {
|
||||
"typ": "block_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"blocs": obj.blocs,
|
||||
"blengths": obj.blengths,
|
||||
"length": obj.length,
|
||||
}
|
||||
elif isinstance(obj, IntIndex):
|
||||
return {
|
||||
"typ": "int_index",
|
||||
"klass": obj.__class__.__name__,
|
||||
"indices": obj.indices,
|
||||
"length": obj.length,
|
||||
}
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return {
|
||||
"typ": "ndarray",
|
||||
"shape": obj.shape,
|
||||
"ndim": obj.ndim,
|
||||
"dtype": obj.dtype.name,
|
||||
"data": convert(obj),
|
||||
"compress": compressor,
|
||||
}
|
||||
elif isinstance(obj, np.number):
|
||||
if np.iscomplexobj(obj):
|
||||
return {
|
||||
"typ": "np_scalar",
|
||||
"sub_typ": "np_complex",
|
||||
"dtype": obj.dtype.name,
|
||||
"real": np.real(obj).__repr__(),
|
||||
"imag": np.imag(obj).__repr__(),
|
||||
}
|
||||
else:
|
||||
return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()}
|
||||
elif isinstance(obj, complex):
|
||||
return {
|
||||
"typ": "np_complex",
|
||||
"real": np.real(obj).__repr__(),
|
||||
"imag": np.imag(obj).__repr__(),
|
||||
}
|
||||
|
||||
return obj
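A quick sketch of what the encoder above emits for a scalar, following the Timestamp branch defined earlier.

# Illustrative sketch only.
import pandas as pd
from pandas.io.packers import encode

print(encode(pd.Timestamp("2019-01-01", tz="UTC")))
# e.g. {'typ': 'timestamp', 'value': 1546300800000000000, 'freq': None, 'tz': 'UTC'}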
|
||||
|
||||
|
||||
def decode(obj):
|
||||
"""
|
||||
Decoder for deserializing numpy data types.
|
||||
"""
|
||||
|
||||
typ = obj.get("typ")
|
||||
if typ is None:
|
||||
return obj
|
||||
elif typ == "timestamp":
|
||||
freq = obj["freq"] if "freq" in obj else obj["offset"]
|
||||
return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
|
||||
elif typ == "nat":
|
||||
return NaT
|
||||
elif typ == "period":
|
||||
return Period(ordinal=obj["ordinal"], freq=obj["freq"])
|
||||
elif typ == "index":
|
||||
dtype = dtype_for(obj["dtype"])
|
||||
data = unconvert(obj["data"], dtype, obj.get("compress"))
|
||||
return Index(data, dtype=dtype, name=obj["name"])
|
||||
elif typ == "range_index":
|
||||
return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"])
|
||||
elif typ == "multi_index":
|
||||
dtype = dtype_for(obj["dtype"])
|
||||
data = unconvert(obj["data"], dtype, obj.get("compress"))
|
||||
data = [tuple(x) for x in data]
|
||||
return MultiIndex.from_tuples(data, names=obj["names"])
|
||||
elif typ == "period_index":
|
||||
data = unconvert(obj["data"], np.int64, obj.get("compress"))
|
||||
d = dict(name=obj["name"], freq=obj["freq"])
|
||||
freq = d.pop("freq", None)
|
||||
return PeriodIndex(PeriodArray(data, freq), **d)
|
||||
|
||||
elif typ == "datetime_index":
|
||||
data = unconvert(obj["data"], np.int64, obj.get("compress"))
|
||||
d = dict(name=obj["name"], freq=obj["freq"])
|
||||
result = DatetimeIndex(data, **d)
|
||||
tz = obj["tz"]
|
||||
|
||||
# reverse tz conversion
|
||||
if tz is not None:
|
||||
result = result.tz_localize("UTC").tz_convert(tz)
|
||||
return result
|
||||
|
||||
elif typ in ("interval_index", "interval_array"):
|
||||
return globals()[obj["klass"]].from_arrays(
|
||||
obj["left"], obj["right"], obj["closed"], name=obj["name"]
|
||||
)
|
||||
elif typ == "category":
|
||||
from_codes = globals()[obj["klass"]].from_codes
|
||||
return from_codes(
|
||||
codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"]
|
||||
)
|
||||
|
||||
elif typ == "interval":
|
||||
return Interval(obj["left"], obj["right"], obj["closed"])
|
||||
elif typ == "series":
|
||||
dtype = dtype_for(obj["dtype"])
|
||||
index = obj["index"]
|
||||
data = unconvert(obj["data"], dtype, obj["compress"])
|
||||
return Series(data, index=index, dtype=dtype, name=obj["name"])
|
||||
|
||||
elif typ == "block_manager":
|
||||
axes = obj["axes"]
|
||||
|
||||
def create_block(b):
|
||||
values = _safe_reshape(
|
||||
unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"]
|
||||
)
|
||||
|
||||
# locs handles duplicate column names, and should be used instead
|
||||
# of items; see GH 9618
|
||||
if "locs" in b:
|
||||
placement = b["locs"]
|
||||
else:
|
||||
placement = axes[0].get_indexer(b["items"])
|
||||
|
||||
if is_datetime64tz_dtype(b["dtype"]):
|
||||
assert isinstance(values, np.ndarray), type(values)
|
||||
assert values.dtype == "M8[ns]", values.dtype
|
||||
values = DatetimeArray(values, dtype=b["dtype"])
|
||||
|
||||
return make_block(
|
||||
values=values,
|
||||
klass=getattr(internals, b["klass"]),
|
||||
placement=placement,
|
||||
dtype=b["dtype"],
|
||||
)
|
||||
|
||||
blocks = [create_block(b) for b in obj["blocks"]]
|
||||
return globals()[obj["klass"]](BlockManager(blocks, axes))
|
||||
elif typ == "datetime":
|
||||
return parse(obj["data"])
|
||||
elif typ == "datetime64":
|
||||
return np.datetime64(parse(obj["data"]))
|
||||
elif typ == "date":
|
||||
return parse(obj["data"]).date()
|
||||
elif typ == "timedelta":
|
||||
return timedelta(*obj["data"])
|
||||
elif typ == "timedelta64":
|
||||
return np.timedelta64(int(obj["data"]))
|
||||
# elif typ == 'sparse_series':
|
||||
# dtype = dtype_for(obj['dtype'])
|
||||
# return SparseSeries(
|
||||
# unconvert(obj['sp_values'], dtype, obj['compress']),
|
||||
# sparse_index=obj['sp_index'], index=obj['index'],
|
||||
# fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
|
||||
# elif typ == 'sparse_dataframe':
|
||||
# return SparseDataFrame(
|
||||
# obj['data'], columns=obj['columns'],
|
||||
# default_fill_value=obj['default_fill_value'],
|
||||
# default_kind=obj['default_kind']
|
||||
# )
|
||||
elif typ == "block_index":
|
||||
return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"])
|
||||
elif typ == "int_index":
|
||||
return globals()[obj["klass"]](obj["length"], obj["indices"])
|
||||
elif typ == "ndarray":
|
||||
return unconvert(
|
||||
obj["data"], np.typeDict[obj["dtype"]], obj.get("compress")
|
||||
).reshape(obj["shape"])
|
||||
elif typ == "np_scalar":
|
||||
if obj.get("sub_typ") == "np_complex":
|
||||
return c2f(obj["real"], obj["imag"], obj["dtype"])
|
||||
else:
|
||||
dtype = dtype_for(obj["dtype"])
|
||||
try:
|
||||
return dtype(obj["data"])
|
||||
except (ValueError, TypeError):
|
||||
return dtype.type(obj["data"])
|
||||
elif typ == "np_complex":
|
||||
return complex(obj["real"] + "+" + obj["imag"] + "j")
|
||||
elif isinstance(obj, (dict, list, set)):
|
||||
return obj
|
||||
else:
|
||||
return obj
|
||||
|
||||
|
||||
def pack(
|
||||
o,
|
||||
default=encode,
|
||||
encoding="utf-8",
|
||||
unicode_errors="strict",
|
||||
use_single_float=False,
|
||||
autoreset=1,
|
||||
use_bin_type=1,
|
||||
):
|
||||
"""
|
||||
Pack an object and return the packed bytes.
|
||||
"""
|
||||
|
||||
return Packer(
|
||||
default=default,
|
||||
encoding=encoding,
|
||||
unicode_errors=unicode_errors,
|
||||
use_single_float=use_single_float,
|
||||
autoreset=autoreset,
|
||||
use_bin_type=use_bin_type,
|
||||
).pack(o)
|
||||
|
||||
|
||||
def unpack(
|
||||
packed,
|
||||
object_hook=decode,
|
||||
list_hook=None,
|
||||
use_list=False,
|
||||
encoding="utf-8",
|
||||
unicode_errors="strict",
|
||||
object_pairs_hook=None,
|
||||
max_buffer_size=0,
|
||||
ext_hook=ExtType,
|
||||
):
|
||||
"""
|
||||
Unpack a packed object, return an iterator
|
||||
Note: packed lists will be returned as tuples
|
||||
"""
|
||||
|
||||
return Unpacker(
|
||||
packed,
|
||||
object_hook=object_hook,
|
||||
list_hook=list_hook,
|
||||
use_list=use_list,
|
||||
encoding=encoding,
|
||||
unicode_errors=unicode_errors,
|
||||
object_pairs_hook=object_pairs_hook,
|
||||
max_buffer_size=max_buffer_size,
|
||||
ext_hook=ext_hook,
|
||||
)
|
||||
|
||||
|
||||
class Packer(_Packer):
|
||||
def __init__(
|
||||
self,
|
||||
default=encode,
|
||||
encoding="utf-8",
|
||||
unicode_errors="strict",
|
||||
use_single_float=False,
|
||||
autoreset=1,
|
||||
use_bin_type=1,
|
||||
):
|
||||
super().__init__(
|
||||
default=default,
|
||||
encoding=encoding,
|
||||
unicode_errors=unicode_errors,
|
||||
use_single_float=use_single_float,
|
||||
autoreset=autoreset,
|
||||
use_bin_type=use_bin_type,
|
||||
)
|
||||
|
||||
|
||||
class Unpacker(_Unpacker):
|
||||
def __init__(
|
||||
self,
|
||||
file_like=None,
|
||||
read_size=0,
|
||||
use_list=False,
|
||||
object_hook=decode,
|
||||
object_pairs_hook=None,
|
||||
list_hook=None,
|
||||
encoding="utf-8",
|
||||
unicode_errors="strict",
|
||||
max_buffer_size=0,
|
||||
ext_hook=ExtType,
|
||||
):
|
||||
super().__init__(
|
||||
file_like=file_like,
|
||||
read_size=read_size,
|
||||
use_list=use_list,
|
||||
object_hook=object_hook,
|
||||
object_pairs_hook=object_pairs_hook,
|
||||
list_hook=list_hook,
|
||||
encoding=encoding,
|
||||
unicode_errors=unicode_errors,
|
||||
max_buffer_size=max_buffer_size,
|
||||
ext_hook=ext_hook,
|
||||
)
|
||||
|
||||
|
||||
class Iterator:
|
||||
|
||||
""" manage the unpacking iteration,
|
||||
close the file on completion """
|
||||
|
||||
def __init__(self, path, **kwargs):
|
||||
self.path = path
|
||||
self.kwargs = kwargs
|
||||
|
||||
def __iter__(self):
|
||||
|
||||
needs_closing = True
|
||||
try:
|
||||
|
||||
# see if we have an actual file
|
||||
if isinstance(self.path, str):
|
||||
|
||||
try:
|
||||
path_exists = os.path.exists(self.path)
|
||||
except TypeError:
|
||||
path_exists = False
|
||||
|
||||
if path_exists:
|
||||
fh = open(self.path, "rb")
|
||||
else:
|
||||
fh = BytesIO(self.path)
|
||||
|
||||
else:
|
||||
|
||||
if not hasattr(self.path, "read"):
|
||||
fh = BytesIO(self.path)
|
||||
|
||||
else:
|
||||
|
||||
# a file-like
|
||||
needs_closing = False
|
||||
fh = self.path
|
||||
|
||||
unpacker = unpack(fh)
|
||||
for o in unpacker:
|
||||
yield o
|
||||
finally:
|
||||
if needs_closing:
|
||||
fh.close()
|
294
venv/lib/python3.6/site-packages/pandas/io/parquet.py
Normal file
@@ -0,0 +1,294 @@
|
||||
""" parquet compat """
|
||||
|
||||
from warnings import catch_warnings
|
||||
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import AbstractMethodError
|
||||
|
||||
from pandas import DataFrame, get_option
|
||||
|
||||
from pandas.io.common import get_filepath_or_buffer, is_s3_url
|
||||
|
||||
|
||||
def get_engine(engine):
|
||||
""" return our implementation """
|
||||
|
||||
if engine == "auto":
|
||||
engine = get_option("io.parquet.engine")
|
||||
|
||||
if engine == "auto":
|
||||
# try engines in this order
|
||||
try:
|
||||
return PyArrowImpl()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
return FastParquetImpl()
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
raise ImportError(
|
||||
"Unable to find a usable engine; "
|
||||
"tried using: 'pyarrow', 'fastparquet'.\n"
|
||||
"pyarrow or fastparquet is required for parquet "
|
||||
"support"
|
||||
)
|
||||
|
||||
if engine not in ["pyarrow", "fastparquet"]:
|
||||
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
|
||||
|
||||
if engine == "pyarrow":
|
||||
return PyArrowImpl()
|
||||
elif engine == "fastparquet":
|
||||
return FastParquetImpl()
|
||||
|
||||
|
||||
class BaseImpl:
|
||||
|
||||
api = None # module
|
||||
|
||||
@staticmethod
|
||||
def validate_dataframe(df):
|
||||
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("to_parquet only supports IO with DataFrames")
|
||||
|
||||
# must have value column names (strings only)
|
||||
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
|
||||
raise ValueError("parquet must have string column names")
|
||||
|
||||
# index level names must be strings
|
||||
valid_names = all(
|
||||
isinstance(name, str) for name in df.index.names if name is not None
|
||||
)
|
||||
if not valid_names:
|
||||
raise ValueError("Index level names must be strings")
|
||||
|
||||
def write(self, df, path, compression, **kwargs):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def read(self, path, columns=None, **kwargs):
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
|
||||
class PyArrowImpl(BaseImpl):
|
||||
def __init__(self):
|
||||
pyarrow = import_optional_dependency(
|
||||
"pyarrow", extra="pyarrow is required for parquet support."
|
||||
)
|
||||
import pyarrow.parquet
|
||||
|
||||
self.api = pyarrow
|
||||
|
||||
def write(
|
||||
self,
|
||||
df,
|
||||
path,
|
||||
compression="snappy",
|
||||
coerce_timestamps="ms",
|
||||
index=None,
|
||||
partition_cols=None,
|
||||
**kwargs
|
||||
):
|
||||
self.validate_dataframe(df)
|
||||
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
|
||||
|
||||
if index is None:
|
||||
from_pandas_kwargs = {}
|
||||
else:
|
||||
from_pandas_kwargs = {"preserve_index": index}
|
||||
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
|
||||
if partition_cols is not None:
|
||||
self.api.parquet.write_to_dataset(
|
||||
table,
|
||||
path,
|
||||
compression=compression,
|
||||
coerce_timestamps=coerce_timestamps,
|
||||
partition_cols=partition_cols,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
self.api.parquet.write_table(
|
||||
table,
|
||||
path,
|
||||
compression=compression,
|
||||
coerce_timestamps=coerce_timestamps,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
def read(self, path, columns=None, **kwargs):
|
||||
path, _, _, should_close = get_filepath_or_buffer(path)
|
||||
|
||||
kwargs["use_pandas_metadata"] = True
|
||||
result = self.api.parquet.read_table(
|
||||
path, columns=columns, **kwargs
|
||||
).to_pandas()
|
||||
if should_close:
|
||||
try:
|
||||
path.close()
|
||||
except: # noqa: flake8
|
||||
pass
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class FastParquetImpl(BaseImpl):
|
||||
def __init__(self):
|
||||
# since pandas is a dependency of fastparquet
|
||||
# we need to import on first use
|
||||
fastparquet = import_optional_dependency(
|
||||
"fastparquet", extra="fastparquet is required for parquet support."
|
||||
)
|
||||
self.api = fastparquet
|
||||
|
||||
def write(
|
||||
self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs
|
||||
):
|
||||
self.validate_dataframe(df)
|
||||
# thriftpy/protocol/compact.py:339:
|
||||
# DeprecationWarning: tostring() is deprecated.
|
||||
# Use tobytes() instead.
|
||||
|
||||
if "partition_on" in kwargs and partition_cols is not None:
|
||||
raise ValueError(
|
||||
"Cannot use both partition_on and "
|
||||
"partition_cols. Use partition_cols for "
|
||||
"partitioning data"
|
||||
)
|
||||
elif "partition_on" in kwargs:
|
||||
partition_cols = kwargs.pop("partition_on")
|
||||
|
||||
if partition_cols is not None:
|
||||
kwargs["file_scheme"] = "hive"
|
||||
|
||||
if is_s3_url(path):
|
||||
# path is s3:// so we need to open the s3file in 'wb' mode.
|
||||
# TODO: Support 'ab'
|
||||
|
||||
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
|
||||
# And pass the opened s3file to the fastparquet internal impl.
|
||||
kwargs["open_with"] = lambda path, _: path
|
||||
else:
|
||||
path, _, _, _ = get_filepath_or_buffer(path)
|
||||
|
||||
with catch_warnings(record=True):
|
||||
self.api.write(
|
||||
path,
|
||||
df,
|
||||
compression=compression,
|
||||
write_index=index,
|
||||
partition_on=partition_cols,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
def read(self, path, columns=None, **kwargs):
|
||||
if is_s3_url(path):
|
||||
# When path is s3:// an S3File is returned.
|
||||
# We need to retain the original path (str) while also
# passing the S3File().open function to the fastparquet impl.
|
||||
s3, _, _, should_close = get_filepath_or_buffer(path)
|
||||
try:
|
||||
parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
|
||||
finally:
|
||||
s3.close()
|
||||
else:
|
||||
path, _, _, _ = get_filepath_or_buffer(path)
|
||||
parquet_file = self.api.ParquetFile(path)
|
||||
|
||||
return parquet_file.to_pandas(columns=columns, **kwargs)
|
||||
|
||||
|
||||
def to_parquet(
|
||||
df,
|
||||
path,
|
||||
engine="auto",
|
||||
compression="snappy",
|
||||
index=None,
|
||||
partition_cols=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Write a DataFrame to the parquet format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
File path or Root Directory path. Will be used as Root Directory path
|
||||
while writing a partitioned dataset.
|
||||
|
||||
.. versionchanged:: 0.24.0
|
||||
|
||||
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
|
||||
Parquet library to use. If 'auto', then the option
|
||||
``io.parquet.engine`` is used. The default ``io.parquet.engine``
|
||||
behavior is to try 'pyarrow', falling back to 'fastparquet' if
|
||||
'pyarrow' is unavailable.
|
||||
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
|
||||
Name of the compression to use. Use ``None`` for no compression.
|
||||
index : bool, default None
|
||||
If ``True``, include the dataframe's index(es) in the file output. If
|
||||
``False``, they will not be written to the file. If ``None``, the
|
||||
engine's default behavior will be used.
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
partition_cols : list, optional, default None
|
||||
Column names by which to partition the dataset
|
||||
Columns are partitioned in the order they are given
|
||||
|
||||
.. versionadded:: 0.24.0
|
||||
|
||||
kwargs
|
||||
Additional keyword arguments passed to the engine
|
||||
"""
|
||||
impl = get_engine(engine)
|
||||
return impl.write(
|
||||
df,
|
||||
path,
|
||||
compression=compression,
|
||||
index=index,
|
||||
partition_cols=partition_cols,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
def read_parquet(path, engine="auto", columns=None, **kwargs):
|
||||
"""
|
||||
Load a parquet object from the file path, returning a DataFrame.
|
||||
|
||||
.. versionadded:: 0.21.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object or file-like object
|
||||
Any valid string path is acceptable. The string could be a URL. Valid
|
||||
URL schemes include http, ftp, s3, and file. For file URLs, a host is
|
||||
expected. A local file could be:
|
||||
``file://localhost/path/to/table.parquet``.
|
||||
|
||||
If you want to pass in a path object, pandas accepts any
|
||||
``os.PathLike``.
|
||||
|
||||
By file-like object, we refer to objects with a ``read()`` method,
|
||||
such as a file handler (e.g. via builtin ``open`` function)
|
||||
or ``StringIO``.
|
||||
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
|
||||
Parquet library to use. If 'auto', then the option
|
||||
``io.parquet.engine`` is used. The default ``io.parquet.engine``
|
||||
behavior is to try 'pyarrow', falling back to 'fastparquet' if
|
||||
'pyarrow' is unavailable.
|
||||
columns : list, default=None
|
||||
If not None, only these columns will be read from the file.
|
||||
|
||||
.. versionadded:: 0.21.1
|
||||
**kwargs
|
||||
Any additional kwargs are passed to the engine.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
"""
|
||||
|
||||
impl = get_engine(engine)
|
||||
return impl.read(path, columns=columns, **kwargs)
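A hedged round trip for the public API wrapped above; it needs either pyarrow or fastparquet installed, and the file name is arbitrary.

# Illustrative sketch only -- engine='auto' tries pyarrow first, then fastparquet.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df.to_parquet("example.parquet", engine="auto", index=False)

restored = pd.read_parquet("example.parquet", columns=["a"])
print(restored)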
|
3708
venv/lib/python3.6/site-packages/pandas/io/parsers.py
Normal file
File diff suppressed because it is too large
173
venv/lib/python3.6/site-packages/pandas/io/pickle.py
Normal file
@@ -0,0 +1,173 @@
|
||||
""" pickle compat """
|
||||
from io import BytesIO
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
from numpy.lib.format import read_array
|
||||
|
||||
from pandas.compat import pickle_compat as pc
|
||||
|
||||
from pandas.io.common import _get_handle, _stringify_path
|
||||
|
||||
|
||||
def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
|
||||
"""
|
||||
Pickle (serialize) object to file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
obj : any object
|
||||
Any python object.
|
||||
path : str
|
||||
File path where the pickled object will be stored.
|
||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
|
||||
A string representing the compression to use in the output file. By
|
||||
default, infers from the file extension in specified path.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
protocol : int
|
||||
Int which indicates which protocol should be used by the pickler,
|
||||
default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
|
||||
values for this parameter depend on the version of Python. For Python
|
||||
2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
|
||||
For Python >= 3.4, 4 is a valid value. A negative value for the
|
||||
protocol parameter is equivalent to setting its value to
|
||||
HIGHEST_PROTOCOL.
|
||||
|
||||
.. [1] https://docs.python.org/3/library/pickle.html
|
||||
.. versionadded:: 0.21.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
read_pickle : Load pickled pandas object (or any object) from file.
|
||||
DataFrame.to_hdf : Write DataFrame to an HDF5 file.
|
||||
DataFrame.to_sql : Write DataFrame to a SQL database.
|
||||
DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
|
||||
>>> original_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl")
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
|
||||
>>> unpickled_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
|
||||
>>> import os
|
||||
>>> os.remove("./dummy.pkl")
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
f, fh = _get_handle(path, "wb", compression=compression, is_text=False)
|
||||
if protocol < 0:
|
||||
protocol = pickle.HIGHEST_PROTOCOL
|
||||
try:
|
||||
f.write(pickle.dumps(obj, protocol=protocol))
|
||||
finally:
|
||||
f.close()
|
||||
for _f in fh:
|
||||
_f.close()
|
||||
|
||||
|
||||
def read_pickle(path, compression="infer"):
|
||||
"""
|
||||
Load pickled pandas object (or any object) from file.
|
||||
|
||||
.. warning::
|
||||
|
||||
Loading pickled data received from untrusted sources can be
|
||||
unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
File path where the pickled object will be loaded.
|
||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
|
||||
For on-the-fly decompression of on-disk data. If 'infer', then use
|
||||
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
|
||||
or '.zip' respectively, and no decompression otherwise.
|
||||
Set to None for no decompression.
|
||||
|
||||
.. versionadded:: 0.20.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
unpickled : same type as object stored in file
|
||||
|
||||
See Also
|
||||
--------
|
||||
DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
|
||||
Series.to_pickle : Pickle (serialize) Series object to file.
|
||||
read_hdf : Read HDF5 file into a DataFrame.
|
||||
read_sql : Read SQL query or database table into a DataFrame.
|
||||
read_parquet : Load a parquet object, returning a DataFrame.
|
||||
|
||||
Notes
|
||||
-----
|
||||
read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
|
||||
>>> original_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
>>> pd.to_pickle(original_df, "./dummy.pkl")
|
||||
|
||||
>>> unpickled_df = pd.read_pickle("./dummy.pkl")
|
||||
>>> unpickled_df
|
||||
foo bar
|
||||
0 0 5
|
||||
1 1 6
|
||||
2 2 7
|
||||
3 3 8
|
||||
4 4 9
|
||||
|
||||
>>> import os
|
||||
>>> os.remove("./dummy.pkl")
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
f, fh = _get_handle(path, "rb", compression=compression, is_text=False)
|
||||
|
||||
# 1) try the standard library pickle
|
||||
# 2) try pickle_compat (older pandas version) to handle subclass changes
|
||||
# 3) try pickle_compat with latin1 encoding
|
||||
|
||||
try:
|
||||
with warnings.catch_warnings(record=True):
|
||||
# We want to silence any warnings about, e.g. moved modules.
|
||||
warnings.simplefilter("ignore", Warning)
|
||||
return pickle.load(f)
|
||||
except Exception: # noqa: E722
|
||||
try:
|
||||
return pc.load(f, encoding=None)
|
||||
except Exception: # noqa: E722
|
||||
return pc.load(f, encoding="latin1")
|
||||
finally:
|
||||
f.close()
|
||||
for _f in fh:
|
||||
_f.close()
|
||||
|
||||
|
||||
# compat with sparse pickle / unpickle
|
||||
|
||||
|
||||
def _unpickle_array(bytes):
|
||||
arr = read_array(BytesIO(bytes))
|
||||
|
||||
return arr
|
5066
venv/lib/python3.6/site-packages/pandas/io/pytables.py
Normal file
File diff suppressed because it is too large
37
venv/lib/python3.6/site-packages/pandas/io/s3.py
Normal file
@@ -0,0 +1,37 @@
""" s3 support for remote file interactivity """
from urllib.parse import urlparse as parse_url

from pandas.compat._optional import import_optional_dependency

s3fs = import_optional_dependency(
    "s3fs", extra="The s3fs package is required to handle s3 files."
)


def _strip_schema(url):
    """Return the url without the s3:// prefix."""
    result = parse_url(url, allow_fragments=False)
    return result.netloc + result.path


def get_filepath_or_buffer(
    filepath_or_buffer, encoding=None, compression=None, mode=None
):
    from botocore.exceptions import NoCredentialsError

    if mode is None:
        mode = "rb"

    fs = s3fs.S3FileSystem(anon=False)
    try:
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    except (FileNotFoundError, NoCredentialsError):
        # boto3 has trouble accessing a public file when credentials are
        # configured: an OSError is raised if you have credentials that are
        # not valid for that bucket, and a NoCredentialsError is raised if
        # you have no credentials for that bucket at all.
        # Fall back to an anonymous connection and try again.
        fs = s3fs.S3FileSystem(anon=True)
        filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
    return filepath_or_buffer, None, compression, True
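A hypothetical usage sketch of the helper above; the bucket and key are placeholders,
and s3fs plus botocore must be installed for this to run.

    buf, _, compression, should_close = get_filepath_or_buffer("s3://some-bucket/data.csv")
    try:
        head = buf.read(1024)  # raw bytes; the caller decides how to decode and parse
    finally:
        if should_close:
            buf.close()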
@@ -0,0 +1 @@
from .sasreader import read_sas  # noqa
Binary file not shown.
732
venv/lib/python3.6/site-packages/pandas/io/sas/sas7bdat.py
Normal file
@@ -0,0 +1,732 @@
|
||||
"""
|
||||
Read SAS7BDAT files
|
||||
|
||||
Based on code written by Jared Hobbs:
|
||||
https://bitbucket.org/jaredhobbs/sas7bdat
|
||||
|
||||
See also:
|
||||
https://github.com/BioStatMatt/sas7bdat
|
||||
|
||||
Partial documentation of the file format:
|
||||
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
|
||||
|
||||
Reference for binary data compression:
|
||||
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
|
||||
"""
|
||||
from datetime import datetime
|
||||
import struct
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas.io.common import BaseIterator, get_filepath_or_buffer
|
||||
from pandas.io.sas._sas import Parser
|
||||
import pandas.io.sas.sas_constants as const
|
||||
|
||||
|
||||
class _subheader_pointer:
|
||||
pass
|
||||
|
||||
|
||||
class _column:
|
||||
pass
|
||||
|
||||
|
||||
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
|
||||
class SAS7BDATReader(BaseIterator):
|
||||
"""
|
||||
Read SAS files in SAS7BDAT format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : path name or buffer
|
||||
Name of SAS file or file-like object pointing to SAS file
|
||||
contents.
|
||||
index : column identifier, defaults to None
|
||||
Column to use as index.
|
||||
convert_dates : boolean, defaults to True
|
||||
Attempt to convert dates to Pandas datetime values. Note that
|
||||
some rarely used SAS date formats may be unsupported.
|
||||
blank_missing : boolean, defaults to True
|
||||
Convert empty strings to missing values (SAS uses blanks to
|
||||
indicate missing character variables).
|
||||
chunksize : int, defaults to None
|
||||
Return SAS7BDATReader object for iterations, returns chunks
|
||||
with given number of lines.
|
||||
encoding : string, defaults to None
|
||||
String encoding.
|
||||
convert_text : bool, defaults to True
|
||||
If False, text variables are left as raw bytes.
|
||||
convert_header_text : bool, defaults to True
|
||||
If False, header text, including column names, are left as raw
|
||||
bytes.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_buf,
|
||||
index=None,
|
||||
convert_dates=True,
|
||||
blank_missing=True,
|
||||
chunksize=None,
|
||||
encoding=None,
|
||||
convert_text=True,
|
||||
convert_header_text=True,
|
||||
):
|
||||
|
||||
self.index = index
|
||||
self.convert_dates = convert_dates
|
||||
self.blank_missing = blank_missing
|
||||
self.chunksize = chunksize
|
||||
self.encoding = encoding
|
||||
self.convert_text = convert_text
|
||||
self.convert_header_text = convert_header_text
|
||||
|
||||
self.default_encoding = "latin-1"
|
||||
self.compression = ""
|
||||
self.column_names_strings = []
|
||||
self.column_names = []
|
||||
self.column_formats = []
|
||||
self.columns = []
|
||||
|
||||
self._current_page_data_subheader_pointers = []
|
||||
self._cached_page = None
|
||||
self._column_data_lengths = []
|
||||
self._column_data_offsets = []
|
||||
self._column_types = []
|
||||
|
||||
self._current_row_in_file_index = 0
|
||||
self._current_row_on_page_index = 0
|
||||
self._current_row_in_file_index = 0
|
||||
|
||||
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
|
||||
if isinstance(self._path_or_buf, str):
|
||||
self._path_or_buf = open(self._path_or_buf, "rb")
|
||||
self.handle = self._path_or_buf
|
||||
|
||||
self._get_properties()
|
||||
self._parse_metadata()
|
||||
|
||||
def column_data_lengths(self):
|
||||
"""Return a numpy int64 array of the column data lengths"""
|
||||
return np.asarray(self._column_data_lengths, dtype=np.int64)
|
||||
|
||||
def column_data_offsets(self):
|
||||
"""Return a numpy int64 array of the column offsets"""
|
||||
return np.asarray(self._column_data_offsets, dtype=np.int64)
|
||||
|
||||
def column_types(self):
|
||||
"""Returns a numpy character array of the column types:
|
||||
s (string) or d (double)"""
|
||||
return np.asarray(self._column_types, dtype=np.dtype("S1"))
|
||||
|
||||
def close(self):
|
||||
try:
|
||||
self.handle.close()
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
def _get_properties(self):
|
||||
|
||||
# Check magic number
|
||||
self._path_or_buf.seek(0)
|
||||
self._cached_page = self._path_or_buf.read(288)
|
||||
if self._cached_page[0 : len(const.magic)] != const.magic:
|
||||
self.close()
|
||||
raise ValueError("magic number mismatch (not a SAS file?)")
|
||||
|
||||
# Get alignment information
|
||||
align1, align2 = 0, 0
|
||||
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
|
||||
if buf == const.u64_byte_checker_value:
|
||||
align2 = const.align_2_value
|
||||
self.U64 = True
|
||||
self._int_length = 8
|
||||
self._page_bit_offset = const.page_bit_offset_x64
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x64
|
||||
else:
|
||||
self.U64 = False
|
||||
self._page_bit_offset = const.page_bit_offset_x86
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x86
|
||||
self._int_length = 4
|
||||
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
|
||||
if buf == const.align_1_checker_value:
|
||||
align1 = const.align_2_value
|
||||
total_align = align1 + align2
|
||||
|
||||
# Get endianness information
|
||||
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
|
||||
if buf == b"\x01":
|
||||
self.byte_order = "<"
|
||||
else:
|
||||
self.byte_order = ">"
|
||||
|
||||
# Get encoding information
|
||||
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
|
||||
if buf in const.encoding_names:
|
||||
self.file_encoding = const.encoding_names[buf]
|
||||
else:
|
||||
self.file_encoding = "unknown (code={name!s})".format(name=buf)
|
||||
|
||||
# Get platform information
|
||||
buf = self._read_bytes(const.platform_offset, const.platform_length)
|
||||
if buf == b"1":
|
||||
self.platform = "unix"
|
||||
elif buf == b"2":
|
||||
self.platform = "windows"
|
||||
else:
|
||||
self.platform = "unknown"
|
||||
|
||||
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
|
||||
self.name = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.name = self.name.decode(self.encoding or self.default_encoding)
|
||||
|
||||
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
|
||||
self.file_type = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.file_type = self.file_type.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
# Timestamp is epoch 01/01/1960
|
||||
epoch = datetime(1960, 1, 1)
|
||||
x = self._read_float(
|
||||
const.date_created_offset + align1, const.date_created_length
|
||||
)
|
||||
self.date_created = epoch + pd.to_timedelta(x, unit="s")
|
||||
x = self._read_float(
|
||||
const.date_modified_offset + align1, const.date_modified_length
|
||||
)
|
||||
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
|
||||
|
||||
self.header_length = self._read_int(
|
||||
const.header_size_offset + align1, const.header_size_length
|
||||
)
|
||||
|
||||
# Read the rest of the header into cached_page.
|
||||
buf = self._path_or_buf.read(self.header_length - 288)
|
||||
self._cached_page += buf
|
||||
if len(self._cached_page) != self.header_length:
|
||||
self.close()
|
||||
raise ValueError("The SAS7BDAT file appears to be truncated.")
|
||||
|
||||
self._page_length = self._read_int(
|
||||
const.page_size_offset + align1, const.page_size_length
|
||||
)
|
||||
self._page_count = self._read_int(
|
||||
const.page_count_offset + align1, const.page_count_length
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.sas_release_offset + total_align, const.sas_release_length
|
||||
)
|
||||
self.sas_release = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.sas_release = self.sas_release.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.sas_server_type_offset + total_align, const.sas_server_type_length
|
||||
)
|
||||
self.server_type = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.server_type = self.server_type.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.os_version_number_offset + total_align, const.os_version_number_length
|
||||
)
|
||||
self.os_version = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.os_version = self.os_version.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
|
||||
buf = buf.rstrip(b"\x00 ")
|
||||
if len(buf) > 0:
|
||||
self.os_name = buf.decode(self.encoding or self.default_encoding)
|
||||
else:
|
||||
buf = self._read_bytes(
|
||||
const.os_maker_offset + total_align, const.os_maker_length
|
||||
)
|
||||
self.os_name = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.os_name = self.os_name.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
def __next__(self):
|
||||
da = self.read(nrows=self.chunksize or 1)
|
||||
if da is None:
|
||||
raise StopIteration
|
||||
return da
|
||||
|
||||
# Read a single float of the given width (4 or 8).
|
||||
def _read_float(self, offset, width):
|
||||
if width not in (4, 8):
|
||||
self.close()
|
||||
raise ValueError("invalid float width")
|
||||
buf = self._read_bytes(offset, width)
|
||||
fd = "f" if width == 4 else "d"
|
||||
return struct.unpack(self.byte_order + fd, buf)[0]
|
||||
|
||||
# Read a single signed integer of the given width (1, 2, 4 or 8).
|
||||
def _read_int(self, offset, width):
|
||||
if width not in (1, 2, 4, 8):
|
||||
self.close()
|
||||
raise ValueError("invalid int width")
|
||||
buf = self._read_bytes(offset, width)
|
||||
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
|
||||
iv = struct.unpack(self.byte_order + it, buf)[0]
|
||||
return iv
|
||||
|
||||
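A note on the helpers above, illustrative only: the width-to-format mapping in _read_int
pairs with the byte-order prefix ("<" or ">") chosen in _get_properties. The byte values
below are made up.

    import struct

    assert struct.unpack("<h", b"\x01\x00")[0] == 1            # width 2, little-endian
    assert struct.unpack(">q", b"\x00" * 7 + b"\x02")[0] == 2  # width 8, big-endian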
def _read_bytes(self, offset, length):
|
||||
if self._cached_page is None:
|
||||
self._path_or_buf.seek(offset)
|
||||
buf = self._path_or_buf.read(length)
|
||||
if len(buf) < length:
|
||||
self.close()
|
||||
msg = "Unable to read {:d} bytes from file position {:d}."
|
||||
raise ValueError(msg.format(length, offset))
|
||||
return buf
|
||||
else:
|
||||
if offset + length > len(self._cached_page):
|
||||
self.close()
|
||||
raise ValueError("The cached page is too small.")
|
||||
return self._cached_page[offset : offset + length]
|
||||
|
||||
def _parse_metadata(self):
|
||||
done = False
|
||||
while not done:
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
break
|
||||
if len(self._cached_page) != self._page_length:
|
||||
self.close()
|
||||
raise ValueError("Failed to read a meta data page from the SAS file.")
|
||||
done = self._process_page_meta()
|
||||
|
||||
def _process_page_meta(self):
|
||||
self._read_page_header()
|
||||
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
|
||||
if self._current_page_type in pt:
|
||||
self._process_page_metadata()
|
||||
is_data_page = self._current_page_type & const.page_data_type
|
||||
is_mix_page = self._current_page_type in const.page_mix_types
|
||||
return (
|
||||
is_data_page
|
||||
or is_mix_page
|
||||
or self._current_page_data_subheader_pointers != []
|
||||
)
|
||||
|
||||
def _read_page_header(self):
|
||||
bit_offset = self._page_bit_offset
|
||||
tx = const.page_type_offset + bit_offset
|
||||
self._current_page_type = self._read_int(tx, const.page_type_length)
|
||||
tx = const.block_count_offset + bit_offset
|
||||
self._current_page_block_count = self._read_int(tx, const.block_count_length)
|
||||
tx = const.subheader_count_offset + bit_offset
|
||||
self._current_page_subheaders_count = self._read_int(
|
||||
tx, const.subheader_count_length
|
||||
)
|
||||
|
||||
def _process_page_metadata(self):
|
||||
bit_offset = self._page_bit_offset
|
||||
|
||||
for i in range(self._current_page_subheaders_count):
|
||||
pointer = self._process_subheader_pointers(
|
||||
const.subheader_pointers_offset + bit_offset, i
|
||||
)
|
||||
if pointer.length == 0:
|
||||
continue
|
||||
if pointer.compression == const.truncated_subheader_id:
|
||||
continue
|
||||
subheader_signature = self._read_subheader_signature(pointer.offset)
|
||||
subheader_index = self._get_subheader_index(
|
||||
subheader_signature, pointer.compression, pointer.ptype
|
||||
)
|
||||
self._process_subheader(subheader_index, pointer)
|
||||
|
||||
def _get_subheader_index(self, signature, compression, ptype):
|
||||
index = const.subheader_signature_to_index.get(signature)
|
||||
if index is None:
|
||||
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
|
||||
f2 = ptype == const.compressed_subheader_type
|
||||
if (self.compression != "") and f1 and f2:
|
||||
index = const.SASIndex.data_subheader_index
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("Unknown subheader signature")
|
||||
return index
|
||||
|
||||
def _process_subheader_pointers(self, offset, subheader_pointer_index):
|
||||
|
||||
subheader_pointer_length = self._subheader_pointer_length
|
||||
total_offset = offset + subheader_pointer_length * subheader_pointer_index
|
||||
|
||||
subheader_offset = self._read_int(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_length = self._read_int(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_compression = self._read_int(total_offset, 1)
|
||||
total_offset += 1
|
||||
|
||||
subheader_type = self._read_int(total_offset, 1)
|
||||
|
||||
x = _subheader_pointer()
|
||||
x.offset = subheader_offset
|
||||
x.length = subheader_length
|
||||
x.compression = subheader_compression
|
||||
x.ptype = subheader_type
|
||||
|
||||
return x
|
||||
|
||||
def _read_subheader_signature(self, offset):
|
||||
subheader_signature = self._read_bytes(offset, self._int_length)
|
||||
return subheader_signature
|
||||
|
||||
def _process_subheader(self, subheader_index, pointer):
|
||||
offset = pointer.offset
|
||||
length = pointer.length
|
||||
|
||||
if subheader_index == const.SASIndex.row_size_index:
|
||||
processor = self._process_rowsize_subheader
|
||||
elif subheader_index == const.SASIndex.column_size_index:
|
||||
processor = self._process_columnsize_subheader
|
||||
elif subheader_index == const.SASIndex.column_text_index:
|
||||
processor = self._process_columntext_subheader
|
||||
elif subheader_index == const.SASIndex.column_name_index:
|
||||
processor = self._process_columnname_subheader
|
||||
elif subheader_index == const.SASIndex.column_attributes_index:
|
||||
processor = self._process_columnattributes_subheader
|
||||
elif subheader_index == const.SASIndex.format_and_label_index:
|
||||
processor = self._process_format_subheader
|
||||
elif subheader_index == const.SASIndex.column_list_index:
|
||||
processor = self._process_columnlist_subheader
|
||||
elif subheader_index == const.SASIndex.subheader_counts_index:
|
||||
processor = self._process_subheader_counts
|
||||
elif subheader_index == const.SASIndex.data_subheader_index:
|
||||
self._current_page_data_subheader_pointers.append(pointer)
|
||||
return
|
||||
else:
|
||||
raise ValueError("unknown subheader index")
|
||||
|
||||
processor(offset, length)
|
||||
|
||||
def _process_rowsize_subheader(self, offset, length):
|
||||
|
||||
int_len = self._int_length
|
||||
lcs_offset = offset
|
||||
lcp_offset = offset
|
||||
if self.U64:
|
||||
lcs_offset += 682
|
||||
lcp_offset += 706
|
||||
else:
|
||||
lcs_offset += 354
|
||||
lcp_offset += 378
|
||||
|
||||
self.row_length = self._read_int(
|
||||
offset + const.row_length_offset_multiplier * int_len, int_len
|
||||
)
|
||||
self.row_count = self._read_int(
|
||||
offset + const.row_count_offset_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p1 = self._read_int(
|
||||
offset + const.col_count_p1_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p2 = self._read_int(
|
||||
offset + const.col_count_p2_multiplier * int_len, int_len
|
||||
)
|
||||
mx = const.row_count_on_mix_page_offset_multiplier * int_len
|
||||
self._mix_page_row_count = self._read_int(offset + mx, int_len)
|
||||
self._lcs = self._read_int(lcs_offset, 2)
|
||||
self._lcp = self._read_int(lcp_offset, 2)
|
||||
|
||||
def _process_columnsize_subheader(self, offset, length):
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
self.column_count = self._read_int(offset, int_len)
|
||||
if self.col_count_p1 + self.col_count_p2 != self.column_count:
|
||||
print(
|
||||
"Warning: column count mismatch ({p1} + {p2} != "
|
||||
"{column_count})\n".format(
|
||||
p1=self.col_count_p1,
|
||||
p2=self.col_count_p2,
|
||||
column_count=self.column_count,
|
||||
)
|
||||
)
|
||||
|
||||
# Unknown purpose
|
||||
def _process_subheader_counts(self, offset, length):
|
||||
pass
|
||||
|
||||
def _process_columntext_subheader(self, offset, length):
|
||||
|
||||
offset += self._int_length
|
||||
text_block_size = self._read_int(offset, const.text_block_size_length)
|
||||
|
||||
buf = self._read_bytes(offset, text_block_size)
|
||||
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
|
||||
cname = cname_raw
|
||||
if self.convert_header_text:
|
||||
cname = cname.decode(self.encoding or self.default_encoding)
|
||||
self.column_names_strings.append(cname)
|
||||
|
||||
if len(self.column_names_strings) == 1:
|
||||
compression_literal = ""
|
||||
for cl in const.compression_literals:
|
||||
if cl in cname_raw:
|
||||
compression_literal = cl
|
||||
self.compression = compression_literal
|
||||
offset -= self._int_length
|
||||
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
compression_literal = buf.rstrip(b"\x00")
|
||||
if compression_literal == "":
|
||||
self._lcs = 0
|
||||
offset1 = offset + 32
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif compression_literal == const.rle_compression:
|
||||
offset1 = offset + 40
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif self._lcs > 0:
|
||||
self._lcp = 0
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcs)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
if self.convert_header_text:
|
||||
if hasattr(self, "creator_proc"):
|
||||
self.creator_proc = self.creator_proc.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
def _process_columnname_subheader(self, offset, length):
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
column_name_pointers_count = (length - 2 * int_len - 12) // 8
|
||||
for i in range(column_name_pointers_count):
|
||||
text_subheader = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_text_subheader_offset
|
||||
)
|
||||
col_name_offset = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_offset_offset
|
||||
)
|
||||
col_name_length = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_length_offset
|
||||
)
|
||||
|
||||
idx = self._read_int(
|
||||
text_subheader, const.column_name_text_subheader_length
|
||||
)
|
||||
col_offset = self._read_int(
|
||||
col_name_offset, const.column_name_offset_length
|
||||
)
|
||||
col_len = self._read_int(col_name_length, const.column_name_length_length)
|
||||
|
||||
name_str = self.column_names_strings[idx]
|
||||
self.column_names.append(name_str[col_offset : col_offset + col_len])
|
||||
|
||||
def _process_columnattributes_subheader(self, offset, length):
|
||||
int_len = self._int_length
|
||||
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
|
||||
for i in range(column_attributes_vectors_count):
|
||||
col_data_offset = (
|
||||
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
|
||||
)
|
||||
col_data_len = (
|
||||
offset
|
||||
+ 2 * int_len
|
||||
+ const.column_data_length_offset
|
||||
+ i * (int_len + 8)
|
||||
)
|
||||
col_types = (
|
||||
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
|
||||
)
|
||||
|
||||
x = self._read_int(col_data_offset, int_len)
|
||||
self._column_data_offsets.append(x)
|
||||
|
||||
x = self._read_int(col_data_len, const.column_data_length_length)
|
||||
self._column_data_lengths.append(x)
|
||||
|
||||
x = self._read_int(col_types, const.column_type_length)
|
||||
self._column_types.append(b"d" if x == 1 else b"s")
|
||||
|
||||
def _process_columnlist_subheader(self, offset, length):
|
||||
# unknown purpose
|
||||
pass
|
||||
|
||||
def _process_format_subheader(self, offset, length):
|
||||
int_len = self._int_length
|
||||
text_subheader_format = (
|
||||
offset + const.column_format_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
|
||||
col_format_len = offset + const.column_format_length_offset + 3 * int_len
|
||||
text_subheader_label = (
|
||||
offset + const.column_label_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
|
||||
col_label_len = offset + const.column_label_length_offset + 3 * int_len
|
||||
|
||||
x = self._read_int(
|
||||
text_subheader_format, const.column_format_text_subheader_index_length
|
||||
)
|
||||
format_idx = min(x, len(self.column_names_strings) - 1)
|
||||
|
||||
format_start = self._read_int(
|
||||
col_format_offset, const.column_format_offset_length
|
||||
)
|
||||
format_len = self._read_int(col_format_len, const.column_format_length_length)
|
||||
|
||||
label_idx = self._read_int(
|
||||
text_subheader_label, const.column_label_text_subheader_index_length
|
||||
)
|
||||
label_idx = min(label_idx, len(self.column_names_strings) - 1)
|
||||
|
||||
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
|
||||
label_len = self._read_int(col_label_len, const.column_label_length_length)
|
||||
|
||||
label_names = self.column_names_strings[label_idx]
|
||||
column_label = label_names[label_start : label_start + label_len]
|
||||
format_names = self.column_names_strings[format_idx]
|
||||
column_format = format_names[format_start : format_start + format_len]
|
||||
current_column_number = len(self.columns)
|
||||
|
||||
col = _column()
|
||||
col.col_id = current_column_number
|
||||
col.name = self.column_names[current_column_number]
|
||||
col.label = column_label
|
||||
col.format = column_format
|
||||
col.ctype = self._column_types[current_column_number]
|
||||
col.length = self._column_data_lengths[current_column_number]
|
||||
|
||||
self.column_formats.append(column_format)
|
||||
self.columns.append(col)
|
||||
|
||||
def read(self, nrows=None):
|
||||
|
||||
if (nrows is None) and (self.chunksize is not None):
|
||||
nrows = self.chunksize
|
||||
elif nrows is None:
|
||||
nrows = self.row_count
|
||||
|
||||
if len(self._column_types) == 0:
|
||||
self.close()
|
||||
raise EmptyDataError("No columns to parse from file")
|
||||
|
||||
if self._current_row_in_file_index >= self.row_count:
|
||||
return None
|
||||
|
||||
m = self.row_count - self._current_row_in_file_index
|
||||
if nrows > m:
|
||||
nrows = m
|
||||
|
||||
nd = self._column_types.count(b"d")
|
||||
ns = self._column_types.count(b"s")
|
||||
|
||||
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
|
||||
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
|
||||
|
||||
self._current_row_in_chunk_index = 0
|
||||
p = Parser(self)
|
||||
p.read(nrows)
|
||||
|
||||
rslt = self._chunk_to_dataframe()
|
||||
if self.index is not None:
|
||||
rslt = rslt.set_index(self.index)
|
||||
|
||||
return rslt
|
||||
|
||||
def _read_next_page(self):
|
||||
self._current_page_data_subheader_pointers = []
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
return True
|
||||
elif len(self._cached_page) != self._page_length:
|
||||
self.close()
|
||||
msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)"
|
||||
raise ValueError(msg.format(len(self._cached_page), self._page_length))
|
||||
|
||||
self._read_page_header()
|
||||
page_type = self._current_page_type
|
||||
if page_type == const.page_meta_type:
|
||||
self._process_page_metadata()
|
||||
|
||||
is_data_page = page_type & const.page_data_type
|
||||
pt = [const.page_meta_type] + const.page_mix_types
|
||||
if not is_data_page and self._current_page_type not in pt:
|
||||
return self._read_next_page()
|
||||
|
||||
return False
|
||||
|
||||
def _chunk_to_dataframe(self):
|
||||
|
||||
n = self._current_row_in_chunk_index
|
||||
m = self._current_row_in_file_index
|
||||
ix = range(m - n, m)
|
||||
rslt = pd.DataFrame(index=ix)
|
||||
|
||||
js, jb = 0, 0
|
||||
for j in range(self.column_count):
|
||||
|
||||
name = self.column_names[j]
|
||||
|
||||
if self._column_types[j] == b"d":
|
||||
rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
|
||||
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
|
||||
if self.convert_dates:
|
||||
unit = None
|
||||
if self.column_formats[j] in const.sas_date_formats:
|
||||
unit = "d"
|
||||
elif self.column_formats[j] in const.sas_datetime_formats:
|
||||
unit = "s"
|
||||
if unit:
|
||||
rslt[name] = pd.to_datetime(
|
||||
rslt[name], unit=unit, origin="1960-01-01"
|
||||
)
|
||||
jb += 1
|
||||
elif self._column_types[j] == b"s":
|
||||
rslt[name] = self._string_chunk[js, :]
|
||||
if self.convert_text and (self.encoding is not None):
|
||||
rslt[name] = rslt[name].str.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
if self.blank_missing:
|
||||
ii = rslt[name].str.len() == 0
|
||||
rslt.loc[ii, name] = np.nan
|
||||
js += 1
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(
|
||||
"unknown column type {type}".format(type=self._column_types[j])
|
||||
)
|
||||
|
||||
return rslt
|
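An aside on the date handling in _chunk_to_dataframe above: SAS stores datetimes as
seconds (and dates as days) since 1960-01-01, which is why to_datetime is called with
origin="1960-01-01". A small sketch of that conversion, with made-up values:

    import pandas as pd

    # 0 seconds and one day's worth of seconds after the SAS epoch
    pd.to_datetime([0, 86400], unit="s", origin="1960-01-01")
    # DatetimeIndex(['1960-01-01', '1960-01-02'], dtype='datetime64[ns]', freq=None)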
253
venv/lib/python3.6/site-packages/pandas/io/sas/sas_constants.py
Normal file
@@ -0,0 +1,253 @@
|
||||
magic = (
|
||||
b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||
+ b"\x00\x00\x00\x00\xc2\xea\x81\x60"
|
||||
+ b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
|
||||
+ b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
|
||||
)
|
||||
|
||||
align_1_checker_value = b"3"
|
||||
align_1_offset = 32
|
||||
align_1_length = 1
|
||||
align_1_value = 4
|
||||
u64_byte_checker_value = b"3"
|
||||
align_2_offset = 35
|
||||
align_2_length = 1
|
||||
align_2_value = 4
|
||||
endianness_offset = 37
|
||||
endianness_length = 1
|
||||
platform_offset = 39
|
||||
platform_length = 1
|
||||
encoding_offset = 70
|
||||
encoding_length = 1
|
||||
dataset_offset = 92
|
||||
dataset_length = 64
|
||||
file_type_offset = 156
|
||||
file_type_length = 8
|
||||
date_created_offset = 164
|
||||
date_created_length = 8
|
||||
date_modified_offset = 172
|
||||
date_modified_length = 8
|
||||
header_size_offset = 196
|
||||
header_size_length = 4
|
||||
page_size_offset = 200
|
||||
page_size_length = 4
|
||||
page_count_offset = 204
|
||||
page_count_length = 4
|
||||
sas_release_offset = 216
|
||||
sas_release_length = 8
|
||||
sas_server_type_offset = 224
|
||||
sas_server_type_length = 16
|
||||
os_version_number_offset = 240
|
||||
os_version_number_length = 16
|
||||
os_maker_offset = 256
|
||||
os_maker_length = 16
|
||||
os_name_offset = 272
|
||||
os_name_length = 16
|
||||
page_bit_offset_x86 = 16
|
||||
page_bit_offset_x64 = 32
|
||||
subheader_pointer_length_x86 = 12
|
||||
subheader_pointer_length_x64 = 24
|
||||
page_type_offset = 0
|
||||
page_type_length = 2
|
||||
block_count_offset = 2
|
||||
block_count_length = 2
|
||||
subheader_count_offset = 4
|
||||
subheader_count_length = 2
|
||||
page_meta_type = 0
|
||||
page_data_type = 256
|
||||
page_amd_type = 1024
|
||||
page_metc_type = 16384
|
||||
page_comp_type = -28672
|
||||
page_mix_types = [512, 640]
|
||||
subheader_pointers_offset = 8
|
||||
truncated_subheader_id = 1
|
||||
compressed_subheader_id = 4
|
||||
compressed_subheader_type = 1
|
||||
text_block_size_length = 2
|
||||
row_length_offset_multiplier = 5
|
||||
row_count_offset_multiplier = 6
|
||||
col_count_p1_multiplier = 9
|
||||
col_count_p2_multiplier = 10
|
||||
row_count_on_mix_page_offset_multiplier = 15
|
||||
column_name_pointer_length = 8
|
||||
column_name_text_subheader_offset = 0
|
||||
column_name_text_subheader_length = 2
|
||||
column_name_offset_offset = 2
|
||||
column_name_offset_length = 2
|
||||
column_name_length_offset = 4
|
||||
column_name_length_length = 2
|
||||
column_data_offset_offset = 8
|
||||
column_data_length_offset = 8
|
||||
column_data_length_length = 4
|
||||
column_type_offset = 14
|
||||
column_type_length = 1
|
||||
column_format_text_subheader_index_offset = 22
|
||||
column_format_text_subheader_index_length = 2
|
||||
column_format_offset_offset = 24
|
||||
column_format_offset_length = 2
|
||||
column_format_length_offset = 26
|
||||
column_format_length_length = 2
|
||||
column_label_text_subheader_index_offset = 28
|
||||
column_label_text_subheader_index_length = 2
|
||||
column_label_offset_offset = 30
|
||||
column_label_offset_length = 2
|
||||
column_label_length_offset = 32
|
||||
column_label_length_length = 2
|
||||
rle_compression = b"SASYZCRL"
|
||||
rdc_compression = b"SASYZCR2"
|
||||
|
||||
compression_literals = [rle_compression, rdc_compression]
|
||||
|
||||
# Incomplete list of encodings, using SAS nomenclature:
|
||||
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
|
||||
encoding_names = {
|
||||
29: "latin1",
|
||||
20: "utf-8",
|
||||
33: "cyrillic",
|
||||
60: "wlatin2",
|
||||
61: "wcyrillic",
|
||||
62: "wlatin1",
|
||||
90: "ebcdic870",
|
||||
}
|
||||
|
||||
|
||||
class SASIndex:
|
||||
row_size_index = 0
|
||||
column_size_index = 1
|
||||
subheader_counts_index = 2
|
||||
column_text_index = 3
|
||||
column_name_index = 4
|
||||
column_attributes_index = 5
|
||||
format_and_label_index = 6
|
||||
column_list_index = 7
|
||||
data_subheader_index = 8
|
||||
|
||||
|
||||
subheader_signature_to_index = {
|
||||
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
|
||||
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
|
||||
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
}
|
||||
|
||||
|
||||
# List of frequently used SAS date and datetime formats
|
||||
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
|
||||
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
|
||||
sas_date_formats = (
|
||||
"DATE",
|
||||
"DAY",
|
||||
"DDMMYY",
|
||||
"DOWNAME",
|
||||
"JULDAY",
|
||||
"JULIAN",
|
||||
"MMDDYY",
|
||||
"MMYY",
|
||||
"MMYYC",
|
||||
"MMYYD",
|
||||
"MMYYP",
|
||||
"MMYYS",
|
||||
"MMYYN",
|
||||
"MONNAME",
|
||||
"MONTH",
|
||||
"MONYY",
|
||||
"QTR",
|
||||
"QTRR",
|
||||
"NENGO",
|
||||
"WEEKDATE",
|
||||
"WEEKDATX",
|
||||
"WEEKDAY",
|
||||
"WEEKV",
|
||||
"WORDDATE",
|
||||
"WORDDATX",
|
||||
"YEAR",
|
||||
"YYMM",
|
||||
"YYMMC",
|
||||
"YYMMD",
|
||||
"YYMMP",
|
||||
"YYMMS",
|
||||
"YYMMN",
|
||||
"YYMON",
|
||||
"YYMMDD",
|
||||
"YYQ",
|
||||
"YYQC",
|
||||
"YYQD",
|
||||
"YYQP",
|
||||
"YYQS",
|
||||
"YYQN",
|
||||
"YYQR",
|
||||
"YYQRC",
|
||||
"YYQRD",
|
||||
"YYQRP",
|
||||
"YYQRS",
|
||||
"YYQRN",
|
||||
"YYMMDDP",
|
||||
"YYMMDDC",
|
||||
"E8601DA",
|
||||
"YYMMDDN",
|
||||
"MMDDYYC",
|
||||
"MMDDYYS",
|
||||
"MMDDYYD",
|
||||
"YYMMDDS",
|
||||
"B8601DA",
|
||||
"DDMMYYN",
|
||||
"YYMMDDD",
|
||||
"DDMMYYB",
|
||||
"DDMMYYP",
|
||||
"MMDDYYP",
|
||||
"YYMMDDB",
|
||||
"MMDDYYN",
|
||||
"DDMMYYC",
|
||||
"DDMMYYD",
|
||||
"DDMMYYS",
|
||||
"MINGUO",
|
||||
)
|
||||
|
||||
sas_datetime_formats = (
|
||||
"DATETIME",
|
||||
"DTWKDATX",
|
||||
"B8601DN",
|
||||
"B8601DT",
|
||||
"B8601DX",
|
||||
"B8601DZ",
|
||||
"B8601LX",
|
||||
"E8601DN",
|
||||
"E8601DT",
|
||||
"E8601DX",
|
||||
"E8601DZ",
|
||||
"E8601LX",
|
||||
"DATEAMPM",
|
||||
"DTDATE",
|
||||
"DTMONYY",
|
||||
"DTMONYY",
|
||||
"DTWKDATX",
|
||||
"DTYEAR",
|
||||
"TOD",
|
||||
"MDYAMPM",
|
||||
)
|
507
venv/lib/python3.6/site-packages/pandas/io/sas/sas_xport.py
Normal file
@@ -0,0 +1,507 @@
|
||||
"""
|
||||
Read a SAS XPort format file into a Pandas DataFrame.
|
||||
|
||||
Based on code from Jack Cushman (github.com/jcushman/xport).
|
||||
|
||||
The file format is defined here:
|
||||
|
||||
https://support.sas.com/techsup/technote/ts140.pdf
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
import struct
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas.util._decorators import Appender
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas.io.common import BaseIterator, get_filepath_or_buffer
|
||||
|
||||
_correct_line1 = (
|
||||
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_header1 = (
|
||||
"HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" "000000000000000001600000000"
|
||||
)
|
||||
_correct_header2 = (
|
||||
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_obs_header = (
|
||||
"HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_fieldkeys = [
|
||||
"ntype",
|
||||
"nhfun",
|
||||
"field_length",
|
||||
"nvar0",
|
||||
"name",
|
||||
"label",
|
||||
"nform",
|
||||
"nfl",
|
||||
"num_decimals",
|
||||
"nfj",
|
||||
"nfill",
|
||||
"niform",
|
||||
"nifl",
|
||||
"nifd",
|
||||
"npos",
|
||||
"_",
|
||||
]
|
||||
|
||||
|
||||
_base_params_doc = """\
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : string or file-like object
|
||||
Path to SAS file or object implementing binary read method."""
|
||||
|
||||
_params2_doc = """\
|
||||
index : identifier of index column
|
||||
Identifier of column that should be used as index of the DataFrame.
|
||||
encoding : string
|
||||
Encoding for text data.
|
||||
chunksize : int
|
||||
Read file `chunksize` lines at a time, returns iterator."""
|
||||
|
||||
_format_params_doc = """\
|
||||
format : string
|
||||
File format, only `xport` is currently supported."""
|
||||
|
||||
_iterator_doc = """\
|
||||
iterator : boolean, default False
|
||||
Return XportReader object for reading file incrementally."""
|
||||
|
||||
|
||||
_read_sas_doc = """Read a SAS file into a DataFrame.
|
||||
|
||||
%(_base_params_doc)s
|
||||
%(_format_params_doc)s
|
||||
%(_params2_doc)s
|
||||
%(_iterator_doc)s
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or XportReader
|
||||
|
||||
Examples
|
||||
--------
|
||||
Read a SAS Xport file:
|
||||
|
||||
>>> df = pd.read_sas('filename.XPT')
|
||||
|
||||
Read an Xport file in 10,000 line chunks:
|
||||
|
||||
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
|
||||
>>> for chunk in itr:
|
||||
>>> do_something(chunk)
|
||||
|
||||
""" % {
|
||||
"_base_params_doc": _base_params_doc,
|
||||
"_format_params_doc": _format_params_doc,
|
||||
"_params2_doc": _params2_doc,
|
||||
"_iterator_doc": _iterator_doc,
|
||||
}
|
||||
|
||||
|
||||
_xport_reader_doc = """\
|
||||
Class for reading SAS Xport files.
|
||||
|
||||
%(_base_params_doc)s
|
||||
%(_params2_doc)s
|
||||
|
||||
Attributes
|
||||
----------
|
||||
member_info : list
|
||||
Contains information about the file
|
||||
fields : list
|
||||
Contains information about the variables in the file
|
||||
""" % {
|
||||
"_base_params_doc": _base_params_doc,
|
||||
"_params2_doc": _params2_doc,
|
||||
}
|
||||
|
||||
|
||||
_read_method_doc = """\
|
||||
Read observations from SAS Xport file, returning as data frame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nrows : int
|
||||
Number of rows to read from data file; if None, read whole
|
||||
file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A DataFrame.
|
||||
"""
|
||||
|
||||
|
||||
def _parse_date(datestr):
|
||||
""" Given a date in xport format, return Python date. """
|
||||
try:
|
||||
# e.g. "16FEB11:10:07:55"
|
||||
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
|
||||
except ValueError:
|
||||
return pd.NaT
|
||||
|
||||
|
||||
def _split_line(s, parts):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
s: string
|
||||
Fixed-length string to split
|
||||
parts: list of (name, length) pairs
|
||||
Used to break up string, name '_' will be filtered from output.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dict of name:contents of string at given location.
|
||||
"""
|
||||
out = {}
|
||||
start = 0
|
||||
for name, length in parts:
|
||||
out[name] = s[start : start + length].strip()
|
||||
start += length
|
||||
del out["_"]
|
||||
return out
|
||||
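A toy illustration of the fixed-width splitter above (the input string and field widths
are made up; fields named "_" are dropped from the result):

    row = _split_line("HEADERxxxxxx2021", [("tag", 6), ("_", 6), ("year", 4)])
    assert row == {"tag": "HEADER", "year": "2021"}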
|
||||
|
||||
def _handle_truncated_float_vec(vec, nbytes):
|
||||
# This feature is not well documented, but some SAS XPORT files
|
||||
# have 2-7 byte "truncated" floats. To read these truncated
|
||||
# floats, pad them with zeros on the right to make 8 byte floats.
|
||||
#
|
||||
# References:
|
||||
# https://github.com/jcushman/xport/pull/3
|
||||
# The R "foreign" library
|
||||
|
||||
if nbytes != 8:
|
||||
vec1 = np.zeros(len(vec), np.dtype("S8"))
|
||||
dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes))
|
||||
vec2 = vec1.view(dtype=dtype)
|
||||
vec2["f0"] = vec
|
||||
return vec2
|
||||
|
||||
return vec
|
||||
|
||||
|
||||
def _parse_float_vec(vec):
|
||||
"""
|
||||
Parse a vector of float values representing IBM 8 byte floats into
|
||||
native 8 byte floats.
|
||||
"""
|
||||
|
||||
dtype = np.dtype(">u4,>u4")
|
||||
vec1 = vec.view(dtype=dtype)
|
||||
xport1 = vec1["f0"]
|
||||
xport2 = vec1["f1"]
|
||||
|
||||
# Start by setting first half of ieee number to first half of IBM
|
||||
# number sans exponent
|
||||
ieee1 = xport1 & 0x00FFFFFF
|
||||
|
||||
# The fraction bit to the left of the binary point in the ieee
|
||||
# format was set and the number was shifted 0, 1, 2, or 3
|
||||
# places. This will tell us how to adjust the ibm exponent to be a
|
||||
# power of 2 ieee exponent and how to shift the fraction bits to
|
||||
# restore the correct magnitude.
|
||||
shift = np.zeros(len(vec), dtype=np.uint8)
|
||||
shift[np.where(xport1 & 0x00200000)] = 1
|
||||
shift[np.where(xport1 & 0x00400000)] = 2
|
||||
shift[np.where(xport1 & 0x00800000)] = 3
|
||||
|
||||
# shift the ieee number down the correct number of places then
|
||||
# set the second half of the ieee number to be the second half
|
||||
# of the ibm number shifted appropriately, ored with the bits
|
||||
# from the first half that would have been shifted in if we
|
||||
# could shift a double. All we are worried about are the low
|
||||
# order 3 bits of the first half since we're only shifting by
|
||||
# 1, 2, or 3.
|
||||
ieee1 >>= shift
|
||||
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
|
||||
|
||||
# clear the 1 bit to the left of the binary point
|
||||
ieee1 &= 0xFFEFFFFF
|
||||
|
||||
# set the exponent of the ieee number to be the actual exponent
|
||||
# plus the shift count + 1023. Or this into the first half of the
|
||||
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
|
||||
# since during conversion to ibm format the exponent is
|
||||
# incremented by 1 and the fraction bits left 4 positions to the
|
||||
# right of the radix point. (had to add >> 24 because C treats &
|
||||
# 0x7f as 0x7f000000 and Python doesn't)
|
||||
ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
|
||||
xport1 & 0x80000000
|
||||
)
|
||||
|
||||
ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
|
||||
ieee["f0"] = ieee1
|
||||
ieee["f1"] = ieee2
|
||||
ieee = ieee.view(dtype=">f8")
|
||||
ieee = ieee.astype("f8")
|
||||
|
||||
return ieee
|
||||
|
||||
|
||||
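A quick sanity check of the conversion above, assuming the helper is used as defined:
the IBM hex-float encoding of 1.0 is 0x4110000000000000 (sign 0, excess-64 exponent
0x41, fraction 0x10 followed by zeros).

    import numpy as np

    ibm = np.frombuffer(b"\x41\x10\x00\x00\x00\x00\x00\x00", dtype=">u4,>u4")
    assert float(_parse_float_vec(ibm)[0]) == 1.0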
class XportReader(BaseIterator):
|
||||
__doc__ = _xport_reader_doc
|
||||
|
||||
def __init__(
|
||||
self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None
|
||||
):
|
||||
|
||||
self._encoding = encoding
|
||||
self._lines_read = 0
|
||||
self._index = index
|
||||
self._chunksize = chunksize
|
||||
|
||||
if isinstance(filepath_or_buffer, str):
|
||||
(
|
||||
filepath_or_buffer,
|
||||
encoding,
|
||||
compression,
|
||||
should_close,
|
||||
) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding)
|
||||
|
||||
if isinstance(filepath_or_buffer, (str, bytes)):
|
||||
self.filepath_or_buffer = open(filepath_or_buffer, "rb")
|
||||
else:
|
||||
# Copy to BytesIO, and ensure no encoding
|
||||
contents = filepath_or_buffer.read()
|
||||
try:
|
||||
contents = contents.encode(self._encoding)
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
self.filepath_or_buffer = BytesIO(contents)
|
||||
|
||||
self._read_header()
|
||||
|
||||
def close(self):
|
||||
self.filepath_or_buffer.close()
|
||||
|
||||
def _get_row(self):
|
||||
return self.filepath_or_buffer.read(80).decode()
|
||||
|
||||
def _read_header(self):
|
||||
self.filepath_or_buffer.seek(0)
|
||||
|
||||
# read file header
|
||||
line1 = self._get_row()
|
||||
if line1 != _correct_line1:
|
||||
self.close()
|
||||
raise ValueError("Header record is not an XPORT file.")
|
||||
|
||||
line2 = self._get_row()
|
||||
fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
|
||||
file_info = _split_line(line2, fif)
|
||||
if file_info["prefix"] != "SAS SAS SASLIB":
|
||||
self.close()
|
||||
raise ValueError("Header record has invalid prefix.")
|
||||
file_info["created"] = _parse_date(file_info["created"])
|
||||
self.file_info = file_info
|
||||
|
||||
line3 = self._get_row()
|
||||
file_info["modified"] = _parse_date(line3[:16])
|
||||
|
||||
# read member header
|
||||
header1 = self._get_row()
|
||||
header2 = self._get_row()
|
||||
headflag1 = header1.startswith(_correct_header1)
|
||||
headflag2 = header2 == _correct_header2
|
||||
if not (headflag1 and headflag2):
|
||||
self.close()
|
||||
raise ValueError("Member header not found")
|
||||
# usually 140, could be 135
|
||||
fieldnamelength = int(header1[-5:-2])
|
||||
|
||||
# member info
|
||||
mem = [
|
||||
["prefix", 8],
|
||||
["set_name", 8],
|
||||
["sasdata", 8],
|
||||
["version", 8],
|
||||
["OS", 8],
|
||||
["_", 24],
|
||||
["created", 16],
|
||||
]
|
||||
member_info = _split_line(self._get_row(), mem)
|
||||
mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
|
||||
member_info.update(_split_line(self._get_row(), mem))
|
||||
member_info["modified"] = _parse_date(member_info["modified"])
|
||||
member_info["created"] = _parse_date(member_info["created"])
|
||||
self.member_info = member_info
|
||||
|
||||
# read field names
|
||||
types = {1: "numeric", 2: "char"}
|
||||
fieldcount = int(self._get_row()[54:58])
|
||||
datalength = fieldnamelength * fieldcount
|
||||
# round up to nearest 80
|
||||
if datalength % 80:
|
||||
datalength += 80 - datalength % 80
|
||||
fielddata = self.filepath_or_buffer.read(datalength)
|
||||
fields = []
|
||||
obs_length = 0
|
||||
while len(fielddata) >= fieldnamelength:
|
||||
# pull data for one field
|
||||
field, fielddata = (
|
||||
fielddata[:fieldnamelength],
|
||||
fielddata[fieldnamelength:],
|
||||
)
|
||||
|
||||
# rest at end gets ignored, so if field is short, pad out
|
||||
# to match struct pattern below
|
||||
field = field.ljust(140)
|
||||
|
||||
fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field)
|
||||
field = dict(zip(_fieldkeys, fieldstruct))
|
||||
del field["_"]
|
||||
field["ntype"] = types[field["ntype"]]
|
||||
fl = field["field_length"]
|
||||
if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
|
||||
self.close()
|
||||
msg = "Floating field width {0} is not between 2 and 8."
|
||||
raise TypeError(msg.format(fl))
|
||||
|
||||
for k, v in field.items():
|
||||
try:
|
||||
field[k] = v.strip()
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
obs_length += field["field_length"]
|
||||
fields += [field]
|
||||
|
||||
header = self._get_row()
|
||||
if not header == _correct_obs_header:
|
||||
self.close()
|
||||
raise ValueError("Observation header not found.")
|
||||
|
||||
self.fields = fields
|
||||
self.record_length = obs_length
|
||||
self.record_start = self.filepath_or_buffer.tell()
|
||||
|
||||
self.nobs = self._record_count()
|
||||
self.columns = [x["name"].decode() for x in self.fields]
|
||||
|
||||
# Setup the dtype.
|
||||
dtypel = [
|
||||
("s" + str(i), "S" + str(field["field_length"]))
|
||||
for i, field in enumerate(self.fields)
|
||||
]
|
||||
dtype = np.dtype(dtypel)
|
||||
self._dtype = dtype
|
||||
|
||||
def __next__(self):
|
||||
return self.read(nrows=self._chunksize or 1)
|
||||
|
||||
def _record_count(self):
|
||||
"""
|
||||
Get number of records in file.
|
||||
|
||||
This is maybe suboptimal because we have to seek to the end of
|
||||
the file.
|
||||
|
||||
Side effect: returns file position to record_start.
|
||||
"""
|
||||
|
||||
self.filepath_or_buffer.seek(0, 2)
|
||||
total_records_length = self.filepath_or_buffer.tell() - self.record_start
|
||||
|
||||
if total_records_length % 80 != 0:
|
||||
warnings.warn("xport file may be corrupted")
|
||||
|
||||
if self.record_length > 80:
|
||||
self.filepath_or_buffer.seek(self.record_start)
|
||||
return total_records_length // self.record_length
|
||||
|
||||
self.filepath_or_buffer.seek(-80, 2)
|
||||
last_card = self.filepath_or_buffer.read(80)
|
||||
last_card = np.frombuffer(last_card, dtype=np.uint64)
|
||||
|
||||
# 8 byte blank
|
||||
ix = np.flatnonzero(last_card == 2314885530818453536)
|
||||
|
||||
if len(ix) == 0:
|
||||
tail_pad = 0
|
||||
else:
|
||||
tail_pad = 8 * len(ix)
|
||||
|
||||
self.filepath_or_buffer.seek(self.record_start)
|
||||
|
||||
return (total_records_length - tail_pad) // self.record_length
|
||||
|
||||
def get_chunk(self, size=None):
|
||||
"""
|
||||
Read lines from an Xport file and return them as a DataFrame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size : int, defaults to None
|
||||
Number of lines to read. If None, reads whole file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
"""
|
||||
if size is None:
|
||||
size = self._chunksize
|
||||
return self.read(nrows=size)
|
||||
|
||||
def _missing_double(self, vec):
|
||||
v = vec.view(dtype="u1,u1,u2,u4")
|
||||
miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
|
||||
miss1 = (
|
||||
((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
|
||||
| (v["f0"] == 0x5F)
|
||||
| (v["f0"] == 0x2E)
|
||||
)
|
||||
miss &= miss1
|
||||
return miss
|
||||
|
||||
@Appender(_read_method_doc)
|
||||
def read(self, nrows=None):
|
||||
|
||||
if nrows is None:
|
||||
nrows = self.nobs
|
||||
|
||||
read_lines = min(nrows, self.nobs - self._lines_read)
|
||||
read_len = read_lines * self.record_length
|
||||
if read_len <= 0:
|
||||
self.close()
|
||||
raise StopIteration
|
||||
raw = self.filepath_or_buffer.read(read_len)
|
||||
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
|
||||
|
||||
df = pd.DataFrame(index=range(read_lines))
|
||||
for j, x in enumerate(self.columns):
|
||||
vec = data["s%d" % j]
|
||||
ntype = self.fields[j]["ntype"]
|
||||
if ntype == "numeric":
|
||||
vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
|
||||
miss = self._missing_double(vec)
|
||||
v = _parse_float_vec(vec)
|
||||
v[miss] = np.nan
|
||||
elif self.fields[j]["ntype"] == "char":
|
||||
v = [y.rstrip() for y in vec]
|
||||
|
||||
if self._encoding is not None:
|
||||
v = [y.decode(self._encoding) for y in v]
|
||||
|
||||
df[x] = v
|
||||
|
||||
if self._index is None:
|
||||
df.index = range(self._lines_read, self._lines_read + read_lines)
|
||||
else:
|
||||
df = df.set_index(self._index)
|
||||
|
||||
self._lines_read += read_lines
|
||||
|
||||
return df
|
86
venv/lib/python3.6/site-packages/pandas/io/sas/sasreader.py
Normal file
@@ -0,0 +1,86 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas.io.common import _stringify_path


def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify "
            "a format string"
        )
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
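A sketch of the two call patterns handled above (the file names are placeholders and
process() stands in for arbitrary per-chunk work):

    df = read_sas("airline.sas7bdat")                    # format inferred from the extension
    reader = read_sas("airline.xpt", chunksize=10000)    # returns an XportReader
    for chunk in reader:
        process(chunk)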
44
venv/lib/python3.6/site-packages/pandas/io/spss.py
Normal file
@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Optional, Sequence, Union

from pandas.compat._optional import import_optional_dependency

from pandas.api.types import is_list_like
from pandas.core.api import DataFrame


def read_spss(
    path: Union[str, Path],
    usecols: Optional[Sequence[str]] = None,
    convert_categoricals: bool = True,
) -> DataFrame:
    """
    Load an SPSS file from the file path, returning a DataFrame.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    path : string or Path
        File path.
    usecols : list-like, optional
        Return a subset of the columns. If None, return all columns.
    convert_categoricals : bool, default is True
        Convert categorical columns into pd.Categorical.

    Returns
    -------
    DataFrame
    """
    pyreadstat = import_optional_dependency("pyreadstat")

    if usecols is not None:
        if not is_list_like(usecols):
            raise TypeError("usecols must be list-like.")
        else:
            usecols = list(usecols)  # pyreadstat requires a list

    df, _ = pyreadstat.read_sav(
        path, usecols=usecols, apply_value_formats=convert_categoricals
    )
    return df
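A minimal call sketch for the function above; the path and column names are placeholders
and pyreadstat must be installed.

    df = read_spss("survey.sav", usecols=["age", "income"], convert_categoricals=True)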
1800
venv/lib/python3.6/site-packages/pandas/io/sql.py
Normal file
File diff suppressed because it is too large
3185
venv/lib/python3.6/site-packages/pandas/io/stata.py
Normal file
File diff suppressed because it is too large