Initial commit
This commit is contained in:
596
venv/lib/python3.8/site-packages/joblib/compressor.py
Normal file
596
venv/lib/python3.8/site-packages/joblib/compressor.py
Normal file
@@ -0,0 +1,596 @@
|
||||
"""Classes and functions for managing compressors."""
|
||||
|
||||
import sys
|
||||
import io
|
||||
import zlib
|
||||
from pkg_resources import parse_version
|
||||
|
||||
from ._compat import _basestring, PY3_OR_LATER
|
||||
|
||||
try:
|
||||
from threading import RLock
|
||||
except ImportError:
|
||||
from dummy_threading import RLock
|
||||
|
||||
try:
|
||||
import bz2
|
||||
except ImportError:
|
||||
bz2 = None
|
||||
|
||||
try:
|
||||
import lzma
|
||||
except ImportError:
|
||||
lzma = None
|
||||
|
||||
try:
|
||||
import lz4
|
||||
if PY3_OR_LATER:
|
||||
from lz4.frame import LZ4FrameFile
|
||||
except ImportError:
|
||||
lz4 = None
|
||||
|
||||
LZ4_NOT_INSTALLED_ERROR = ('LZ4 is not installed. Install it with pip: '
|
||||
'https://python-lz4.readthedocs.io/')
|
||||
|
||||
# Registered compressors
|
||||
_COMPRESSORS = {}
|
||||
|
||||
# Magic numbers of supported compression file formats.
|
||||
_ZFILE_PREFIX = b'ZF' # used with pickle files created before 0.9.3.
|
||||
_ZLIB_PREFIX = b'\x78'
|
||||
_GZIP_PREFIX = b'\x1f\x8b'
|
||||
_BZ2_PREFIX = b'BZ'
|
||||
_XZ_PREFIX = b'\xfd\x37\x7a\x58\x5a'
|
||||
_LZMA_PREFIX = b'\x5d\x00'
|
||||
_LZ4_PREFIX = b'\x04\x22\x4D\x18'
|
||||
|
||||
|
||||
def register_compressor(compressor_name, compressor,
|
||||
force=False):
|
||||
"""Register a new compressor.
|
||||
|
||||
Parameters
|
||||
-----------
|
||||
compressor_name: str.
|
||||
The name of the compressor.
|
||||
compressor: CompressorWrapper
|
||||
An instance of a 'CompressorWrapper'.
|
||||
"""
|
||||
global _COMPRESSORS
|
||||
if not isinstance(compressor_name, _basestring):
|
||||
raise ValueError("Compressor name should be a string, "
|
||||
"'{}' given.".format(compressor_name))
|
||||
|
||||
if not isinstance(compressor, CompressorWrapper):
|
||||
raise ValueError("Compressor should implement the CompressorWrapper "
|
||||
"interface, '{}' given.".format(compressor))
|
||||
|
||||
if (compressor.fileobj_factory is not None and
|
||||
(not hasattr(compressor.fileobj_factory, 'read') or
|
||||
not hasattr(compressor.fileobj_factory, 'write') or
|
||||
not hasattr(compressor.fileobj_factory, 'seek') or
|
||||
not hasattr(compressor.fileobj_factory, 'tell'))):
|
||||
raise ValueError("Compressor 'fileobj_factory' attribute should "
|
||||
"implement the file object interface, '{}' given."
|
||||
.format(compressor.fileobj_factory))
|
||||
|
||||
if compressor_name in _COMPRESSORS and not force:
|
||||
raise ValueError("Compressor '{}' already registered."
|
||||
.format(compressor_name))
|
||||
|
||||
_COMPRESSORS[compressor_name] = compressor
|
||||
|
||||
|
||||
class CompressorWrapper():
|
||||
"""A wrapper around a compressor file object.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
obj: a file-like object
|
||||
The object must implement the buffer interface and will be used
|
||||
internally to compress/decompress the data.
|
||||
prefix: bytestring
|
||||
A bytestring corresponding to the magic number that identifies the
|
||||
file format associated to the compressor.
|
||||
extention: str
|
||||
The file extension used to automatically select this compressor during
|
||||
a dump to a file.
|
||||
"""
|
||||
|
||||
def __init__(self, obj, prefix=b'', extension=''):
|
||||
self.fileobj_factory = obj
|
||||
self.prefix = prefix
|
||||
self.extension = extension
|
||||
|
||||
def compressor_file(self, fileobj, compresslevel=None):
|
||||
"""Returns an instance of a compressor file object."""
|
||||
if compresslevel is None:
|
||||
return self.fileobj_factory(fileobj, 'wb')
|
||||
else:
|
||||
return self.fileobj_factory(fileobj, 'wb',
|
||||
compresslevel=compresslevel)
|
||||
|
||||
def decompressor_file(self, fileobj):
|
||||
"""Returns an instance of a decompressor file object."""
|
||||
return self.fileobj_factory(fileobj, 'rb')
|
||||
|
||||
|
||||
class BZ2CompressorWrapper(CompressorWrapper):
|
||||
|
||||
prefix = _BZ2_PREFIX
|
||||
extension = '.bz2'
|
||||
|
||||
def __init__(self):
|
||||
if bz2 is not None:
|
||||
self.fileobj_factory = bz2.BZ2File
|
||||
else:
|
||||
self.fileobj_factory = None
|
||||
|
||||
def _check_versions(self):
|
||||
if bz2 is None:
|
||||
raise ValueError('bz2 module is not compiled on your python '
|
||||
'standard library.')
|
||||
|
||||
def compressor_file(self, fileobj, compresslevel=None):
|
||||
"""Returns an instance of a compressor file object."""
|
||||
self._check_versions()
|
||||
if compresslevel is None:
|
||||
return self.fileobj_factory(fileobj, 'wb')
|
||||
else:
|
||||
return self.fileobj_factory(fileobj, 'wb',
|
||||
compresslevel=compresslevel)
|
||||
|
||||
def decompressor_file(self, fileobj):
|
||||
"""Returns an instance of a decompressor file object."""
|
||||
self._check_versions()
|
||||
if PY3_OR_LATER:
|
||||
fileobj = self.fileobj_factory(fileobj, 'rb')
|
||||
else:
|
||||
# In python 2, BZ2File doesn't support a fileobj opened in
|
||||
# binary mode. In this case, we pass the filename.
|
||||
fileobj = self.fileobj_factory(fileobj.name, 'rb')
|
||||
return fileobj
|
||||
|
||||
|
||||
class LZMACompressorWrapper(CompressorWrapper):
|
||||
|
||||
prefix = _LZMA_PREFIX
|
||||
extension = '.lzma'
|
||||
|
||||
def __init__(self):
|
||||
if lzma is not None:
|
||||
self.fileobj_factory = lzma.LZMAFile
|
||||
else:
|
||||
self.fileobj_factory = None
|
||||
|
||||
def compressor_file(self, fileobj, compresslevel=None):
|
||||
"""Returns an instance of a compressor file object."""
|
||||
if compresslevel is None:
|
||||
return self.fileobj_factory(fileobj, 'wb',
|
||||
format=lzma.FORMAT_ALONE)
|
||||
else:
|
||||
return self.fileobj_factory(fileobj, 'wb',
|
||||
format=lzma.FORMAT_ALONE,
|
||||
preset=compresslevel)
|
||||
|
||||
def decompressor_file(self, fileobj):
|
||||
"""Returns an instance of a decompressor file object."""
|
||||
if PY3_OR_LATER and lzma is not None:
|
||||
# We support lzma only in python 3 because in python 2 users
|
||||
# may have installed the pyliblzma package, which also provides
|
||||
# the lzma module, but that unfortunately doesn't fully support
|
||||
# the buffer interface required by joblib.
|
||||
# See https://github.com/joblib/joblib/issues/403 for details.
|
||||
return lzma.LZMAFile(fileobj, 'rb')
|
||||
else:
|
||||
raise NotImplementedError("Lzma decompression is not "
|
||||
"supported for this version of "
|
||||
"python ({}.{})"
|
||||
.format(sys.version_info[0],
|
||||
sys.version_info[1]))
|
||||
|
||||
|
||||
class XZCompressorWrapper(LZMACompressorWrapper):
|
||||
|
||||
prefix = _XZ_PREFIX
|
||||
extension = '.xz'
|
||||
|
||||
def __init__(self):
|
||||
if lzma is not None:
|
||||
self.fileobj_factory = lzma.LZMAFile
|
||||
else:
|
||||
self.fileobj_factory = None
|
||||
|
||||
def compressor_file(self, fileobj, compresslevel=None):
|
||||
"""Returns an instance of a compressor file object."""
|
||||
if compresslevel is None:
|
||||
return self.fileobj_factory(fileobj, 'wb', check=lzma.CHECK_NONE)
|
||||
else:
|
||||
return self.fileobj_factory(fileobj, 'wb', check=lzma.CHECK_NONE,
|
||||
preset=compresslevel)
|
||||
|
||||
|
||||
class LZ4CompressorWrapper(CompressorWrapper):
|
||||
|
||||
prefix = _LZ4_PREFIX
|
||||
extension = '.lz4'
|
||||
|
||||
def __init__(self):
|
||||
if PY3_OR_LATER and lz4 is not None:
|
||||
self.fileobj_factory = LZ4FrameFile
|
||||
else:
|
||||
self.fileobj_factory = None
|
||||
|
||||
def _check_versions(self):
|
||||
if not PY3_OR_LATER:
|
||||
raise ValueError('lz4 compression is only available with '
|
||||
'python3+.')
|
||||
|
||||
if lz4 is None or (
|
||||
parse_version(lz4.__version__) < parse_version('0.19')
|
||||
):
|
||||
raise ValueError(LZ4_NOT_INSTALLED_ERROR)
|
||||
|
||||
def compressor_file(self, fileobj, compresslevel=None):
|
||||
"""Returns an instance of a compressor file object."""
|
||||
self._check_versions()
|
||||
if compresslevel is None:
|
||||
return self.fileobj_factory(fileobj, 'wb')
|
||||
else:
|
||||
return self.fileobj_factory(fileobj, 'wb',
|
||||
compression_level=compresslevel)
|
||||
|
||||
def decompressor_file(self, fileobj):
|
||||
"""Returns an instance of a decompressor file object."""
|
||||
self._check_versions()
|
||||
return self.fileobj_factory(fileobj, 'rb')
|
||||
|
||||
|
||||
###############################################################################
|
||||
# base file compression/decompression object definition
|
||||
_MODE_CLOSED = 0
|
||||
_MODE_READ = 1
|
||||
_MODE_READ_EOF = 2
|
||||
_MODE_WRITE = 3
|
||||
_BUFFER_SIZE = 8192
|
||||
|
||||
|
||||
class BinaryZlibFile(io.BufferedIOBase):
|
||||
"""A file object providing transparent zlib (de)compression.
|
||||
|
||||
A BinaryZlibFile can act as a wrapper for an existing file object, or refer
|
||||
directly to a named file on disk.
|
||||
|
||||
Note that BinaryZlibFile provides only a *binary* file interface: data read
|
||||
is returned as bytes, and data to be written should be given as bytes.
|
||||
|
||||
This object is an adaptation of the BZ2File object and is compatible with
|
||||
versions of python >= 2.7.
|
||||
|
||||
If filename is a str or bytes object, it gives the name
|
||||
of the file to be opened. Otherwise, it should be a file object,
|
||||
which will be used to read or write the compressed data.
|
||||
|
||||
mode can be 'rb' for reading (default) or 'wb' for (over)writing
|
||||
|
||||
If mode is 'wb', compresslevel can be a number between 1
|
||||
and 9 specifying the level of compression: 1 produces the least
|
||||
compression, and 9 produces the most compression. 3 is the default.
|
||||
"""
|
||||
|
||||
wbits = zlib.MAX_WBITS
|
||||
|
||||
def __init__(self, filename, mode="rb", compresslevel=3):
|
||||
# This lock must be recursive, so that BufferedIOBase's
|
||||
# readline(), readlines() and writelines() don't deadlock.
|
||||
self._lock = RLock()
|
||||
self._fp = None
|
||||
self._closefp = False
|
||||
self._mode = _MODE_CLOSED
|
||||
self._pos = 0
|
||||
self._size = -1
|
||||
self.compresslevel = compresslevel
|
||||
|
||||
if not isinstance(compresslevel, int) or not (1 <= compresslevel <= 9):
|
||||
raise ValueError("'compresslevel' must be an integer "
|
||||
"between 1 and 9. You provided 'compresslevel={}'"
|
||||
.format(compresslevel))
|
||||
|
||||
if mode == "rb":
|
||||
self._mode = _MODE_READ
|
||||
self._decompressor = zlib.decompressobj(self.wbits)
|
||||
self._buffer = b""
|
||||
self._buffer_offset = 0
|
||||
elif mode == "wb":
|
||||
self._mode = _MODE_WRITE
|
||||
self._compressor = zlib.compressobj(self.compresslevel,
|
||||
zlib.DEFLATED, self.wbits,
|
||||
zlib.DEF_MEM_LEVEL, 0)
|
||||
else:
|
||||
raise ValueError("Invalid mode: %r" % (mode,))
|
||||
|
||||
if isinstance(filename, _basestring):
|
||||
self._fp = io.open(filename, mode)
|
||||
self._closefp = True
|
||||
elif hasattr(filename, "read") or hasattr(filename, "write"):
|
||||
self._fp = filename
|
||||
else:
|
||||
raise TypeError("filename must be a str or bytes object, "
|
||||
"or a file")
|
||||
|
||||
def close(self):
|
||||
"""Flush and close the file.
|
||||
|
||||
May be called more than once without error. Once the file is
|
||||
closed, any other operation on it will raise a ValueError.
|
||||
"""
|
||||
with self._lock:
|
||||
if self._mode == _MODE_CLOSED:
|
||||
return
|
||||
try:
|
||||
if self._mode in (_MODE_READ, _MODE_READ_EOF):
|
||||
self._decompressor = None
|
||||
elif self._mode == _MODE_WRITE:
|
||||
self._fp.write(self._compressor.flush())
|
||||
self._compressor = None
|
||||
finally:
|
||||
try:
|
||||
if self._closefp:
|
||||
self._fp.close()
|
||||
finally:
|
||||
self._fp = None
|
||||
self._closefp = False
|
||||
self._mode = _MODE_CLOSED
|
||||
self._buffer = b""
|
||||
self._buffer_offset = 0
|
||||
|
||||
@property
|
||||
def closed(self):
|
||||
"""True if this file is closed."""
|
||||
return self._mode == _MODE_CLOSED
|
||||
|
||||
def fileno(self):
|
||||
"""Return the file descriptor for the underlying file."""
|
||||
self._check_not_closed()
|
||||
return self._fp.fileno()
|
||||
|
||||
def seekable(self):
|
||||
"""Return whether the file supports seeking."""
|
||||
return self.readable() and self._fp.seekable()
|
||||
|
||||
def readable(self):
|
||||
"""Return whether the file was opened for reading."""
|
||||
self._check_not_closed()
|
||||
return self._mode in (_MODE_READ, _MODE_READ_EOF)
|
||||
|
||||
def writable(self):
|
||||
"""Return whether the file was opened for writing."""
|
||||
self._check_not_closed()
|
||||
return self._mode == _MODE_WRITE
|
||||
|
||||
# Mode-checking helper functions.
|
||||
|
||||
def _check_not_closed(self):
|
||||
if self.closed:
|
||||
fname = getattr(self._fp, 'name', None)
|
||||
msg = "I/O operation on closed file"
|
||||
if fname is not None:
|
||||
msg += " {}".format(fname)
|
||||
msg += "."
|
||||
raise ValueError(msg)
|
||||
|
||||
def _check_can_read(self):
|
||||
if self._mode not in (_MODE_READ, _MODE_READ_EOF):
|
||||
self._check_not_closed()
|
||||
raise io.UnsupportedOperation("File not open for reading")
|
||||
|
||||
def _check_can_write(self):
|
||||
if self._mode != _MODE_WRITE:
|
||||
self._check_not_closed()
|
||||
raise io.UnsupportedOperation("File not open for writing")
|
||||
|
||||
def _check_can_seek(self):
|
||||
if self._mode not in (_MODE_READ, _MODE_READ_EOF):
|
||||
self._check_not_closed()
|
||||
raise io.UnsupportedOperation("Seeking is only supported "
|
||||
"on files open for reading")
|
||||
if not self._fp.seekable():
|
||||
raise io.UnsupportedOperation("The underlying file object "
|
||||
"does not support seeking")
|
||||
|
||||
# Fill the readahead buffer if it is empty. Returns False on EOF.
|
||||
def _fill_buffer(self):
|
||||
if self._mode == _MODE_READ_EOF:
|
||||
return False
|
||||
# Depending on the input data, our call to the decompressor may not
|
||||
# return any data. In this case, try again after reading another block.
|
||||
while self._buffer_offset == len(self._buffer):
|
||||
try:
|
||||
rawblock = (self._decompressor.unused_data or
|
||||
self._fp.read(_BUFFER_SIZE))
|
||||
if not rawblock:
|
||||
raise EOFError
|
||||
except EOFError:
|
||||
# End-of-stream marker and end of file. We're good.
|
||||
self._mode = _MODE_READ_EOF
|
||||
self._size = self._pos
|
||||
return False
|
||||
else:
|
||||
self._buffer = self._decompressor.decompress(rawblock)
|
||||
self._buffer_offset = 0
|
||||
return True
|
||||
|
||||
# Read data until EOF.
|
||||
# If return_data is false, consume the data without returning it.
|
||||
def _read_all(self, return_data=True):
|
||||
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
|
||||
self._buffer = self._buffer[self._buffer_offset:]
|
||||
self._buffer_offset = 0
|
||||
|
||||
blocks = []
|
||||
while self._fill_buffer():
|
||||
if return_data:
|
||||
blocks.append(self._buffer)
|
||||
self._pos += len(self._buffer)
|
||||
self._buffer = b""
|
||||
if return_data:
|
||||
return b"".join(blocks)
|
||||
|
||||
# Read a block of up to n bytes.
|
||||
# If return_data is false, consume the data without returning it.
|
||||
def _read_block(self, n_bytes, return_data=True):
|
||||
# If we have enough data buffered, return immediately.
|
||||
end = self._buffer_offset + n_bytes
|
||||
if end <= len(self._buffer):
|
||||
data = self._buffer[self._buffer_offset: end]
|
||||
self._buffer_offset = end
|
||||
self._pos += len(data)
|
||||
return data if return_data else None
|
||||
|
||||
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
|
||||
self._buffer = self._buffer[self._buffer_offset:]
|
||||
self._buffer_offset = 0
|
||||
|
||||
blocks = []
|
||||
while n_bytes > 0 and self._fill_buffer():
|
||||
if n_bytes < len(self._buffer):
|
||||
data = self._buffer[:n_bytes]
|
||||
self._buffer_offset = n_bytes
|
||||
else:
|
||||
data = self._buffer
|
||||
self._buffer = b""
|
||||
if return_data:
|
||||
blocks.append(data)
|
||||
self._pos += len(data)
|
||||
n_bytes -= len(data)
|
||||
if return_data:
|
||||
return b"".join(blocks)
|
||||
|
||||
def read(self, size=-1):
|
||||
"""Read up to size uncompressed bytes from the file.
|
||||
|
||||
If size is negative or omitted, read until EOF is reached.
|
||||
Returns b'' if the file is already at EOF.
|
||||
"""
|
||||
with self._lock:
|
||||
self._check_can_read()
|
||||
if size == 0:
|
||||
return b""
|
||||
elif size < 0:
|
||||
return self._read_all()
|
||||
else:
|
||||
return self._read_block(size)
|
||||
|
||||
def readinto(self, b):
|
||||
"""Read up to len(b) bytes into b.
|
||||
|
||||
Returns the number of bytes read (0 for EOF).
|
||||
"""
|
||||
with self._lock:
|
||||
return io.BufferedIOBase.readinto(self, b)
|
||||
|
||||
def write(self, data):
|
||||
"""Write a byte string to the file.
|
||||
|
||||
Returns the number of uncompressed bytes written, which is
|
||||
always len(data). Note that due to buffering, the file on disk
|
||||
may not reflect the data written until close() is called.
|
||||
"""
|
||||
with self._lock:
|
||||
self._check_can_write()
|
||||
# Convert data type if called by io.BufferedWriter.
|
||||
if isinstance(data, memoryview):
|
||||
data = data.tobytes()
|
||||
|
||||
compressed = self._compressor.compress(data)
|
||||
self._fp.write(compressed)
|
||||
self._pos += len(data)
|
||||
return len(data)
|
||||
|
||||
# Rewind the file to the beginning of the data stream.
|
||||
def _rewind(self):
|
||||
self._fp.seek(0, 0)
|
||||
self._mode = _MODE_READ
|
||||
self._pos = 0
|
||||
self._decompressor = zlib.decompressobj(self.wbits)
|
||||
self._buffer = b""
|
||||
self._buffer_offset = 0
|
||||
|
||||
def seek(self, offset, whence=0):
|
||||
"""Change the file position.
|
||||
|
||||
The new position is specified by offset, relative to the
|
||||
position indicated by whence. Values for whence are:
|
||||
|
||||
0: start of stream (default); offset must not be negative
|
||||
1: current stream position
|
||||
2: end of stream; offset must not be positive
|
||||
|
||||
Returns the new file position.
|
||||
|
||||
Note that seeking is emulated, so depending on the parameters,
|
||||
this operation may be extremely slow.
|
||||
"""
|
||||
with self._lock:
|
||||
self._check_can_seek()
|
||||
|
||||
# Recalculate offset as an absolute file position.
|
||||
if whence == 0:
|
||||
pass
|
||||
elif whence == 1:
|
||||
offset = self._pos + offset
|
||||
elif whence == 2:
|
||||
# Seeking relative to EOF - we need to know the file's size.
|
||||
if self._size < 0:
|
||||
self._read_all(return_data=False)
|
||||
offset = self._size + offset
|
||||
else:
|
||||
raise ValueError("Invalid value for whence: %s" % (whence,))
|
||||
|
||||
# Make it so that offset is the number of bytes to skip forward.
|
||||
if offset < self._pos:
|
||||
self._rewind()
|
||||
else:
|
||||
offset -= self._pos
|
||||
|
||||
# Read and discard data until we reach the desired position.
|
||||
self._read_block(offset, return_data=False)
|
||||
|
||||
return self._pos
|
||||
|
||||
def tell(self):
|
||||
"""Return the current file position."""
|
||||
with self._lock:
|
||||
self._check_not_closed()
|
||||
return self._pos
|
||||
|
||||
|
||||
class ZlibCompressorWrapper(CompressorWrapper):
|
||||
|
||||
def __init__(self):
|
||||
CompressorWrapper.__init__(self, obj=BinaryZlibFile,
|
||||
prefix=_ZLIB_PREFIX, extension='.z')
|
||||
|
||||
|
||||
class BinaryGzipFile(BinaryZlibFile):
|
||||
"""A file object providing transparent gzip (de)compression.
|
||||
|
||||
If filename is a str or bytes object, it gives the name
|
||||
of the file to be opened. Otherwise, it should be a file object,
|
||||
which will be used to read or write the compressed data.
|
||||
|
||||
mode can be 'rb' for reading (default) or 'wb' for (over)writing
|
||||
|
||||
If mode is 'wb', compresslevel can be a number between 1
|
||||
and 9 specifying the level of compression: 1 produces the least
|
||||
compression, and 9 produces the most compression. 3 is the default.
|
||||
"""
|
||||
|
||||
wbits = 31 # zlib compressor/decompressor wbits value for gzip format.
|
||||
|
||||
|
||||
class GzipCompressorWrapper(CompressorWrapper):
|
||||
|
||||
def __init__(self):
|
||||
CompressorWrapper.__init__(self, obj=BinaryGzipFile,
|
||||
prefix=_GZIP_PREFIX, extension='.gz')
|
||||
Reference in New Issue
Block a user