from __future__ import annotations
from typing import Protocol, ByteString, BinaryIO, Iterable, Generator, Optional
from enum import Enum, auto
from pathlib import PurePath, PureWindowsPath
import errno
import os
from io import IOBase
from exasol.bucketfs._buckets import BucketLike, SaaSBucket, MountedBucket
from exasol.bucketfs._service import Service
from exasol.bucketfs._error import BucketFsError
# File name suffixes that are stripped (repeatedly) from a path when its
# UDF-side location is computed; see `_remove_archive_suffix`.
# NOTE(review): presumably these are the archive types BucketFS unpacks on
# upload - confirm against the BucketFS documentation.
ARCHIVE_SUFFIXES = [".tar", ".gz", ".tgz", ".zip"]
class StorageBackend(Enum):
    """
    Enumeration of the BucketFS storage backends selectable in `build_path`.

    Members can be looked up by name, e.g. ``StorageBackend['saas']``.
    """

    # Bucket reached through an on-premises BucketFS service.
    onprem = auto()
    # Bucket hosted in Exasol SaaS.
    saas = auto()
    # Bucket files mounted as a directory (as seen from a UDF).
    mounted = auto()
class PathLike(Protocol):
    """
    Structural interface of a path-like view onto the files stored in a bucket.
    """

    @property
    def name(self) -> str:
        """
        The final component of the path, without drive and root (if there are any).
        """

    @property
    def suffix(self) -> str:
        """
        The file extension of the final path component, or an empty string.
        """

    @property
    def root(self) -> str:
        """
        The root of the path as a string, or an empty string if there is none.
        """

    @property
    def parent(self) -> str:
        """
        The logical parent of this path.
        """

    def as_uri(self) -> str:
        """
        Render the path as a file URI, from which the location can be reconstructed.
        """

    def as_udf_path(self) -> str:
        """
        BucketFS specific: return the location of this path as it is seen
        from inside a UDF.
        """

    def exists(self) -> bool:
        """
        Tell whether the path refers to an existing file or directory.
        """

    def is_dir(self) -> bool:
        """
        Tell whether the path refers to a directory.
        Returns False for any other kind of file.
        """

    def is_file(self) -> bool:
        """
        Tell whether the path refers to a regular file.
        Returns False for any other kind of file.
        """

    def read(self, chunk_size: int = 8192) -> Iterable[ByteString]:
        """
        Read the content of the file this path points to.

        Only meaningful for paths for which `is_file()` returns True.

        Args:
            chunk_size: size of the chunks yielded by the returned iterator.

        Returns:
            An iterator over the file content, delivered chunk by chunk.

        Raises:
            FileNotFoundError: if the file does not exist.
            IsADirectoryError: if the path points to a directory.
        """

    def write(self, data: ByteString | BinaryIO | Iterable[ByteString]) -> None:
        """
        Write data to this path.

        Missing parent directories are created implicitly. Once the write has
        succeeded, `exists()` returns True for this path. An already existing
        file is overwritten.

        Args:
            data: the content to be written to the path.

        Raises:
            An implementation specific error if the path is not a file path.
        """

    def rm(self) -> None:
        """
        Remove the file this path points to.

        Note:
            The path is deleted only if both `exists()` and `is_file()` hold
            for it; otherwise an exception is raised.

        Raises:
            FileNotFoundError: if the file does not exist.
        """

    def rmdir(self, recursive: bool = False) -> None:
        """
        Remove the directory this path points to.

        Note:
            To stay close to pathlib, a non-empty directory is not deleted
            unless `recursive` is set to True.

        Args:
            recursive: if True, the directory is deleted together with its whole
                content (files and sub-directories). If False and the directory
                is not empty, an error is raised.

        Raises:
            FileNotFoundError: if the directory does not exist.
            OSError: if `recursive` is False and the directory is not empty
                (`BucketPath` raises it with `errno.ENOTEMPTY`; an
                implementation may use a subclass such as PermissionError).
        """

    def joinpath(self, *path_segments) -> "PathLike":
        """
        Combine this path with each of the given segments in turn.

        Returns:
            A new pathlike object pointing to the combined path.
        """

    def walk(self, top_down: bool = True) -> Generator[tuple["PathLike", list[str], list[str]], None, None]:
        """
        Walk the directory tree below this path, either top-down or bottom-up.

        Note:
            Mimics https://docs.python.org/3/library/pathlib.html#pathlib.Path.walk
            as closely as possible, apart from the functionality tied to the
            additional parameters of the `pathlib` walk.

        Yields:
            A 3-tuple of (dirpath, dirnames, filenames).
        """

    def iterdir(self) -> Generator["PathLike", None, None]:
        """
        Yield path objects for the content of the directory this path points to.

        Note:
            The protocol states that nothing is yielded if the path points to
            a file.
            NOTE(review): `BucketPath.iterdir` raises NotADirectoryError in
            that case - confirm which behaviour is intended.

        Yields:
            All direct children of the pathlike object.
        """

    def __truediv__(self, other):
        """
        Overload of the `/` operator for joining paths; see `joinpath` and `pathlib.Path`.
        """
def _remove_archive_suffix(path: PurePath) -> PurePath:
    """
    Strip every trailing archive suffix from the given path.

    Suffixes are removed one at a time, so a name like 'data.tar.gz' is
    reduced to 'data'. A path whose suffix is not listed in
    `ARCHIVE_SUFFIXES` is returned unchanged.
    """
    stripped = path
    while True:
        if stripped.suffix not in ARCHIVE_SUFFIXES:
            return stripped
        stripped = stripped.with_suffix('')
class _BucketFile:
"""
A node in a perceived file structure of a bucket.
This can be a file, a directory or both.
"""
def __init__(self, name: str, parent: str = ''):
self._name = name
self._path = f'{parent}/{name}' if parent else name
self._children: Optional[dict[str, "_BucketFile"]] = None
self.is_file = False
@property
def name(self):
return self._name
@property
def path(self):
return self._path
@property
def is_dir(self):
# The node can be a directory as well as a file,
# hence is the is_dir property, independent of is_file.
return bool(self._children)
def __iter__(self):
if self._children is None:
return iter(())
return iter(self._children.values())
def get_child(self, child_name: str) -> "_BucketFile":
"""
Returns a child object with the specified name.
Creates one if it hasn't been created yet.
"""
if self._children is None:
self._children = {}
child: Optional["_BucketFile"] = None
else:
child = self._children.get(child_name)
if child is None:
child = _BucketFile(child_name, self._path)
self._children[child_name] = child
return child
class BucketPath:
    """
    Implementation of the PathLike view for files in a bucket.

    The pure-path part of the interface is delegated to a `pathlib.PurePath`;
    everything that touches the bucket goes through the bucket API object.
    A bucket only stores files, so the directory structure is derived from
    the '/' separated file names reported by the bucket.
    """

    def __init__(self, path: str | PurePath, bucket_api: "BucketLike"):
        """
        :param path:        A pure path of a file or directory. The path is assumed to
                            be relative to the bucket. It is also permissible to have
                            this path in an absolute form, e.g. '/dir1/...'
                            or '\\\\abc\\...\\'.

                            All Pure Path methods of the PathLike protocol will be
                            delegated to this object.

        :param bucket_api:  An object supporting the Bucket API protocol.
        """
        self._path = PurePath(path)
        self._bucket_api = bucket_api

    def _get_relative_posix(self):
        """
        Returns the pure path of this object as a string, in the format of a bucket
        file: 'dir/subdir/.../filename'. The anchor (drive and root) is dropped and
        the bucket root is represented by an empty string.
        """
        path_str = str(self._path)[len(self._path.anchor):]
        if isinstance(self._path, PureWindowsPath):
            path_str = path_str.replace('\\', '/')
        if path_str == '.':
            path_str = ''
        return path_str

    def _navigate(self) -> Optional["_BucketFile"]:
        """
        Reads the bucket file structure and navigates to the node corresponding to the
        pure path of this object. Returns None if such node doesn't exist, otherwise
        returns this node, populated with the sub-tree of files below it.
        """
        path_str = self._get_relative_posix()
        path_len = len(path_str)
        # The root node must carry the full bucket-relative path, because the
        # paths of its descendants (used for deleting and walking) are derived
        # from it. Hence the complete parent path is used here, not just the
        # name of the parent.
        parent_str = path_str.rpartition('/')[0]
        path_root: Optional["_BucketFile"] = None
        for file_name in self._bucket_api.files:
            if not file_name.startswith(path_str):
                continue
            remainder = file_name[path_len:]
            # A plain prefix match is not sufficient: 'dir1' must not match
            # 'dir10/file'. The file either is this path, or continues with
            # a separator. The bucket root (empty path) matches everything.
            if path_str and remainder and not remainder.startswith('/'):
                continue
            if path_root is None:
                path_root = _BucketFile(self._path.name, parent_str)
            node = path_root
            for part in remainder.split('/'):
                if part:
                    node = node.get_child(part)
            node.is_file = True
        return path_root

    @property
    def name(self) -> str:
        """The final component of the path."""
        return self._path.name

    @property
    def suffix(self) -> str:
        """The file extension of the final component, if any."""
        return self._path.suffix

    @property
    def root(self) -> str:
        """The root of the underlying pure path, if any."""
        return self._path.root

    @property
    def parent(self) -> str:
        """The name (final component only) of the logical parent of this path."""
        return self._path.parent.name

    def as_uri(self) -> str:
        """
        Delegates to `PurePath.as_uri`. Note that pathlib can express only
        absolute paths as a file URI and raises ValueError otherwise.
        """
        return self._path.as_uri()

    def as_udf_path(self) -> str:
        """
        Returns the location of this path as seen from a UDF: the UDF path of
        the bucket joined with this path, archive suffixes removed.
        """
        # Join the bucket-relative form of the path. Joining an absolute path
        # would discard the left-hand side, i.e. the UDF path of the bucket.
        relative_path = PurePath(self._get_relative_posix())
        return str(PurePath(self._bucket_api.udf_path) /
                   _remove_archive_suffix(relative_path))

    def exists(self) -> bool:
        """True if the bucket has a file at this path or anywhere below it."""
        return self._navigate() is not None

    def is_dir(self) -> bool:
        """True if the bucket has at least one file below this path."""
        current_node = self._navigate()
        return (current_node is not None) and current_node.is_dir

    def is_file(self) -> bool:
        """True if the bucket has a file exactly at this path."""
        current_node = self._navigate()
        return (current_node is not None) and current_node.is_file

    def read(self, chunk_size: int = 8192) -> Iterable[ByteString]:
        """Downloads the file at this path; returns whatever the bucket API yields."""
        return self._bucket_api.download(str(self._path), chunk_size)

    def write(self, data: ByteString | BinaryIO | Iterable[ByteString]) -> None:
        """
        Uploads the data to this path.

        An iterable of byte chunks is concatenated into a single bytes object
        before the upload; any other kind of data is handed to the bucket API
        unchanged.
        """
        if not isinstance(data, (IOBase, ByteString)) and isinstance(data, Iterable):
            # The iterable can be a one-shot iterator (e.g. a generator), so
            # its items are materialised once and then both checked and joined.
            # Checking the iterator itself would consume it before the join.
            chunks = list(data)
            if all(isinstance(chunk, ByteString) for chunk in chunks):
                data = b''.join(chunks)
        self._bucket_api.upload(str(self._path), data)

    def rm(self) -> None:
        """
        Deletes the file at this path.

        :raises FileNotFoundError: if nothing exists at this path.
        :raises IsADirectoryError: if the path is a directory but not a file.
        """
        current_node = self._navigate()
        if current_node is None:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
        if not current_node.is_file:
            raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), str(self._path))
        self._bucket_api.delete(str(self._path))

    def rmdir(self, recursive: bool = False) -> None:
        """
        Deletes the directory at this path.

        :param recursive: if True, all files below this path are deleted.

        :raises NotADirectoryError: if the path exists but has no files below it.
        :raises OSError: with ENOTEMPTY, if `recursive` is False. An existing
                         directory is by construction never empty.
        """
        current_node = self._navigate()
        if current_node is None:
            # There is no such thing as an empty directory. So, for the sake of
            # compatibility with the PathLike, any directory that doesn't exist
            # is considered empty.
            return
        if not current_node.is_dir:
            raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(self._path))
        if recursive:
            self._rmdir_recursive(current_node)
        else:
            raise OSError(errno.ENOTEMPTY, os.strerror(errno.ENOTEMPTY), str(self._path))

    def _rmdir_recursive(self, node: "_BucketFile"):
        """Deletes all files in the sub-tree of the node, children first."""
        for child in node:
            self._rmdir_recursive(child)
        if node.is_file:
            self._bucket_api.delete(node.path)

    def joinpath(self, *path_segments) -> "PathLike":
        """Returns a new path with the given segments appended, on the same bucket."""
        # The path segments can be of either this type or an os.PathLike.
        cls = type(self)
        seg_paths = [seg._path if isinstance(seg, cls) else seg for seg in path_segments]
        new_path = self._path.joinpath(*seg_paths)
        return cls(new_path, self._bucket_api)

    def walk(self, top_down: bool = True) -> Generator[tuple["PathLike", list[str], list[str]], None, None]:
        """
        Walks the directory tree below this path, yielding 3-tuples of
        (dirpath, dirnames, filenames). Yields nothing for a plain file.

        :raises FileNotFoundError: if nothing exists at this path.
        """
        current_node = self._navigate()
        if current_node is None:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
        if current_node.is_dir:
            yield from self._walk_recursive(current_node, top_down)

    def _walk_recursive(self, node: "_BucketFile", top_down: bool) -> \
            Generator[tuple["PathLike", list[str], list[str]], None, None]:
        """Yields the walk tuple of the node and, recursively, of its sub-directories."""
        bucket_path = BucketPath(node.path, self._bucket_api)
        dir_list: list[str] = []
        file_list: list[str] = []
        for child in node:
            # A child can be a file and a directory at the same time,
            # in which case it is listed in both lists.
            if child.is_file:
                file_list.append(child.name)
            if child.is_dir:
                dir_list.append(child.name)

        # The difference between the top_down and bottom_up is in the order of
        # yielding the current node and its children. Top down - current node first,
        # bottom_up - children first.
        if top_down:
            yield bucket_path, dir_list, file_list
        for child in node:
            if child.is_dir:
                yield from self._walk_recursive(child, top_down)
        if not top_down:
            yield bucket_path, dir_list, file_list

    def iterdir(self) -> Generator["PathLike", None, None]:
        """
        Yields a path object for every direct child of this directory.

        :raises FileNotFoundError:  if nothing exists at this path.
        :raises NotADirectoryError: if the path has no files below it.
        """
        current_node = self._navigate()
        if current_node is None:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(self._path))
        if not current_node.is_dir:
            raise NotADirectoryError(errno.ENOTDIR, os.strerror(errno.ENOTDIR), str(self._path))

        for child in current_node:
            yield BucketPath(self._path / child.name, self._bucket_api)

    def __truediv__(self, other):
        """Overload of `/` for joining; see also `joinpath`."""
        # The other object can be of either this type or an os.PathLike.
        cls = type(self)
        new_path = self._path / (other._path if isinstance(other, cls) else other)
        return cls(new_path, self._bucket_api)

    def __str__(self):
        return str(self._path)
def _create_onprem_bucket(url: str,
                          username: str,
                          password: str,
                          bucket_name: str = 'default',
                          verify: bool | str = True,
                          service_name: Optional[str] = None
                          ) -> BucketLike:
    """
    Creates an on-prem bucket.

    A `Service` is set up for the given url with the credentials of the
    requested bucket, and the bucket is looked up among the buckets that
    the service reports.

    Raises:
        BucketFsError: if the service does not report a bucket with this name.
    """
    bucket_credentials = {'username': username, 'password': password}
    bfs_service = Service(url, {bucket_name: bucket_credentials}, verify, service_name)
    available_buckets = bfs_service.buckets
    if bucket_name in available_buckets:
        return available_buckets[bucket_name]
    raise BucketFsError(f'Bucket {bucket_name} does not exist.')
def _create_saas_bucket(account_id: str,
                        database_id: str,
                        pat: str,
                        url: str = 'https://cloud.exasol.com'
                        ) -> BucketLike:
    """
    Creates a SaaS bucket.

    All arguments are passed on, by keyword, to the `SaaSBucket` constructor.
    """
    saas_bucket = SaaSBucket(url=url,
                             account_id=account_id,
                             database_id=database_id,
                             pat=pat)
    return saas_bucket
def _create_mounted_bucket(service_name: str = 'bfsdefault',
                           bucket_name: str = 'default',
                           base_path: Optional[str] = None
                           ) -> BucketLike:
    """
    Creates a bucket mounted to a UDF.

    Raises:
        BucketFsError: if the root of the mounted bucket does not exist.
    """
    mounted = MountedBucket(service_name, bucket_name, base_path)
    if mounted.root.exists():
        return mounted
    raise BucketFsError(f'Service {service_name} or bucket {bucket_name} do not exist.')
def build_path(**kwargs) -> PathLike:
    """
    Creates a PathLike object based on a bucket in one of the BucketFS storage backends.
    It provides the same interface for the following BucketFS implementations:
    - On-Premises
    - SaaS
    - BucketFS files mounted as read-only directory in a UDF.

    Arguments:
        backend:
            Selects the BucketFS storage backend. The available backends are
            defined in the StorageBackend enumeration; currently, these are
            "onprem", "saas" and "mounted". The value can be given either as a
            string (case-insensitive), e.g. "onprem", or as an enum, e.g.
            StorageBackend.onprem. Defaults to StorageBackend.onprem if omitted.
        path:
            Optional parameter that selects a path within the bucket. If not provided
            the returned PathLike object corresponds to the root of the bucket. Hence,
            an alternative way of creating a PathLike pointing to a particular file or
            directory is as in the code below.
            path = build_path(...) / "the_desired_path"

    The rest of the arguments are backend specific.

    On-prem arguments:
        url:
            Url of the BucketFS service, e.g. `http(s)://127.0.0.1:2580`.
        username:
            BucketFS username (generally, different from the DB username).
        password:
            BucketFS user password.
        bucket_name:
            Name of the bucket. Currently, a PathLike cannot span multiple buckets.
        verify:
            Either a boolean, in which case it controls whether we verify the server's
            TLS certificate, or a string, in which case it must be a path to a CA bundle
            to use. Defaults to ``True``.
        service_name:
            Optional name of the BucketFS service.

    SaaS arguments:
        url:
            Url of the Exasol SaaS. Defaults to 'https://cloud.exasol.com'.
        account_id:
            SaaS user account ID, e.g. 'org_LVeOj4pwXhPatNz5'
            (given example is not a valid ID of an existing account).
        database_id:
            Database ID, e.g. 'msduZKlMR8QCP_MsLsVRwy'
            (given example is not a valid ID of an existing database).
        pat:
            Personal Access Token, e.g. 'exa_pat_aj39AsM3bYR9bQ4qk2wiG8SWHXbRUGNCThnep5YV73az6A'
            (given example is not a valid PAT).

    Mounted BucketFS directory arguments:
        service_name:
            Name of the BucketFS service (not a service url). Defaults to 'bfsdefault'.
        bucket_name:
            Name of the bucket. Currently, a PathLike cannot span multiple buckets.
        base_path:
            Explicitly specified root path in a file system. This is an alternative to
            providing the service_name and the bucket_name.

    Raises:
        KeyError: if the backend is given as a string that is not the name of
            a StorageBackend member.
    """
    # The generic arguments are taken out first; whatever is left in kwargs
    # is handed to the backend specific bucket factory.
    backend = kwargs.pop('backend', StorageBackend.onprem)
    path = kwargs.pop('path', '')

    if isinstance(backend, str):
        backend = StorageBackend[backend.lower()]

    # Any backend value other than onprem and saas selects the mounted bucket.
    if backend == StorageBackend.onprem:
        bucket_factory = _create_onprem_bucket
    elif backend == StorageBackend.saas:
        bucket_factory = _create_saas_bucket
    else:
        bucket_factory = _create_mounted_bucket

    return BucketPath(path, bucket_factory(**kwargs))