From 502492308e7d839bf8d7a47fa9784d13abe4f501 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 7 Aug 2016 19:45:05 -0400 Subject: [PATCH 01/63] support for ecryptfs; this requires sequential file writing using multiple file descriptors, as well as support for getting and setting xattrs on files. --- acd_cli.py | 32 ++++----- acdcli/acd_fuse.py | 145 ++++++++++++++++++++++++++++++++++++----- acdcli/cache/db.py | 13 +++- acdcli/cache/query.py | 34 ++++++++++ acdcli/cache/schema.py | 9 +++ acdcli/cache/sync.py | 32 +++++++++ 6 files changed, 231 insertions(+), 34 deletions(-) diff --git a/acd_cli.py b/acd_cli.py index 52f4b4b..74e224e 100755 --- a/acd_cli.py +++ b/acd_cli.py @@ -1,35 +1,33 @@ #!/usr/bin/env python3 -import sys -import os -import json import argparse +import json import logging import logging.handlers +import os +import re import signal +import sys import time -import re -import appdirs - from collections import namedtuple from configparser import ConfigParser from functools import partial from multiprocessing import Event - from pkgutil import walk_packages + +import appdirs from pkg_resources import iter_entry_points import acdcli +from acdcli import plugins from acdcli.api import client from acdcli.api.common import RequestError, is_valid_id from acdcli.cache import format, db +from acdcli.cache.db import CacheConsts from acdcli.utils import hashing, progress from acdcli.utils.conf import get_conf from acdcli.utils.threading import QueuedLoader from acdcli.utils.time import * -# load local plugin modules (default ones, for developers) -from acdcli import plugins - for importer, modname, ispkg in walk_packages(path=plugins.__path__, prefix=plugins.__name__ + '.', onerror=lambda x: None): if not ispkg: @@ -120,11 +118,13 @@ def pprint(d: dict): # Glue functions (API, cache) # - -class CacheConsts(object): - CHECKPOINT_KEY = 'checkpoint' - LAST_SYNC_KEY = 'last_sync' - MAX_AGE = 30 +def sync_owner_id(): + global cache + owner_id = cache.KeyValueStorage.get(CacheConsts.OWNER_ID) + if not owner_id: + owner_id = acd_client.get_owner_id() + cache.KeyValueStorage[CacheConsts.OWNER_ID] = owner_id + return owner_id def sync_node_list(full=False, to_file=None, from_file=None) -> 'Union[int, None]': @@ -185,12 +185,14 @@ def sync_node_list(full=False, to_file=None, from_file=None) -> 'Union[int, None print() if to_file: out.close() + sync_owner_id() def old_sync() -> 'Union[int, None]': global cache cache.drop_all() cache = db.NodeCache(CACHE_PATH) + sync_owner_id() try: folders = acd_client.get_folder_list() folders.extend(acd_client.get_trashed_folders()) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 68e026c..24465b9 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -2,6 +2,7 @@ import configparser import errno +import json import logging import os import stat @@ -14,6 +15,10 @@ from time import time, sleep import ctypes.util +import binascii + +from acdcli.cache.db import CacheConsts + ctypes.util.__find_library = ctypes.util.find_library def find_library(*args): @@ -43,6 +48,7 @@ def find_library(*args): errno.EREMOTEIO = errno.EIO _SETTINGS_FILENAME = 'fuse.ini' +_XATTR_PROPERTY_NAME = 'xattrs' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -211,7 +217,7 @@ class WriteStream(object): """A WriteStream is a binary file-like object that is backed by a Queue. 
It will remember its current offset.""" - __slots__ = ('q', 'offset', 'error', 'closed', 'done', 'timeout') + __slots__ = ('q', 'offset', 'error', 'closed', 'done', 'timeout', 'lock') def __init__(self, buffer_size, timeout): self.q = Queue(maxsize=buffer_size) @@ -224,6 +230,8 @@ def __init__(self, buffer_size, timeout): self.done = Event() """done event is triggered when file is successfully read and transferred""" self.timeout = timeout + self.lock = Lock() + """make sure only one writer is appending to the queue at once""" def write(self, data: bytes): """Writes data into queue. @@ -309,35 +317,36 @@ def write(self, node_id, fh, offset, bytes_): :raises: FuseOSError: wrong offset or writing failed""" - f = self.files[fh] + f = self.files[node_id] - if f.offset == offset: - f.write(bytes_) - else: - f.error = True # necessary? - logger.error('Wrong offset for writing to fh %s.' % fh) - raise FuseOSError(errno.ESPIPE) + with f.lock: + if f.offset == offset: + f.write(bytes_) + else: + f.error = True # necessary? + logger.error('Wrong offset for writing to fh %s.' % fh) + raise FuseOSError(errno.ESPIPE) if offset == 0: t = Thread(target=self.write_n_sync, args=(f, node_id)) t.daemon = True t.start() - def flush(self, fh): - f = self.files.get(fh) + def flush(self, node_id, fh): + f = self.files.get(node_id) if f: f.flush() - def release(self, fh): + def release(self, node_id, fh): """:raises: FuseOSError""" - f = self.files.get(fh) + f = self.files.get(node_id) if f: try: f.close() except: raise finally: - del self.files[fh] + del self.files[node_id] class LoggingMixIn(object): @@ -377,15 +386,20 @@ def __init__(self, **kwargs): :param kwargs: cache (NodeCache), acd_client (ACDClient), autosync (partial)""" + self.xattr_cache = {} + self.xattr_dirty = set() + self.xattr_cache_lock = Lock() + self.cache = kwargs['cache'] self.acd_client = kwargs['acd_client'] + self.acd_client_owner = self.cache.KeyValueStorage.get(CacheConsts.OWNER_ID) autosync = kwargs['autosync'] conf = kwargs['conf'] - self.rp = ReadProxy(self.acd_client, + self.rp = ReadProxy(self.acd_client, conf.getint('read', 'open_chunk_limit'), conf.getint('read', 'timeout')) """collection of files opened for reading""" - self.wp = WriteProxy(self.acd_client, self.cache, + self.wp = WriteProxy(self.acd_client, self.cache, conf.getint('write', 'buffer_size'), conf.getint('write', 'timeout')) """collection of files opened for writing""" try: @@ -403,6 +417,8 @@ def __init__(self, **kwargs): """file handle counter\n\n :type: int""" self.handles = {} """map fh->node\n\n :type: dict""" + self.node_to_fh = defaultdict(lambda: set()) + """map node_id to list of interested file handles""" self.fh_lock = Lock() """lock for fh counter increment and handle dict writes""" self.nlinks = kwargs.get('nlinks', False) @@ -415,6 +431,7 @@ def __init__(self, **kwargs): p.start() def destroy(self, path): + self._xattr_write_and_sync() self.destroyed.set() def readdir(self, path, fh) -> 'List[str]': @@ -455,6 +472,87 @@ def getattr(self, path, fh=None) -> dict: st_size=node.size, **times) + # def listxattr(self, path): + # node_id = self.cache.resolve_id(path) + # if not node_id: + # raise FuseOSError(errno.ENOENT) + # self._xattr_load(node_id) + # + # with self.xattr_cache_lock: + # try: + # return [k for k, v in self.xattr_cache[node_id].items()] + # except: + # return [] + + def getxattr(self, path, name, position=0): + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + self._xattr_load(node_id) + + with 
self.xattr_cache_lock: + try: + ret = self.xattr_cache[node_id][name] + if ret: + return ret + except: + raise FuseOSError(errno.ENODATA) # should be ENOATTR + else: + raise FuseOSError(errno.ENODATA) # should be ENOATTR + + # def removexattr(self, path, name): + # node_id = self.cache.resolve_id(path) + # if not node_id: + # raise FuseOSError(errno.ENOENT) + # self._xattr_load(node_id) + # + # with self.xattr_cache_lock: + # try: + # if self.xattr_cache[node_id][name]: + # del self.xattr_cache[node_id][name] + # self.properties_dirty.add(node_id) + # except: + # raise FuseOSError(errno.ENODATA) # should be ENOATTR + + def setxattr(self, path, name, value, options, position=0): + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + self._xattr_load(node_id) + + with self.xattr_cache_lock: + try: + self.xattr_cache[node_id][name] = value + self.xattr_dirty.add(node_id) + except: + raise FuseOSError(errno.ENOTSUP) + + def _xattr_load(self, node_id): + with self.xattr_cache_lock: + if node_id not in self.xattr_cache: + xattrs_str = self.cache.get_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME) + try: self.xattr_cache[node_id] = json.loads(xattrs_str) + except: self.xattr_cache[node_id] = {} + for k, v in self.xattr_cache[node_id].items(): + self.xattr_cache[node_id][k] = binascii.a2b_base64(v) + + def _xattr_write_and_sync(self): + with self.xattr_cache_lock: + for node_id in self.xattr_dirty: + try: + xattrs = {} + for k, v in self.xattr_cache[node_id].items(): + xattrs[k] = binascii.b2a_base64(v).decode("utf-8") + xattrs_str = json.dumps(xattrs) + + self.acd_client.add_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, + xattrs_str) + except (RequestError, IOError) as e: + logger.error('Error writing node xattrs "%s". %s' % (node_id, str(e))) + else: + self.cache.insert_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) + self.xattr_dirty.clear() + def read(self, path, length, offset, fh) -> bytes: """Read ```length`` bytes from ``path`` at ``offset``.""" @@ -550,6 +648,7 @@ def create(self, path, mode) -> int: with self.fh_lock: self.fh += 1 self.handles[self.fh] = node + self.node_to_fh[node.id].add(self.fh) return self.fh def rename(self, old, new): @@ -618,6 +717,7 @@ def open(self, path, flags) -> int: with self.fh_lock: self.fh += 1 self.handles[self.fh] = node + self.node_to_fh[node.id].add(self.fh) return self.fh def write(self, path, data, offset, fh) -> int: @@ -631,7 +731,8 @@ def write(self, path, data, offset, fh) -> int: def flush(self, path, fh): """Flushes ``fh`` in WriteProxy.""" - self.wp.flush(fh) + node_id = self.handles[fh].id + self.wp.flush(node_id, fh) def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing @@ -666,8 +767,18 @@ def release(self, path, fh): node = self.cache.resolve(path, trash=False) if node: self.rp.release(node.id) - self.wp.release(fh) with self.fh_lock: + """release the writer if there's no more interest. This allows many file + handles to write to a single node provided they do it in order, enabling + sequential writes using mmap. 
+ """ + interest = self.node_to_fh.get(node.id) + if interest: + interest.discard(fh) + if not interest: + self.wp.release(node.id, fh) + self._xattr_write_and_sync() + del self.node_to_fh[node.id] del self.handles[fh] else: raise FuseOSError(errno.ENOENT) diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index 994a925..6ac66f2 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ -3,7 +3,7 @@ import os import re import sqlite3 -from threading import local +from threading import local, Lock from acdcli.utils.conf import get_conf @@ -24,7 +24,11 @@ _def_conf['sqlite'] = dict(filename='nodes.db', busy_timeout=30000, journal_mode='wal') _def_conf['blacklist'] = dict(folders= []) - +class CacheConsts(object): + CHECKPOINT_KEY = 'checkpoint' + LAST_SYNC_KEY = 'last_sync' + OWNER_ID = 'owner_id' + MAX_AGE = 30 class IntegrityError(Exception): def __init__(self, msg): @@ -61,6 +65,11 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) + self.path_to_node_id = {} + self.path_to_node_id_lock = Lock() + """There are a huge number of repeated path lookups, + so cache results and invalidate on new nodes.""" + with cursor(self._conn) as c: c.execute(_ROOT_ID_SQL) row = c.fetchone() diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 44ea869..04a0d6a 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -45,6 +45,8 @@ def datetime_from_string(dt: str) -> datetime: NODE_BY_ID_SQL = """SELECT n.*, f.* FROM nodes n LEFT OUTER JOIN files f ON n.id = f.id WHERE n.id = (?)""" +PROPERTY_BY_ID_SQL = """SELECT * FROM properties WHERE id=? AND owner=? AND key=?""" + USAGE_SQL = 'SELECT SUM(size) FROM files' FIND_BY_NAME_SQL = """SELECT n.*, f.* FROM nodes n @@ -151,7 +153,31 @@ def get_conflicting_node(self, name: str, parent_id: str): if n.is_available and n.name.lower() == name.lower(): return n + def resolve_id(self, path: str, trash=False) -> 'Union[str|None]': + """Gets a node's id from a path + This is far faster than the below method if the id is cached; + there are zero sqlite queries.""" + with self.path_to_node_id_lock: + try: return self.path_to_node_id[path] + except: pass + n = self._resolve(path, trash) + if n: + self.path_to_node_id[path] = n.id + return n.id + return None + def resolve(self, path: str, trash=False) -> 'Union[Node|None]': + """Gets a node from a path""" + with self.path_to_node_id_lock: + try: return self.get_node(self.path_to_node_id[path]) + except: pass + n = self._resolve(path,trash) + if n: + self.path_to_node_id[path] = n.id + return n + return None + + def _resolve(self, path: str, trash=False) -> 'Union[Node|None]': segments = list(filter(bool, path.split('/'))) if not segments: if not self.root_id: @@ -312,3 +338,11 @@ def file_size_exists(self, size) -> bool: no = c.fetchone()[0] return bool(no) + + def get_property(self, node_id, owner_id, key) -> 'Union[str|None]': + with cursor(self._conn) as c: + c.execute(PROPERTY_BY_ID_SQL, [node_id, owner_id, key]) + r = c.fetchone() + if r: + return r['value'] + return None diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index d5e138b..9939af1 100644 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -28,6 +28,15 @@ CHECK (status IN ('AVAILABLE', 'TRASH', 'PURGED', 'PENDING')) ); + CREATE TABLE properties ( + id VARCHAR(50) NOT NULL, + owner TEXT NOT NULL, + key TEXT NOT NULL, + value TEXT, + PRIMARY KEY (id), + FOREIGN KEY(id) REFERENCES nodes (id) + ); + 
CREATE TABLE labels ( id VARCHAR(50) NOT NULL, name VARCHAR(256) NOT NULL, diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index d6dbe80..1ff4672 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -42,12 +42,16 @@ def remove_purged(self, purged: list): c.execute('DELETE FROM files WHERE id IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM parentage WHERE parent IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM parentage WHERE child IN %s' % placeholders(slice_), slice_) + c.execute('DELETE FROM properties WHERE id IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM labels WHERE id IN %s' % placeholders(slice_), slice_) logger.info('Purged %i node(s).' % len(purged)) def insert_nodes(self, nodes: list, partial=True): """Inserts mixed list of files and folders into cache.""" + with self.path_to_node_id_lock: + self.path_to_node_id.clear() + files = [] folders = [] for node in nodes: @@ -72,6 +76,7 @@ def insert_nodes(self, nodes: list, partial=True): self.insert_files(files) self.insert_parentage(files + folders, partial) + self.insert_properties(files + folders) def insert_node(self, node: dict): """Inserts single file or folder into cache.""" @@ -143,3 +148,30 @@ def insert_parentage(self, nodes: list, partial=True): c.execute('INSERT OR IGNORE INTO parentage VALUES (?, ?)', [p, n['id']]) logger.info('Parented %d node(s).' % len(nodes)) + + def insert_properties(self, nodes: list): + if not nodes: + return + + with mod_cursor(self._conn) as c: + for n in nodes: + if 'properties' not in n: + continue + id = n['id'] + for owner_id, key_value in n['properties'].items(): + for key, value in key_value.items(): + c.execute('INSERT OR REPLACE INTO properties ' + '(id, owner, key, value) ' + 'VALUES (?, ?, ?, ?)', + [id, owner_id, key, value] + ) + + logger.info('Applied properties to %d node(s).' % len(nodes)) + + def insert_property(self, node_id, owner_id, key, value): + with mod_cursor(self._conn) as c: + c.execute('INSERT OR REPLACE INTO properties ' + '(id, owner, key, value) ' + 'VALUES (?, ?, ?, ?)', + [node_id, owner_id, key, value] + ) \ No newline at end of file From 6a1ee48be9274adea11b1dd450dec0926c816e8c Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 8 Aug 2016 14:13:26 -0400 Subject: [PATCH 02/63] prevent binary blob spam in the logs when using set/getxattr --- acdcli/acd_fuse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 24465b9..636f3ca 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -361,6 +361,8 @@ def __call__(self, op, path, *args): targs = (len(args[0]),) + args[1:] elif op == 'chmod': targs = (oct(args[0]),) + args[1:] + elif op == 'setxattr': + targs = (args[0], "binary") logger.debug('-> %s %s %s', op, path, repr(args if not targs else targs)) @@ -374,6 +376,8 @@ def __call__(self, op, path, *args): finally: if op == 'read': ret = len(ret) + elif op == 'getxattr' and ret: + ret = "binary" logger.debug('<- %s %s', op, repr(ret)) From 15ac1e798687f954e3dbcfe2e70243ac92d50e34 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 10 Aug 2016 18:31:58 -0400 Subject: [PATCH 03/63] fix requirements --- requirements.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/requirements.txt b/requirements.txt index d59ad89..e161b1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,9 @@ +appdirs +colorama +fusepy +python-dateutil +requests +requests_toolbelt # adds sphinx module for rtfd.org build process -e . 
From db9f9b066edaf1888a2e0886c9aa26b589f1fd47 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 10 Aug 2016 18:41:46 -0400 Subject: [PATCH 04/63] implement proper mtime handling so rsync can work over acd_cli. --- acdcli/acd_fuse.py | 89 +++++++++++++++++++++++++++++-------------- acdcli/cache/query.py | 7 +++- 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 636f3ca..f93ef95 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -6,6 +6,7 @@ import logging import os import stat +import struct import sys from collections import deque, defaultdict @@ -49,6 +50,7 @@ def find_library(*args): _SETTINGS_FILENAME = 'fuse.ini' _XATTR_PROPERTY_NAME = 'xattrs' +_XATTR_MTIME_OVERRIDE_NAME = 'fuse.mtime' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -462,8 +464,11 @@ def getattr(self, path, fh=None) -> dict: if not node: raise FuseOSError(errno.ENOENT) + try: mtime = self._getxattr_f(node.id, _XATTR_MTIME_OVERRIDE_NAME) + except: mtime = node.modified.timestamp() + times = dict(st_atime=time(), - st_mtime=node.modified.timestamp(), + st_mtime=mtime, st_ctime=node.created.timestamp()) if node.is_folder: @@ -476,24 +481,28 @@ def getattr(self, path, fh=None) -> dict: st_size=node.size, **times) - # def listxattr(self, path): - # node_id = self.cache.resolve_id(path) - # if not node_id: - # raise FuseOSError(errno.ENOENT) - # self._xattr_load(node_id) - # - # with self.xattr_cache_lock: - # try: - # return [k for k, v in self.xattr_cache[node_id].items()] - # except: - # return [] + def listxattr(self, path): + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + return self._listxattr(node_id) + + def _listxattr(self, node_id): + self._xattr_load(node_id) + with self.xattr_cache_lock: + try: + return [k for k, v in self.xattr_cache[node_id].items()] + except: + return [] def getxattr(self, path, name, position=0): node_id = self.cache.resolve_id(path) if not node_id: raise FuseOSError(errno.ENOENT) - self._xattr_load(node_id) + return self._getxattr(node_id, name) + def _getxattr(self, node_id, name): + self._xattr_load(node_id) with self.xattr_cache_lock: try: ret = self.xattr_cache[node_id][name] @@ -504,26 +513,30 @@ def getxattr(self, path, name, position=0): else: raise FuseOSError(errno.ENODATA) # should be ENOATTR - # def removexattr(self, path, name): - # node_id = self.cache.resolve_id(path) - # if not node_id: - # raise FuseOSError(errno.ENOENT) - # self._xattr_load(node_id) - # - # with self.xattr_cache_lock: - # try: - # if self.xattr_cache[node_id][name]: - # del self.xattr_cache[node_id][name] - # self.properties_dirty.add(node_id) - # except: - # raise FuseOSError(errno.ENODATA) # should be ENOATTR + def _getxattr_f(self, node_id, name): + return struct.unpack('d', self._getxattr(node_id, name))[0] - def setxattr(self, path, name, value, options, position=0): + def removexattr(self, path, name): node_id = self.cache.resolve_id(path) if not node_id: raise FuseOSError(errno.ENOENT) + self._removexattr(node_id, name) + + def _removexattr(self, node_id, name): self._xattr_load(node_id) + with self.xattr_cache_lock: + if name in self.xattr_cache[node_id]: + del self.xattr_cache[node_id][name] + self.properties_dirty.add(node_id) + def setxattr(self, path, name, value, options, position=0): + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + self._setxattr(node_id, name, value) + + def 
_setxattr(self, node_id, name, value): + self._xattr_load(node_id) with self.xattr_cache_lock: try: self.xattr_cache[node_id][name] = value @@ -531,6 +544,9 @@ def setxattr(self, path, name, value, options, position=0): except: raise FuseOSError(errno.ENOTSUP) + def _setxattr_f(self, node_id, name, value: float): + self._setxattr(node_id, name, struct.pack('d', value)) + def _xattr_load(self, node_id): with self.xattr_cache_lock: if node_id not in self.xattr_cache: @@ -731,6 +747,8 @@ def write(self, path, data, offset, fh) -> int: node_id = self.handles[fh].id self.wp.write(node_id, fh, offset, data) + """on a write, we can use amazon's modified time""" + self._removexattr(node_id, _XATTR_MTIME_OVERRIDE_NAME) return len(data) def flush(self, path, fh): @@ -788,11 +806,16 @@ def release(self, path, fh): raise FuseOSError(errno.ENOENT) def utimens(self, path, times=None): - """Not functional. Should set node atime and mtime to values as passed in ``times`` - or current time (see :manpage:`utimesat(2)`). + """Should set node atime and mtime to values as passed in ``times`` + or current time (see :manpage:`utimensat(2)`). + Note that this is only implemented for modified time. :param times: [atime, mtime]""" + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + if times: # atime = times[0] mtime = times[1] @@ -800,6 +823,14 @@ def utimens(self, path, times=None): # atime = time() mtime = time() + try: + self._setxattr_f(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) + self._xattr_write_and_sync() + except: + raise FuseOSError(errno.ENOTSUP) + + return 0 + def chmod(self, path, mode): """Not implemented.""" pass diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 04a0d6a..842189e 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -1,5 +1,8 @@ import logging from datetime import datetime + +from dateutil.tz import tzutc + from .cursors import cursor logger = logging.getLogger(__name__) @@ -7,9 +10,9 @@ def datetime_from_string(dt: str) -> datetime: try: - dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S.%f+00:00') + dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S.%f+00:00').replace(tzinfo=tzutc()) except ValueError: - dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S+00:00') + dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S+00:00').replace(tzinfo=tzutc()) return dt From 076dec0542bf8dca08f43325832878c9c7de5d44 Mon Sep 17 00:00:00 2001 From: "benjamin.gemmill@gmail.com" Date: Thu, 11 Aug 2016 18:32:08 -0400 Subject: [PATCH 05/63] bugfix: turns out reinterpret casting floats to bytes and back via structs leads to epsilon problems, causing rsync to think that mtime is different when it isn't. 
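For illustration, a rough sketch of the storage strategy this moves to (standard json module only; the names below are illustrative, not the exact code in this patch): the override mtime is kept as a plain number inside the JSON-encoded xattrs property, so it round-trips exactly with no byte-level reinterpretation:

    import json
    mtime = 1470680705.123456                        # float timestamp from utimens()
    blob = json.dumps({'fuse.mtime': mtime})         # stored in the node's xattrs property
    assert json.loads(blob)['fuse.mtime'] == mtime   # exact round-trip in Python 3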
--- acdcli/acd_fuse.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index f93ef95..0013045 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -464,7 +464,7 @@ def getattr(self, path, fh=None) -> dict: if not node: raise FuseOSError(errno.ENOENT) - try: mtime = self._getxattr_f(node.id, _XATTR_MTIME_OVERRIDE_NAME) + try: mtime = self._getxattr(node.id, _XATTR_MTIME_OVERRIDE_NAME) except: mtime = node.modified.timestamp() times = dict(st_atime=time(), @@ -499,7 +499,7 @@ def getxattr(self, path, name, position=0): node_id = self.cache.resolve_id(path) if not node_id: raise FuseOSError(errno.ENOENT) - return self._getxattr(node_id, name) + return self._getxattr_bytes(node_id, name) def _getxattr(self, node_id, name): self._xattr_load(node_id) @@ -513,8 +513,8 @@ def _getxattr(self, node_id, name): else: raise FuseOSError(errno.ENODATA) # should be ENOATTR - def _getxattr_f(self, node_id, name): - return struct.unpack('d', self._getxattr(node_id, name))[0] + def _getxattr_bytes(self, node_id, name): + return binascii.a2b_base64(self._getxattr(node_id, name)) def removexattr(self, path, name): node_id = self.cache.resolve_id(path) @@ -533,7 +533,7 @@ def setxattr(self, path, name, value, options, position=0): node_id = self.cache.resolve_id(path) if not node_id: raise FuseOSError(errno.ENOENT) - self._setxattr(node_id, name, value) + self._setxattr_bytes(node_id, name, value) def _setxattr(self, node_id, name, value): self._xattr_load(node_id) @@ -544,8 +544,8 @@ def _setxattr(self, node_id, name, value): except: raise FuseOSError(errno.ENOTSUP) - def _setxattr_f(self, node_id, name, value: float): - self._setxattr(node_id, name, struct.pack('d', value)) + def _setxattr_bytes(self, node_id, name, value: bytes): + self._setxattr(node_id, name, binascii.b2a_base64(value).decode("utf-8")) def _xattr_load(self, node_id): with self.xattr_cache_lock: @@ -553,24 +553,19 @@ def _xattr_load(self, node_id): xattrs_str = self.cache.get_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME) try: self.xattr_cache[node_id] = json.loads(xattrs_str) except: self.xattr_cache[node_id] = {} - for k, v in self.xattr_cache[node_id].items(): - self.xattr_cache[node_id][k] = binascii.a2b_base64(v) def _xattr_write_and_sync(self): with self.xattr_cache_lock: for node_id in self.xattr_dirty: try: - xattrs = {} - for k, v in self.xattr_cache[node_id].items(): - xattrs[k] = binascii.b2a_base64(v).decode("utf-8") - xattrs_str = json.dumps(xattrs) - + xattrs_str = json.dumps(self.xattr_cache[node_id]) self.acd_client.add_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) except (RequestError, IOError) as e: logger.error('Error writing node xattrs "%s". 
%s' % (node_id, str(e))) else: self.cache.insert_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) + logger.debug('_xattr_write_and_sync: node: %s xattrs: %s: ' % (node_id, xattrs_str)) self.xattr_dirty.clear() def read(self, path, length, offset, fh) -> bytes: @@ -747,8 +742,6 @@ def write(self, path, data, offset, fh) -> int: node_id = self.handles[fh].id self.wp.write(node_id, fh, offset, data) - """on a write, we can use amazon's modified time""" - self._removexattr(node_id, _XATTR_MTIME_OVERRIDE_NAME) return len(data) def flush(self, path, fh): @@ -824,7 +817,7 @@ def utimens(self, path, times=None): mtime = time() try: - self._setxattr_f(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) + self._setxattr(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) self._xattr_write_and_sync() except: raise FuseOSError(errno.ENOTSUP) From 9184a803668ea56e44007606d27ad40658ee9cb7 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 13 Aug 2016 15:53:28 -0400 Subject: [PATCH 06/63] workaround: due to ecryptfs' bug, reporting incorrect file sizes when using xattrs for crypto headers, we have to allow re-writing the first bytes of a file to make ecryptfs happy. once they fix their bug, this can be removed and we can go back to xattrs. --- acdcli/acd_fuse.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 0013045..b15673c 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -51,6 +51,7 @@ def find_library(*args): _SETTINGS_FILENAME = 'fuse.ini' _XATTR_PROPERTY_NAME = 'xattrs' _XATTR_MTIME_OVERRIDE_NAME = 'fuse.mtime' +_XATTR_HEADER_OVERRIDE_NAME = 'fuse.header' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -363,8 +364,6 @@ def __call__(self, op, path, *args): targs = (len(args[0]),) + args[1:] elif op == 'chmod': targs = (oct(args[0]),) + args[1:] - elif op == 'setxattr': - targs = (args[0], "binary") logger.debug('-> %s %s %s', op, path, repr(args if not targs else targs)) @@ -378,8 +377,6 @@ def __call__(self, op, path, *args): finally: if op == 'read': ret = len(ret) - elif op == 'getxattr' and ret: - ret = "binary" logger.debug('<- %s %s', op, repr(ret)) @@ -584,7 +581,17 @@ def read(self, path, length, offset, fh) -> bytes: if node.size < offset + length: length = node.size - offset - return self.rp.get(node.id, offset, length, node.size) + ret = self.rp.get(node.id, offset, length, node.size) + + """Check if we're overwriting the file's header, and splice that into the read bytes""" + try: + header = self._getxattr_bytes(node.id, _XATTR_HEADER_OVERRIDE_NAME) + if offset < len(header): + header = header[offset:] + ret = header + ret[len(header):] + except: + pass + return ret def statfs(self, path) -> dict: """Gets some filesystem statistics as specified in :manpage:`stat(2)`.""" @@ -741,6 +748,21 @@ def write(self, path, data, offset, fh) -> int: :returns: number of bytes written""" node_id = self.handles[fh].id + + """Allow overwriting a file's header. 
This is useful to support encrypted + filesystems that leave a header at the start of each file, and write to + it while writing to the body..""" + f = self.wp.files[node_id] + with f.lock: + if f.offset > 0 and offset == 0: + """sanity check that all headers must be the same size, + or we could end up overwriting the file in an xattr""" + try: header_sz = len(self._getxattr_bytes(node_id, _XATTR_HEADER_OVERRIDE_NAME)) + except: header_sz = len(data) + if header_sz == len(data): + self._setxattr_bytes(node_id, _XATTR_HEADER_OVERRIDE_NAME, data) + return len(data) + self.wp.write(node_id, fh, offset, data) return len(data) From 45ed3fa617853b1fa430abf4578a309b55ad579e Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 15 Aug 2016 14:41:53 -0400 Subject: [PATCH 07/63] bugfix: who's sick of index errors when writing? this guy. turns out there are some rsync flags that write multiple times to the same memory location, for reasons unknown. this keeps the whole file in a buffer until it's flushed to amazon on file handle closed. future work will be for super large files, we should use a temp file as backing. --- acdcli/acd_fuse.py | 88 ++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index b15673c..507ff8d 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -51,7 +51,6 @@ def find_library(*args): _SETTINGS_FILENAME = 'fuse.ini' _XATTR_PROPERTY_NAME = 'xattrs' _XATTR_MTIME_OVERRIDE_NAME = 'fuse.mtime' -_XATTR_HEADER_OVERRIDE_NAME = 'fuse.header' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -215,6 +214,37 @@ def __init__(self, acd_client, cache, buffer_size, timeout): self.acd_client = acd_client self.cache = cache self.files = defaultdict(lambda: WriteProxy.WriteStream(buffer_size, timeout)) + self.buffers = defaultdict(lambda: WriteProxy.WriteBuffer()) + + class WriteBuffer(object): + """An in-memory segment of a file. This gets pushed out to amazon via a WriteStream on + flush() calls. Anything that hasn't been flushed yet can be rewritten in place any + number of times.""" + + def __init__(self): + self.b = bytearray() + """The memory backing""" + self.lock = Lock() + + def write(self, offset, bytes_: bytes): + """Writes to the buffer and returns the old buffer length""" + with self.lock: + old_len = len(self.b) + if offset > old_len: + logger.error('Wrong offset for writing to buffer; writing gap detected') + raise FuseOSError(errno.ESPIPE) + self.b[offset:offset + len(bytes_)] = bytes_ + return old_len + + def flush(self) -> bytes: + with self.lock: + ret = self.b + self.b = bytearray() + return ret + + def __len__(self): + with self.lock: + return len(self.b) class WriteStream(object): """A WriteStream is a binary file-like object that is backed by a Queue. @@ -320,28 +350,26 @@ def write(self, node_id, fh, offset, bytes_): :raises: FuseOSError: wrong offset or writing failed""" + b = self.buffers[node_id] f = self.files[node_id] - with f.lock: - if f.offset == offset: - f.write(bytes_) - else: - f.error = True # necessary? - logger.error('Wrong offset for writing to fh %s.' 
% fh) - raise FuseOSError(errno.ESPIPE) - - if offset == 0: + if b.write(offset, bytes_) == 0: t = Thread(target=self.write_n_sync, args=(f, node_id)) t.daemon = True t.start() - def flush(self, node_id, fh): + def _flush(self, node_id, fh): f = self.files.get(node_id) - if f: - f.flush() + b = self.buffers.get(node_id) + if f and b: + if len(b): + data = b.flush() + with f.lock: + f.write(data) def release(self, node_id, fh): """:raises: FuseOSError""" + self._flush(node_id, fh) f = self.files.get(node_id) if f: try: @@ -350,6 +378,7 @@ def release(self, node_id, fh): raise finally: del self.files[node_id] + del self.buffers[node_id] class LoggingMixIn(object): @@ -581,17 +610,7 @@ def read(self, path, length, offset, fh) -> bytes: if node.size < offset + length: length = node.size - offset - ret = self.rp.get(node.id, offset, length, node.size) - - """Check if we're overwriting the file's header, and splice that into the read bytes""" - try: - header = self._getxattr_bytes(node.id, _XATTR_HEADER_OVERRIDE_NAME) - if offset < len(header): - header = header[offset:] - ret = header + ret[len(header):] - except: - pass - return ret + return self.rp.get(node.id, offset, length, node.size) def statfs(self, path) -> dict: """Gets some filesystem statistics as specified in :manpage:`stat(2)`.""" @@ -748,28 +767,13 @@ def write(self, path, data, offset, fh) -> int: :returns: number of bytes written""" node_id = self.handles[fh].id - - """Allow overwriting a file's header. This is useful to support encrypted - filesystems that leave a header at the start of each file, and write to - it while writing to the body..""" - f = self.wp.files[node_id] - with f.lock: - if f.offset > 0 and offset == 0: - """sanity check that all headers must be the same size, - or we could end up overwriting the file in an xattr""" - try: header_sz = len(self._getxattr_bytes(node_id, _XATTR_HEADER_OVERRIDE_NAME)) - except: header_sz = len(data) - if header_sz == len(data): - self._setxattr_bytes(node_id, _XATTR_HEADER_OVERRIDE_NAME, data) - return len(data) - self.wp.write(node_id, fh, offset, data) return len(data) def flush(self, path, fh): - """Flushes ``fh`` in WriteProxy.""" - node_id = self.handles[fh].id - self.wp.flush(node_id, fh) + """noop since we need to keep the whole buffer in memory; + acd only supports sequentual writes otherwise""" + pass def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing From 6e30a93934307e8df2eb466e691b438645d09a47 Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 15 Aug 2016 18:00:35 -0400 Subject: [PATCH 08/63] implement truncate and ftruncate since rsync wants these too sometimes. 
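Roughly, the intended buffer-level semantics are: pad with NUL bytes when growing, drop the tail when shrinking. A minimal standalone sketch (a plain bytearray standing in for the write buffer, not the actual WriteBuffer code):

    buf = bytearray(b'abcdef')
    length = 4
    if len(buf) < length:
        buf = buf.ljust(length, b'\0')   # grow: pad with NUL bytes
    else:
        buf = buf[:length]               # shrink: cut the tail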
--- acdcli/acd_fuse.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 507ff8d..18f2623 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -6,7 +6,6 @@ import logging import os import stat -import struct import sys from collections import deque, defaultdict @@ -236,6 +235,13 @@ def write(self, offset, bytes_: bytes): self.b[offset:offset + len(bytes_)] = bytes_ return old_len + def truncate(self, length): + with self.lock: + if len(self.b) < length: + self.b = self.b.ljust(length, '\0') + else: + self.b = self.b[:length] + def flush(self) -> bytes: with self.lock: ret = self.b @@ -358,6 +364,15 @@ def write(self, node_id, fh, offset, bytes_): t.daemon = True t.start() + def truncate(self, node_id, fh, length): + """truncates a buffer if it exists to a given length and returns true. + If not, does nothing (we don't preallocate) and returns false""" + b = self.buffers.get(node_id) + if b: + b.trunate(length) + return True + return False + def _flush(self, node_id, fh): f = self.files.get(node_id) b = self.buffers.get(node_id) @@ -796,8 +811,11 @@ def truncate(self, path, length, fh=None): else: self.cache.insert_node(r) elif length > 0: - if node.size != length: - raise FuseOSError(errno.ENOSYS) + if not self.wp.truncate(node.id, fh, length): + if node.size != length: + """from man 2 truncate; the file is not open for writing""" + raise FuseOSError(errno.EINVAL) + return 0 def release(self, path, fh): """Releases an open ``path``.""" From e80636264e57acdfd000a71aa40713193f066b3e Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 15 Aug 2016 18:47:19 -0400 Subject: [PATCH 09/63] only truncates that shorten are safe without reading the file from acd first and filling in the gaps --- acdcli/acd_fuse.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 18f2623..f5a564a 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -237,10 +237,10 @@ def write(self, offset, bytes_: bytes): def truncate(self, length): with self.lock: - if len(self.b) < length: - self.b = self.b.ljust(length, '\0') - else: + if len(self.b) >= length: self.b = self.b[:length] + return True + return False def flush(self) -> bytes: with self.lock: @@ -369,8 +369,7 @@ def truncate(self, node_id, fh, length): If not, does nothing (we don't preallocate) and returns false""" b = self.buffers.get(node_id) if b: - b.trunate(length) - return True + return b.trunate(length) return False def _flush(self, node_id, fh): @@ -812,9 +811,8 @@ def truncate(self, path, length, fh=None): self.cache.insert_node(r) elif length > 0: if not self.wp.truncate(node.id, fh, length): - if node.size != length: - """from man 2 truncate; the file is not open for writing""" - raise FuseOSError(errno.EINVAL) + logger.debug("truncate: attempting to skip ahead, ignoring") + # raise FuseOSError(errno.EINVAL) return 0 def release(self, path, fh): From 8d87e06a0ec36e10156cd84de13988cf5eec554e Mon Sep 17 00:00:00 2001 From: Ben Date: Tue, 16 Aug 2016 00:48:32 -0400 Subject: [PATCH 10/63] The corner cases around implementing truncate when we only have a (partial) buffer in the middle of writing is hard enough that we bail on it. We don't care about pre-allocating files since we have infinite space, and shortening a file is only possible when it's being written to.... so we can only catch the rare use case of file overwrites and truncate back. Neither are worth it. 
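For the record, the behaviour that remains is tiny; a standalone sketch of the contract (illustrative only, not the actual FUSE method):

    def truncate_contract(current_size: int, length: int) -> int:
        # length == 0: clear the remote content; anything positive: leave the node alone
        return 0 if length == 0 else current_size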
--- acdcli/acd_fuse.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index f5a564a..2991e5d 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -235,13 +235,6 @@ def write(self, offset, bytes_: bytes): self.b[offset:offset + len(bytes_)] = bytes_ return old_len - def truncate(self, length): - with self.lock: - if len(self.b) >= length: - self.b = self.b[:length] - return True - return False - def flush(self) -> bytes: with self.lock: ret = self.b @@ -364,14 +357,6 @@ def write(self, node_id, fh, offset, bytes_): t.daemon = True t.start() - def truncate(self, node_id, fh, length): - """truncates a buffer if it exists to a given length and returns true. - If not, does nothing (we don't preallocate) and returns false""" - b = self.buffers.get(node_id) - if b: - return b.trunate(length) - return False - def _flush(self, node_id, fh): f = self.files.get(node_id) b = self.buffers.get(node_id) @@ -791,7 +776,7 @@ def flush(self, path, fh): def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing - if ``length`` is equal to current file size. + if ``length`` is positive. :raises FuseOSError: if pseudo-truncation to length is not supported""" @@ -809,10 +794,12 @@ def truncate(self, path, length, fh=None): raise FuseOSError.convert(e) else: self.cache.insert_node(r) - elif length > 0: - if not self.wp.truncate(node.id, fh, length): - logger.debug("truncate: attempting to skip ahead, ignoring") - # raise FuseOSError(errno.EINVAL) + + """No good way to deal with positive lengths at the moment; since we can only do + something about it in the middle of writing, this means the only use case we can + capture is when a program over-writes and then truncates back. In the future, if + we can get cached file backing instead of memory backing, there would be more to + do here. In the mean time we ignore.""" return 0 def release(self, path, fh): From 06efeca565720d0e74fadbdbc6f1ff2b2ccaeea4 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 17 Aug 2016 02:57:17 -0400 Subject: [PATCH 11/63] Use memory as a backing for buffers only if the writing size is 1G (configurable) or smaller, disk otherwise. 
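The mechanism is Python's tempfile.SpooledTemporaryFile: writes stay in an in-memory buffer until the file grows past max_size, then the data is rolled over to a real temporary file on disk. Quick illustration with a tiny threshold (the patch passes the configured buffer_size, default 1e9, as max_size):

    import tempfile

    f = tempfile.SpooledTemporaryFile(max_size=8)
    f.write(b'1234')           # 4 bytes: still held in memory
    print(f._rolled)           # False (CPython implementation detail, shown only to illustrate)
    f.write(b'56789abc')       # grows past max_size
    print(f._rolled)           # True: now backed by an on-disk temp file
    f.seek(0)
    print(f.read())            # b'123456789abc'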
--- acdcli/acd_fuse.py | 167 ++++++------------------------------------ acdcli/api/content.py | 20 +++++ 2 files changed, 43 insertions(+), 144 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 2991e5d..1d0bdc3 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -7,12 +7,12 @@ import os import stat import sys +import tempfile from collections import deque, defaultdict from multiprocessing import Process -from queue import Queue, Full as QueueFull -from threading import Thread, Lock, Event -from time import time, sleep +from threading import Thread, Lock +from time import time import ctypes.util import binascii @@ -53,7 +53,7 @@ def find_library(*args): _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) -_def_conf['write'] = dict(buffer_size = 32, timeout=30) +_def_conf['write'] = dict(buffer_size=int(1e9), timeout=30) class FuseOSError(FuseError): @@ -212,172 +212,51 @@ class WriteProxy(object): def __init__(self, acd_client, cache, buffer_size, timeout): self.acd_client = acd_client self.cache = cache - self.files = defaultdict(lambda: WriteProxy.WriteStream(buffer_size, timeout)) - self.buffers = defaultdict(lambda: WriteProxy.WriteBuffer()) + self.buffers = defaultdict(lambda: WriteProxy.WriteBuffer(buffer_size)) class WriteBuffer(object): - """An in-memory segment of a file. This gets pushed out to amazon via a WriteStream on - flush() calls. Anything that hasn't been flushed yet can be rewritten in place any - number of times.""" - - def __init__(self): - self.b = bytearray() - """The memory backing""" + def __init__(self, buffer_size): + self.f = tempfile.SpooledTemporaryFile(max_size=buffer_size) self.lock = Lock() def write(self, offset, bytes_: bytes): - """Writes to the buffer and returns the old buffer length""" with self.lock: - old_len = len(self.b) + self.f.seek(0, os.SEEK_END) + old_len = self.f.tell() if offset > old_len: logger.error('Wrong offset for writing to buffer; writing gap detected') raise FuseOSError(errno.ESPIPE) - self.b[offset:offset + len(bytes_)] = bytes_ + self.f.seek(offset) + self.f.write(bytes_) return old_len - def flush(self) -> bytes: - with self.lock: - ret = self.b - self.b = bytearray() - return ret - - def __len__(self): - with self.lock: - return len(self.b) - - class WriteStream(object): - """A WriteStream is a binary file-like object that is backed by a Queue. - It will remember its current offset.""" - - __slots__ = ('q', 'offset', 'error', 'closed', 'done', 'timeout', 'lock') - - def __init__(self, buffer_size, timeout): - self.q = Queue(maxsize=buffer_size) - """a queue that buffers written blocks""" - self.offset = 0 - """the beginning fpos""" - self.error = False - """whether the read or write failed""" - self.closed = False - self.done = Event() - """done event is triggered when file is successfully read and transferred""" - self.timeout = timeout - self.lock = Lock() - """make sure only one writer is appending to the queue at once""" - - def write(self, data: bytes): - """Writes data into queue. - - :raises: FuseOSError on timeout""" - - if self.error: - raise FuseOSError(errno.EREMOTEIO) - try: - self.q.put(data, timeout=self.timeout) - except QueueFull: - logger.error('Write timeout.') - raise FuseOSError(errno.ETIMEDOUT) - self.offset += len(data) - - def read(self, ln=0) -> bytes: - """Returns as much byte data from queue as possible. - Returns empty bytestring (EOF) if queue is empty and file was closed. 
- - :raises: IOError""" - - if self.error: - raise IOError(errno.EIO, errno.errorcode[errno.EIO]) - - if self.closed and self.q.empty(): - return b'' - - b = [self.q.get()] - self.q.task_done() - while not self.q.empty(): - b.append(self.q.get()) - self.q.task_done() - - return b''.join(b) - - def flush(self): - """Waits until the queue is emptied. - - :raises: FuseOSError""" - - while True: - if self.error: - raise FuseOSError(errno.EREMOTEIO) - if self.q.empty(): - return - sleep(1) - - def close(self): - """Sets the closed flag to signal 'EOF' to the read function. - Then, waits until :attr:`done` event is triggered. - - :raises: FuseOSError""" - - self.closed = True - # prevent read deadlock - self.q.put(b'') - - # wait until read is complete - while True: - if self.error: - raise FuseOSError(errno.EREMOTEIO) - if self.done.wait(1): - return - - def write_n_sync(self, stream: WriteStream, node_id: str): - """Try to overwrite file with id ``node_id`` with content from ``stream``. - Triggers the :attr:`WriteStream.done` event on success. - - :param stream: a file-like object""" + def get_file(self): + self.f.seek(0) + return self.f + def _write_and_sync(self, buffer: WriteBuffer, node_id: str): try: - r = self.acd_client.overwrite_stream(stream, node_id) + r = self.acd_client.overwrite_tempfile(node_id, buffer.get_file()) except (RequestError, IOError) as e: - stream.error = True logger.error('Error writing node "%s". %s' % (node_id, str(e))) else: self.cache.insert_node(r) - stream.done.set() def write(self, node_id, fh, offset, bytes_): - """Gets WriteStream from defaultdict. Creates overwrite thread if offset is 0, - tries to continue otherwise. + """Gets WriteBuffer from defaultdict. :raises: FuseOSError: wrong offset or writing failed""" b = self.buffers[node_id] - f = self.files[node_id] - - if b.write(offset, bytes_) == 0: - t = Thread(target=self.write_n_sync, args=(f, node_id)) - t.daemon = True - t.start() - - def _flush(self, node_id, fh): - f = self.files.get(node_id) - b = self.buffers.get(node_id) - if f and b: - if len(b): - data = b.flush() - with f.lock: - f.write(data) + b.write(offset, bytes_) def release(self, node_id, fh): """:raises: FuseOSError""" - self._flush(node_id, fh) - f = self.files.get(node_id) - if f: - try: - f.close() - except: - raise - finally: - del self.files[node_id] - del self.buffers[node_id] + + b = self.buffers.get(node_id) + if b: + self._write_and_sync(b, node_id) + del self.buffers[node_id] class LoggingMixIn(object): diff --git a/acdcli/api/content.py b/acdcli/api/content.py index ae5d9fd..1218501 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -3,6 +3,7 @@ import json import io import mimetypes +import tempfile from collections import OrderedDict import logging from urllib.parse import quote_plus @@ -211,6 +212,25 @@ def overwrite_file(self, node_id: str, file_name: str, return r.json() + def overwrite_tempfile(self, node_id: str, file, + read_callbacks: list = None, deduplication=False) -> dict: + params = {} if deduplication else {'suppress': 'deduplication'} + + basename = "file.bin" + mime_type = _get_mimetype(basename) + f = _TeeBufferedReader(file, callbacks=read_callbacks) + + # basename is ignored + m = MultipartEncoder(fields={('content', (quote_plus(basename), f, mime_type))}) + + r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params, + data=m, stream=True, headers={'Content-Type': m.content_type}) + + if r.status_code not in OK_CODES: + raise RequestError(r.status_code, r.text) + 
+ return r.json() + def overwrite_stream(self, stream, node_id: str, read_callbacks: list = None) -> dict: """Overwrite content of node with ID *node_id* with content of *stream*. From f1428f84379e7f010cee85062dd1bb023b4cb3fd Mon Sep 17 00:00:00 2001 From: Ben Date: Thu, 18 Aug 2016 11:26:49 -0400 Subject: [PATCH 12/63] expand the path -> node caching to solve large delays with readdir + many getattr calls for very full directories --- acdcli/acd_fuse.py | 33 +++++++++++++++++---------------- acdcli/cache/db.py | 4 ++-- acdcli/cache/query.py | 29 ++++++++++++----------------- acdcli/cache/sync.py | 7 +++++-- 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 1d0bdc3..8b9c28f 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -355,7 +355,8 @@ def readdir(self, path, fh) -> 'List[str]': if not node.type == 'folder': raise FuseOSError(errno.ENOTDIR) - return [_ for _ in ['.', '..'] + [c for c in self.cache.childrens_names(node.id)]] + folders, files = self.cache.list_children(folder_id=node.id, folder_path=path) + return [_ for _ in ['.', '..'] + [c.name for c in folders + files]] def getattr(self, path, fh=None) -> dict: """Creates a stat-like attribute dict, see :manpage:`stat(2)`. @@ -386,10 +387,10 @@ def getattr(self, path, fh=None) -> dict: **times) def listxattr(self, path): - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) - return self._listxattr(node_id) + return self._listxattr(node.id) def _listxattr(self, node_id): self._xattr_load(node_id) @@ -400,10 +401,10 @@ def _listxattr(self, node_id): return [] def getxattr(self, path, name, position=0): - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) - return self._getxattr_bytes(node_id, name) + return self._getxattr_bytes(node.id, name) def _getxattr(self, node_id, name): self._xattr_load(node_id) @@ -421,10 +422,10 @@ def _getxattr_bytes(self, node_id, name): return binascii.a2b_base64(self._getxattr(node_id, name)) def removexattr(self, path, name): - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) - self._removexattr(node_id, name) + self._removexattr(node.id, name) def _removexattr(self, node_id, name): self._xattr_load(node_id) @@ -434,10 +435,10 @@ def _removexattr(self, node_id, name): self.properties_dirty.add(node_id) def setxattr(self, path, name, value, options, position=0): - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) - self._setxattr_bytes(node_id, name, value) + self._setxattr_bytes(node.id, name, value) def _setxattr(self, node_id, name, value): self._xattr_load(node_id) @@ -713,8 +714,8 @@ def utimens(self, path, times=None): :param times: [atime, mtime]""" - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) if times: @@ -725,7 +726,7 @@ def utimens(self, path, times=None): mtime = time() try: - self._setxattr(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) + self._setxattr(node.id, _XATTR_MTIME_OVERRIDE_NAME, mtime) self._xattr_write_and_sync() except: raise FuseOSError(errno.ENOTSUP) diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index 6ac66f2..7cb4b6b 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py 
@@ -65,8 +65,8 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) - self.path_to_node_id = {} - self.path_to_node_id_lock = Lock() + self.path_to_node_cache = {} + self.path_to_node_cache_lock = Lock() """There are a huge number of repeated path lookups, so cache results and invalidate on new nodes.""" diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 842189e..6e9c05e 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -156,27 +156,14 @@ def get_conflicting_node(self, name: str, parent_id: str): if n.is_available and n.name.lower() == name.lower(): return n - def resolve_id(self, path: str, trash=False) -> 'Union[str|None]': - """Gets a node's id from a path - This is far faster than the below method if the id is cached; - there are zero sqlite queries.""" - with self.path_to_node_id_lock: - try: return self.path_to_node_id[path] - except: pass - n = self._resolve(path, trash) - if n: - self.path_to_node_id[path] = n.id - return n.id - return None - def resolve(self, path: str, trash=False) -> 'Union[Node|None]': """Gets a node from a path""" - with self.path_to_node_id_lock: - try: return self.get_node(self.path_to_node_id[path]) + with self.path_to_node_cache_lock: + try: return self.path_to_node_cache[path] except: pass n = self._resolve(path,trash) if n: - self.path_to_node_id[path] = n.id + self.path_to_node_cache[path] = n return n return None @@ -270,7 +257,7 @@ def get_child(self, folder_id, child_name) -> 'Union[Node|None]': if r.is_available: return r - def list_children(self, folder_id, trash=False) -> 'Tuple[List[Node], List[Node]]': + def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List[Node], List[Node]]': files = [] folders = [] @@ -286,6 +273,14 @@ def list_children(self, folder_id, trash=False) -> 'Tuple[List[Node], List[Node] folders.append(node) node = c.fetchone() + """If the caller provides the folder_path, we can add all the children to the + path->node_id cache for faster lookup after a directory listing""" + if folder_path: + children = folders + files + with self.path_to_node_cache_lock: + for c in children: + self.path_to_node_cache[folder_path + '/' + c.name] = c + return folders, files def list_trashed_children(self, folder_id) -> 'Tuple[List[Node], List[Node]]': diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index 1ff4672..daacc90 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -49,8 +49,11 @@ def remove_purged(self, purged: list): def insert_nodes(self, nodes: list, partial=True): """Inserts mixed list of files and folders into cache.""" - with self.path_to_node_id_lock: - self.path_to_node_id.clear() + + """Flush the path cache since these new nodes may be deletes, moves, or renames + that affect the path cache, or overwrites that would invalidate the data in it.""" + with self.path_to_node_cache_lock: + self.path_to_node_cache.clear() files = [] folders = [] From 4f9bc8b1da32256fdf3a3ba9081e5746e0494c6b Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 21 Aug 2016 08:51:34 -0400 Subject: [PATCH 13/63] remove tz-aware timestamp references since we're handing mtime in xattrs now. 
--- acdcli/cache/query.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 6e9c05e..46fe0a4 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -1,8 +1,5 @@ import logging from datetime import datetime - -from dateutil.tz import tzutc - from .cursors import cursor logger = logging.getLogger(__name__) @@ -10,9 +7,9 @@ def datetime_from_string(dt: str) -> datetime: try: - dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S.%f+00:00').replace(tzinfo=tzutc()) + dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S.%f+00:00') except ValueError: - dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S+00:00').replace(tzinfo=tzutc()) + dt = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S+00:00') return dt From d75fc545f694ebff32d1221745cec400fbc97f01 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 21 Aug 2016 11:28:40 -0400 Subject: [PATCH 14/63] helper methods for debugging, cleanup of log and comments --- acdcli/acd_fuse.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 8b9c28f..25cb315 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -219,6 +219,11 @@ def __init__(self, buffer_size): self.f = tempfile.SpooledTemporaryFile(max_size=buffer_size) self.lock = Lock() + def read(self, offset, length: int): + with self.lock: + self.f.seek(offset) + return self.f.read(length) + def write(self, offset, bytes_: bytes): with self.lock: self.f.seek(0, os.SEEK_END) @@ -231,18 +236,25 @@ def write(self, offset, bytes_: bytes): return old_len def get_file(self): + """Return the file for direct access. Be sure to lock from the outside when doing so""" self.f.seek(0) return self.f def _write_and_sync(self, buffer: WriteBuffer, node_id: str): try: - r = self.acd_client.overwrite_tempfile(node_id, buffer.get_file()) + with buffer.lock: + r = self.acd_client.overwrite_tempfile(node_id, buffer.get_file()) except (RequestError, IOError) as e: logger.error('Error writing node "%s". %s' % (node_id, str(e))) else: self.cache.insert_node(r) - def write(self, node_id, fh, offset, bytes_): + def read(self, node_id, fh, offset, length: int): + b = self.buffers.get(node_id) + if b: + return b.read(offset, length) + + def write(self, node_id, fh, offset, bytes_: bytes): """Gets WriteBuffer from defaultdict. :raises: FuseOSError: wrong offset or writing failed""" @@ -271,6 +283,8 @@ def __call__(self, op, path, *args): targs = (len(args[0]),) + args[1:] elif op == 'chmod': targs = (oct(args[0]),) + args[1:] + elif op == 'setxattr': + targs = (len(args[0]),) + args[1:] logger.debug('-> %s %s %s', op, path, repr(args if not targs else targs)) @@ -284,6 +298,8 @@ def __call__(self, op, path, *args): finally: if op == 'read': ret = len(ret) + elif op == 'getxattr' and ret and ret != '[Errno 61] No data available': + ret = len(ret) logger.debug('<- %s %s', op, repr(ret)) @@ -677,9 +693,7 @@ def truncate(self, path, length, fh=None): """No good way to deal with positive lengths at the moment; since we can only do something about it in the middle of writing, this means the only use case we can - capture is when a program over-writes and then truncates back. In the future, if - we can get cached file backing instead of memory backing, there would be more to - do here. 
In the mean time we ignore.""" + capture is when a program over-writes and then truncates back.""" return 0 def release(self, path, fh): From c25a2eadc4c50b0b567504b6111decbc064ec3fc Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 21 Aug 2016 23:14:04 -0400 Subject: [PATCH 15/63] duplicity support: opportunistic reading and file size query if we're writing at the same time but have not yet flushed to amazon --- acdcli/acd_fuse.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 25cb315..19a5b0c 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -235,6 +235,11 @@ def write(self, offset, bytes_: bytes): self.f.write(bytes_) return old_len + def length(self): + with self.lock: + self.f.seek(0, os.SEEK_END) + return self.f.tell() + def get_file(self): """Return the file for direct access. Be sure to lock from the outside when doing so""" self.f.seek(0) @@ -262,6 +267,11 @@ def write(self, node_id, fh, offset, bytes_: bytes): b = self.buffers[node_id] b.write(offset, bytes_) + def length(self, node_id, fh): + b = self.buffers.get(node_id) + if b: + return b.length() + def release(self, node_id, fh): """:raises: FuseOSError""" @@ -388,6 +398,9 @@ def getattr(self, path, fh=None) -> dict: try: mtime = self._getxattr(node.id, _XATTR_MTIME_OVERRIDE_NAME) except: mtime = node.modified.timestamp() + size = self.wp.length(node.id, fh) + if not size: size = node.size + times = dict(st_atime=time(), st_mtime=mtime, st_ctime=node.created.timestamp()) @@ -399,7 +412,7 @@ def getattr(self, path, fh=None) -> dict: elif node.is_file: return dict(st_mode=stat.S_IFREG | 0o0666, st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, - st_size=node.size, + st_size=size, **times) def listxattr(self, path): @@ -505,6 +518,11 @@ def read(self, path, length, offset, fh) -> bytes: if node.size < offset + length: length = node.size - offset + """If we attempt to read something we just wrote, give it back""" + ret = self.wp.read(node.id, fh, offset, length) + if ret and len(ret) == length: + return ret + return self.rp.get(node.id, offset, length, node.size) def statfs(self, path) -> dict: From 1f2b4c698c58e1b940dd51b488845a33521fabd4 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 10 Sep 2016 23:16:41 -0500 Subject: [PATCH 16/63] borgbackup support by allowing manual flushes, deduplicated with releases --- acdcli/acd_fuse.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 19a5b0c..916a773 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -218,6 +218,7 @@ class WriteBuffer(object): def __init__(self, buffer_size): self.f = tempfile.SpooledTemporaryFile(max_size=buffer_size) self.lock = Lock() + self.dirty = True def read(self, offset, length: int): with self.lock: @@ -226,6 +227,7 @@ def read(self, offset, length: int): def write(self, offset, bytes_: bytes): with self.lock: + self.dirty = True self.f.seek(0, os.SEEK_END) old_len = self.f.tell() if offset > old_len: @@ -248,7 +250,10 @@ def get_file(self): def _write_and_sync(self, buffer: WriteBuffer, node_id: str): try: with buffer.lock: + if not buffer.dirty: + return r = self.acd_client.overwrite_tempfile(node_id, buffer.get_file()) + buffer.dirty = False except (RequestError, IOError) as e: logger.error('Error writing node "%s". 
%s' % (node_id, str(e))) else: @@ -272,9 +277,12 @@ def length(self, node_id, fh): if b: return b.length() - def release(self, node_id, fh): - """:raises: FuseOSError""" + def flush(self, node_id, fh): + b = self.buffers.get(node_id) + if b: + self._write_and_sync(b, node_id) + def release(self, node_id, fh): b = self.buffers.get(node_id) if b: self._write_and_sync(b, node_id) @@ -684,9 +692,13 @@ def write(self, path, data, offset, fh) -> int: return len(data) def flush(self, path, fh): - """noop since we need to keep the whole buffer in memory; - acd only supports sequentual writes otherwise""" - pass + if fh: + node = self.handles[fh] + else: + node = self.cache.resolve(path) + if not node: + raise FuseOSError(errno.ENOENT) + self.wp.flush(node.id, fh) def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing From d135b519bd1a92ee60e30a91a79aa944e42c44f3 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 11 Sep 2016 22:14:06 -0500 Subject: [PATCH 17/63] retry fuse operations if there's a failure on amazon's side and the return code says to retry. --- acdcli/api/content.py | 94 +++++++++++++++++++++++------------------- acdcli/api/metadata.py | 28 +++++++------ acdcli/api/trash.py | 10 +++-- 3 files changed, 73 insertions(+), 59 deletions(-) diff --git a/acdcli/api/content.py b/acdcli/api/content.py index 1218501..357d77f 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -61,57 +61,63 @@ class ContentMixin(object): """Implements content portion of the ACD API.""" def create_folder(self, name: str, parent=None) -> dict: - body = {'kind': 'FOLDER', 'name': name} - if parent: - body['parents'] = [parent] - body_str = json.dumps(body) + while True: + body = {'kind': 'FOLDER', 'name': name} + if parent: + body['parents'] = [parent] + body_str = json.dumps(body) - acc_codes = [http.CREATED] + acc_codes = [http.CREATED] - r = self.BOReq.post(self.metadata_url + 'nodes', acc_codes=acc_codes, data=body_str) + r = self.BOReq.post(self.metadata_url + 'nodes', acc_codes=acc_codes, data=body_str) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon - if r.status_code not in acc_codes: - raise RequestError(r.status_code, r.text) + if r.status_code not in acc_codes: + raise RequestError(r.status_code, r.text) - return r.json() + return r.json() def create_file(self, file_name: str, parent: str = None) -> dict: - params = {'suppress': 'deduplication'} + while True: + params = {'suppress': 'deduplication'} - basename = os.path.basename(file_name) - metadata = {'kind': 'FILE', 'name': basename} - if parent: - metadata['parents'] = [parent] - mime_type = _get_mimetype(basename) - f = io.BytesIO() + basename = os.path.basename(file_name) + metadata = {'kind': 'FILE', 'name': basename} + if parent: + metadata['parents'] = [parent] + mime_type = _get_mimetype(basename) + f = io.BytesIO() - # basename is ignored - m = MultipartEncoder(fields=OrderedDict([('metadata', json.dumps(metadata)), - ('content', (quote_plus(basename), f, mime_type))]) - ) + # basename is ignored + m = MultipartEncoder(fields=OrderedDict([('metadata', json.dumps(metadata)), + ('content', (quote_plus(basename), f, mime_type))]) + ) - ok_codes = [http.CREATED] - r = self.BOReq.post(self.content_url + 'nodes', params=params, data=m, - acc_codes=ok_codes, headers={'Content-Type': m.content_type}) + ok_codes = [http.CREATED] + r = self.BOReq.post(self.content_url + 'nodes', params=params, data=m, + acc_codes=ok_codes, 
headers={'Content-Type': m.content_type}) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon - if r.status_code not in ok_codes: - raise RequestError(r.status_code, r.text) - return r.json() + if r.status_code not in ok_codes: + raise RequestError(r.status_code, r.text) + return r.json() def clear_file(self, node_id: str) -> dict: """Clears a file's content by overwriting it with an empty BytesIO. :param node_id: valid file node ID""" - m = MultipartEncoder(fields={('content', (' ', io.BytesIO(), _get_mimetype()))}) + while True: + m = MultipartEncoder(fields={('content', (' ', io.BytesIO(), _get_mimetype()))}) - r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params={}, - data=m, stream=True, headers={'Content-Type': m.content_type}) + r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params={}, + data=m, stream=True, headers={'Content-Type': m.content_type}) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon - if r.status_code not in OK_CODES: - raise RequestError(r.status_code, r.text) + if r.status_code not in OK_CODES: + raise RequestError(r.status_code, r.text) - return r.json() + return r.json() def upload_file(self, file_name: str, parent: str = None, read_callbacks=None, deduplication=False) -> dict: @@ -214,22 +220,24 @@ def overwrite_file(self, node_id: str, file_name: str, def overwrite_tempfile(self, node_id: str, file, read_callbacks: list = None, deduplication=False) -> dict: - params = {} if deduplication else {'suppress': 'deduplication'} + while True: + params = {} if deduplication else {'suppress': 'deduplication'} - basename = "file.bin" - mime_type = _get_mimetype(basename) - f = _TeeBufferedReader(file, callbacks=read_callbacks) + basename = "file.bin" + mime_type = _get_mimetype(basename) + f = _TeeBufferedReader(file, callbacks=read_callbacks) - # basename is ignored - m = MultipartEncoder(fields={('content', (quote_plus(basename), f, mime_type))}) + # basename is ignored + m = MultipartEncoder(fields={('content', (quote_plus(basename), f, mime_type))}) - r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params, - data=m, stream=True, headers={'Content-Type': m.content_type}) + r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params, + data=m, stream=True, headers={'Content-Type': m.content_type}) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon - if r.status_code not in OK_CODES: - raise RequestError(r.status_code, r.text) + if r.status_code not in OK_CODES: + raise RequestError(r.status_code, r.text) - return r.json() + return r.json() def overwrite_stream(self, stream, node_id: str, read_callbacks: list = None) -> dict: """Overwrite content of node with ID *node_id* with content of *stream*. 
diff --git a/acdcli/api/metadata.py b/acdcli/api/metadata.py index fdfc34e..8071e9c 100644 --- a/acdcli/api/metadata.py +++ b/acdcli/api/metadata.py @@ -153,11 +153,13 @@ def get_metadata(self, node_id: str, assets=False, temp_link=True) -> dict: # this will increment the node's version attribute def update_metadata(self, node_id: str, properties: dict) -> dict: """Update a node's properties like name, description, status, parents, ...""" - body = json.dumps(properties) - r = self.BOReq.patch(self.metadata_url + 'nodes/' + node_id, data=body) - if r.status_code not in OK_CODES: - raise RequestError(r.status_code, r.text) - return r.json() + while True: + body = json.dumps(properties) + r = self.BOReq.patch(self.metadata_url + 'nodes/' + node_id, data=body) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code not in OK_CODES: + raise RequestError(r.status_code, r.text) + return r.json() def get_root_id(self) -> str: """Gets the ID of the root node @@ -249,13 +251,15 @@ def add_property(self, node_id: str, owner_id: str, key: str, value: str) -> dic :returns dict: {'key': '', 'location': '/properties/', 'value': ''}""" - ok_codes = [requests.codes.CREATED] - r = self.BOReq.put(self.metadata_url + 'nodes/' + node_id + - '/properties/' + owner_id + '/' + key, - data=json.dumps({'value': value}), acc_codes=ok_codes) - if r.status_code not in ok_codes: - raise RequestError(r.status_code, r.text) - return r.json() + while True: + ok_codes = [requests.codes.CREATED] + r = self.BOReq.put(self.metadata_url + 'nodes/' + node_id + + '/properties/' + owner_id + '/' + key, + data=json.dumps({'value': value}), acc_codes=ok_codes) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code not in ok_codes: + raise RequestError(r.status_code, r.text) + return r.json() def delete_property(self, node_id: str, owner_id: str, key: str): """Deletes *key* property from node with ID *node_id*.""" diff --git a/acdcli/api/trash.py b/acdcli/api/trash.py index 36c7186..d42cdb0 100644 --- a/acdcli/api/trash.py +++ b/acdcli/api/trash.py @@ -12,10 +12,12 @@ def list_trash(self) -> list: return self.BOReq.paginated_get(self.metadata_url + 'trash') def move_to_trash(self, node_id: str) -> dict: - r = self.BOReq.put(self.metadata_url + 'trash/' + node_id) - if r.status_code not in OK_CODES: - raise RequestError(r.status_code, r.text) - return r.json() + while True: + r = self.BOReq.put(self.metadata_url + 'trash/' + node_id) + if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code not in OK_CODES: + raise RequestError(r.status_code, r.text) + return r.json() def restore(self, node_id: str) -> dict: r = self.BOReq.post(self.metadata_url + 'trash/' + node_id + '/restore') From d867f531ee32422062dfe99f251169ad6f8dca17 Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 12 Sep 2016 19:04:39 -0500 Subject: [PATCH 18/63] clean up codes when we retry --- acdcli/api/common.py | 4 ++-- acdcli/api/content.py | 17 +++++++++-------- acdcli/api/metadata.py | 6 +++--- acdcli/api/trash.py | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/acdcli/api/common.py b/acdcli/api/common.py index 0a4d71c..527f729 100644 --- a/acdcli/api/common.py +++ b/acdcli/api/common.py @@ -1,6 +1,6 @@ -import requests import re +import requests from requests.exceptions import ConnectionError try: @@ -14,7 +14,7 @@ class ReadTimeoutError(Exception): # status codes that indicate request success OK_CODES = 
[requests.codes.OK] - +RETRY_CODES = [requests.codes.server_error, requests.codes.gateway_timeout] class RequestError(Exception): """Catch-all exception class for various connection and ACD server errors.""" diff --git a/acdcli/api/content.py b/acdcli/api/content.py index 357d77f..283f5a9 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -1,12 +1,12 @@ import http.client as http -import os -import json import io +import json +import logging import mimetypes -import tempfile +import os from collections import OrderedDict -import logging from urllib.parse import quote_plus + from requests import Response from requests_toolbelt import MultipartEncoder @@ -70,7 +70,7 @@ def create_folder(self, name: str, parent=None) -> dict: acc_codes = [http.CREATED] r = self.BOReq.post(self.metadata_url + 'nodes', acc_codes=acc_codes, data=body_str) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in acc_codes: raise RequestError(r.status_code, r.text) @@ -96,7 +96,7 @@ def create_file(self, file_name: str, parent: str = None) -> dict: ok_codes = [http.CREATED] r = self.BOReq.post(self.content_url + 'nodes', params=params, data=m, acc_codes=ok_codes, headers={'Content-Type': m.content_type}) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in ok_codes: raise RequestError(r.status_code, r.text) @@ -112,7 +112,7 @@ def clear_file(self, node_id: str) -> dict: r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params={}, data=m, stream=True, headers={'Content-Type': m.content_type}) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in OK_CODES: raise RequestError(r.status_code, r.text) @@ -221,6 +221,7 @@ def overwrite_file(self, node_id: str, file_name: str, def overwrite_tempfile(self, node_id: str, file, read_callbacks: list = None, deduplication=False) -> dict: while True: + file.seek(0) params = {} if deduplication else {'suppress': 'deduplication'} basename = "file.bin" @@ -232,7 +233,7 @@ def overwrite_tempfile(self, node_id: str, file, r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params, data=m, stream=True, headers={'Content-Type': m.content_type}) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in OK_CODES: raise RequestError(r.status_code, r.text) diff --git a/acdcli/api/metadata.py b/acdcli/api/metadata.py index 8071e9c..8ee60f6 100644 --- a/acdcli/api/metadata.py +++ b/acdcli/api/metadata.py @@ -1,8 +1,8 @@ """Node metadata operations""" +import http.client import json import logging -import http.client import tempfile from collections import namedtuple @@ -156,7 +156,7 @@ def update_metadata(self, node_id: str, properties: dict) -> dict: while True: body = json.dumps(properties) r = self.BOReq.patch(self.metadata_url + 'nodes/' + node_id, data=body) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code 
not in OK_CODES: raise RequestError(r.status_code, r.text) return r.json() @@ -256,7 +256,7 @@ def add_property(self, node_id: str, owner_id: str, key: str, value: str) -> dic r = self.BOReq.put(self.metadata_url + 'nodes/' + node_id + '/properties/' + owner_id + '/' + key, data=json.dumps({'value': value}), acc_codes=ok_codes) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in ok_codes: raise RequestError(r.status_code, r.text) return r.json() diff --git a/acdcli/api/trash.py b/acdcli/api/trash.py index d42cdb0..904f203 100644 --- a/acdcli/api/trash.py +++ b/acdcli/api/trash.py @@ -14,7 +14,7 @@ def list_trash(self) -> list: def move_to_trash(self, node_id: str) -> dict: while True: r = self.BOReq.put(self.metadata_url + 'trash/' + node_id) - if r.status_code == 500: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in OK_CODES: raise RequestError(r.status_code, r.text) return r.json() From d7bc91748d5f30e0dc235565ca469bb4fc2b1c48 Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 12 Sep 2016 22:07:53 -0500 Subject: [PATCH 19/63] Amazon has any number of ways to make file operations fail. --- acdcli/api/content.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/acdcli/api/content.py b/acdcli/api/content.py index 283f5a9..4f31526 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -377,19 +377,21 @@ def chunked_download(self, node_id: str, file: io.BufferedWriter, **kwargs): return def response_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> Response: - ok_codes = [http.PARTIAL_CONTENT] - end = offset + length - 1 - logger.debug('chunk o %d l %d' % (offset, length)) - - r = self.BOReq.get(self.content_url + 'nodes/' + node_id + '/content', - acc_codes=ok_codes, stream=True, - headers={'Range': 'bytes=%d-%d' % (offset, end)}, **kwargs) - # if r.status_code == http.REQUESTED_RANGE_NOT_SATISFIABLE: - # return - if r.status_code not in ok_codes: - raise RequestError(r.status_code, r.text) - - return r + while True: + ok_codes = [http.PARTIAL_CONTENT] + retry_codes = [1000] + end = offset + length - 1 + logger.debug('chunk o %d l %d' % (offset, length)) + + r = self.BOReq.get(self.content_url + 'nodes/' + node_id + '/content', + acc_codes=ok_codes, stream=True, + headers={'Range': 'bytes=%d-%d' % (offset, end)}, **kwargs) + # if r.status_code == http.REQUESTED_RANGE_NOT_SATISFIABLE: + # return + if r.status_code in retry_codes: continue # the fault lies not in our stars, but in amazon + if r.status_code not in ok_codes: + raise RequestError(r.status_code, r.text) + return r def download_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> bytearray: """Load a file chunk into memory. 
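The retry idiom that patches 17 through 22 converge on is the same at every call site; a condensed sketch follows (not part of the series; OK_CODES, RETRY_CODES and RequestError are the names defined in acdcli/api/common.py, and the helper name _request_with_retry is illustrative only):

    from acdcli.api.common import OK_CODES, RETRY_CODES, RequestError

    def _request_with_retry(do_request, ok_codes=OK_CODES):
        """Re-issue a request while Amazon reports a transient failure."""
        while True:
            try:
                r = do_request()  # e.g. lambda: BOReq.put(url, data=m, stream=True)
            except RequestError as e:
                if e.status_code == RequestError.CODE.CONN_EXCEPTION:
                    continue  # dropped connection: just try again
                raise
            if r.status_code in RETRY_CODES:
                continue  # the fault lies not in our stars, but in amazon
            if r.status_code not in ok_codes:
                raise RequestError(r.status_code, r.text)
            return r

Each call site then differs only in the request it passes in and in whether it returns r or r.json().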
From f4121e894e024339511e17e1a0d63a0d6eaa05b6 Mon Sep 17 00:00:00 2001
From: Ben
Date: Wed, 14 Sep 2016 07:53:39 -0500
Subject: [PATCH 20/63] turns out the 1000 response is thrown rather than returned

---
 acdcli/api/content.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/acdcli/api/content.py b/acdcli/api/content.py
index 4f31526..a943929 100644
--- a/acdcli/api/content.py
+++ b/acdcli/api/content.py
@@ -379,13 +379,17 @@ def chunked_download(self, node_id: str, file: io.BufferedWriter, **kwargs):
     def response_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> Response:
         while True:
             ok_codes = [http.PARTIAL_CONTENT]
-            retry_codes = [1000]
+            retry_codes = [400]
             end = offset + length - 1
             logger.debug('chunk o %d l %d' % (offset, length))
 
-            r = self.BOReq.get(self.content_url + 'nodes/' + node_id + '/content',
-                               acc_codes=ok_codes, stream=True,
-                               headers={'Range': 'bytes=%d-%d' % (offset, end)}, **kwargs)
+            try:
+                r = self.BOReq.get(self.content_url + 'nodes/' + node_id + '/content',
+                                   acc_codes=ok_codes, stream=True,
+                                   headers={'Range': 'bytes=%d-%d' % (offset, end)}, **kwargs)
+            except RequestError as e:
+                if e.status_code == RequestError.CODE.CONN_EXCEPTION: continue
+                raise
             # if r.status_code == http.REQUESTED_RANGE_NOT_SATISFIABLE:
             #     return
             if r.status_code in retry_codes: continue # the fault lies not in our stars, but in amazon

From 18671ea0c190df1655ed5d22f6415d40c42f6d0d Mon Sep 17 00:00:00 2001
From: Ben Gemmill
Date: Sun, 6 Nov 2016 23:34:40 -0600
Subject: [PATCH 21/63] we can get 1000 error codes on writing too.

---
 acdcli/api/content.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/acdcli/api/content.py b/acdcli/api/content.py
index a943929..cc19721 100644
--- a/acdcli/api/content.py
+++ b/acdcli/api/content.py
@@ -231,8 +231,13 @@ def overwrite_tempfile(self, node_id: str, file,
             # basename is ignored
             m = MultipartEncoder(fields={('content', (quote_plus(basename), f, mime_type))})
 
-            r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params,
-                               data=m, stream=True, headers={'Content-Type': m.content_type})
+            try:
+                r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params,
+                                   data=m, stream=True, headers={'Content-Type': m.content_type})
+            except RequestError as e:
+                if e.status_code == RequestError.CODE.CONN_EXCEPTION: continue
+                raise
+
             if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon
 
             if r.status_code not in OK_CODES:

From 58c6bd47c40e68478a88c9cb09db635a273a4187 Mon Sep 17 00:00:00 2001
From: Ben Gemmill
Date: Tue, 8 Nov 2016 20:28:16 -0600
Subject: [PATCH 22/63] handle more error codes when reading, and generally retry on 400s.
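For reference, a small sketch (not part of the patch) of what the symbolic names below resolve to in requests' status-code registry:

    import requests

    RETRY_CODES = [requests.codes.server_error,     # 500
                   requests.codes.gateway_timeout,  # 504
                   requests.codes.bad_request]      # 400
    # the follow-up commit (PATCH 23) adds requests.codes.service_unavailable (503)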
--- acdcli/api/common.py | 2 +- acdcli/api/content.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/acdcli/api/common.py b/acdcli/api/common.py index 527f729..1512235 100644 --- a/acdcli/api/common.py +++ b/acdcli/api/common.py @@ -14,7 +14,7 @@ class ReadTimeoutError(Exception): # status codes that indicate request success OK_CODES = [requests.codes.OK] -RETRY_CODES = [requests.codes.server_error, requests.codes.gateway_timeout] +RETRY_CODES = [requests.codes.server_error, requests.codes.gateway_timeout, requests.codes.bad_request] class RequestError(Exception): """Catch-all exception class for various connection and ACD server errors.""" diff --git a/acdcli/api/content.py b/acdcli/api/content.py index cc19721..e9be28c 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -384,7 +384,6 @@ def chunked_download(self, node_id: str, file: io.BufferedWriter, **kwargs): def response_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> Response: while True: ok_codes = [http.PARTIAL_CONTENT] - retry_codes = [400] end = offset + length - 1 logger.debug('chunk o %d l %d' % (offset, length)) @@ -397,7 +396,7 @@ def response_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> Re raise # if r.status_code == http.REQUESTED_RANGE_NOT_SATISFIABLE: # return - if r.status_code in retry_codes: continue # the fault lies not in our stars, but in amazon + if r.status_code in RETRY_CODES: continue # the fault lies not in our stars, but in amazon if r.status_code not in ok_codes: raise RequestError(r.status_code, r.text) return r From c1f58a497751f6464c05080f645884b3b0ceadd4 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Thu, 1 Dec 2016 23:17:28 -0500 Subject: [PATCH 23/63] handle 503 codes when amazon doesn't get a request body --- acdcli/api/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/acdcli/api/common.py b/acdcli/api/common.py index 1512235..b0ddd53 100644 --- a/acdcli/api/common.py +++ b/acdcli/api/common.py @@ -14,7 +14,10 @@ class ReadTimeoutError(Exception): # status codes that indicate request success OK_CODES = [requests.codes.OK] -RETRY_CODES = [requests.codes.server_error, requests.codes.gateway_timeout, requests.codes.bad_request] +RETRY_CODES = [requests.codes.server_error, + requests.codes.gateway_timeout, + requests.codes.bad_request, + requests.codes.service_unavailable] class RequestError(Exception): """Catch-all exception class for various connection and ACD server errors.""" From 5124699d9a1dfb6126d66f72ed681827a91e26c0 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 25 Dec 2016 21:48:44 -0500 Subject: [PATCH 24/63] verify that file renames and moves happen since amazon can drop some despite returning success --- acdcli/api/metadata.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/acdcli/api/metadata.py b/acdcli/api/metadata.py index 4850ee8..1dd0c7d 100644 --- a/acdcli/api/metadata.py +++ b/acdcli/api/metadata.py @@ -227,11 +227,24 @@ def move_node_from(self, node_id: str, old_parent_id: str, new_parent_id: str) - return r.json() def move_node(self, node_id: str, parent_id: str) -> dict: - return self.update_metadata(node_id, {'parents': [parent_id]}) + properties = {'parents': [parent_id]} + # logger.debug('MOVE: parents: %s' % str([parent_id])) + while True: + ret = self.update_metadata(node_id, properties) + metadata = self.get_metadata(node_id, False, False) + # logger.debug('MOVE: metadata: %s' % str(metadata)) + if metadata['parents'] == [parent_id]: break 
+ return ret def rename_node(self, node_id: str, new_name: str) -> dict: properties = {'name': new_name} - return self.update_metadata(node_id, properties) + # logger.debug('RENAME: new_name: %s' % new_name) + while True: + ret = self.update_metadata(node_id, properties) + metadata = self.get_metadata(node_id, False, False) + # logger.debug('RENAME: metadata: %s' % str(metadata)) + if metadata['name'] == new_name: break + return ret def set_available(self, node_id: str) -> dict: """Sets node status from 'PENDING' to 'AVAILABLE'.""" From 6fb1a06016682049b87d2368434013a08006728d Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 7 Jan 2017 23:57:18 -0500 Subject: [PATCH 25/63] move tempfile uploading to multipart streams to see if that helps with large uploads --- acdcli/api/content.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/acdcli/api/content.py b/acdcli/api/content.py index ef81f85..565227a 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -217,21 +217,27 @@ def overwrite_file(self, node_id: str, file_name: str, return r.json() def overwrite_tempfile(self, node_id: str, file, - read_callbacks: list = None, deduplication=False) -> dict: + read_callbacks: list = None) -> dict: + """Overwrite content of node with ID *node_id* with content of *file*. + + :param file: readable and seekable object""" + while True: + # logger.debug('OVERWRITE: node_id: %s' % node_id) file.seek(0) - params = {} if deduplication else {'suppress': 'deduplication'} - basename = "file.bin" - mime_type = _get_mimetype(basename) - f = _TeeBufferedReader(file, callbacks=read_callbacks) + if _stream_is_empty(file): + return self.clear_file(node_id) - # basename is ignored - m = MultipartEncoder(fields={('content', (quote_plus(basename), f, mime_type))}) + metadata = {} + import uuid + boundary = uuid.uuid4().hex try: - r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', params=params, - data=m, stream=True, headers={'Content-Type': m.content_type}) + r = self.BOReq.put(self.content_url + 'nodes/' + node_id + '/content', + data=self._multipart_stream(metadata, file, boundary, read_callbacks), + headers={'Content-Type': 'multipart/form-data; boundary=%s' + % boundary}) except RequestError as e: if e.status_code == RequestError.CODE.CONN_EXCEPTION: continue raise From 09289761726ce127159e52946ca7a94988b45882 Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 8 Jan 2017 18:08:26 -0500 Subject: [PATCH 26/63] extra logging around renames to catch when amazon drops rename requests --- acdcli/acd_fuse.py | 13 +++++++++++++ acdcli/api/metadata.py | 14 ++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 79ec005..8087c17 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -17,6 +17,8 @@ import ctypes.util import binascii +import requests + from acdcli.cache.db import CacheConsts ctypes.util.__find_library = ctypes.util.find_library @@ -605,6 +607,17 @@ def create(self, path, mode) -> int: self.cache.insert_node(r) node = self.cache.get_node(r['id']) except RequestError as e: + # file all ready exists, see what we know about it since the + # cache may be out of sync or amazon missed a rename + if e.status_code == requests.codes.conflict: + prior_node_id = json.loads(e.msg)["info"]["nodeId"] + logger.error('create: duplicate name: %s prior_node_id: %s' % (name, prior_node_id)) + prior_node_amazon = self.acd_client.get_metadata(prior_node_id, False, False) + logger.error('create: 
prior_node(amazon): %s' % str(prior_node_amazon)) + prior_node_cache = self.cache.get_node(prior_node_id) + logger.error('create: prior_node(cache): %s' % str(prior_node_cache)) + # if prior_node_cache.name != prior_node_amazon["name"]: + # self._rename(prior_node_id, prior_node_cache.name) FuseOSError.convert(e) with self.fh_lock: diff --git a/acdcli/api/metadata.py b/acdcli/api/metadata.py index 1dd0c7d..7645473 100644 --- a/acdcli/api/metadata.py +++ b/acdcli/api/metadata.py @@ -228,22 +228,20 @@ def move_node_from(self, node_id: str, old_parent_id: str, new_parent_id: str) - def move_node(self, node_id: str, parent_id: str) -> dict: properties = {'parents': [parent_id]} - # logger.debug('MOVE: parents: %s' % str([parent_id])) + logger.debug('move_node: node_id: %s parents: %s' % (node_id, str([parent_id]))) while True: ret = self.update_metadata(node_id, properties) - metadata = self.get_metadata(node_id, False, False) - # logger.debug('MOVE: metadata: %s' % str(metadata)) - if metadata['parents'] == [parent_id]: break + logger.debug('move_node: metadata: %s' % str(ret)) + if ret['parents'] == [parent_id]: break return ret def rename_node(self, node_id: str, new_name: str) -> dict: properties = {'name': new_name} - # logger.debug('RENAME: new_name: %s' % new_name) + logger.debug('rename_node: node_id: %s new_name: %s' % (node_id, new_name)) while True: ret = self.update_metadata(node_id, properties) - metadata = self.get_metadata(node_id, False, False) - # logger.debug('RENAME: metadata: %s' % str(metadata)) - if metadata['name'] == new_name: break + logger.debug('rename_node: metadata: %s' % str(ret)) + if ret['name'] == new_name: break return ret def set_available(self, node_id: str) -> dict: From 6843dc19b4b3b73cea13644e2612c0935f6b6709 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 11 Jan 2017 21:02:30 -0500 Subject: [PATCH 27/63] mode, uid, and gid --- acd_cli.py | 7 +++-- acdcli/acd_fuse.py | 66 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/acd_cli.py b/acd_cli.py index ca76841..47d4156 100755 --- a/acd_cli.py +++ b/acd_cli.py @@ -1147,11 +1147,14 @@ def mount_action(args: argparse.Namespace): import acdcli.acd_fuse acdcli.acd_fuse.mount(args.path, dict(acd_client=acd_client, cache=cache, nlinks=args.nlinks, autosync=asp, - settings_path=SETTINGS_PATH), + settings_path=SETTINGS_PATH, + umask = args.umask, + uid = args.uid, + gid = args.gid + ), ro=args.read_only, foreground=args.foreground, nothreads=args.single_threaded, nonempty=args.nonempty, modules=args.modules, - umask=args.umask,gid=args.gid,uid=args.uid, allow_root=args.allow_root, allow_other=args.allow_other, volname=args.volname) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 8087c17..93eff1a 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -52,6 +52,9 @@ def find_library(*args): _SETTINGS_FILENAME = 'fuse.ini' _XATTR_PROPERTY_NAME = 'xattrs' _XATTR_MTIME_OVERRIDE_NAME = 'fuse.mtime' +_XATTR_MODE_OVERRIDE_NAME = 'fuse.mode' +_XATTR_UID_OVERRIDE_NAME = 'fuse.uid' +_XATTR_GID_OVERRIDE_NAME = 'fuse.gid' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -369,6 +372,12 @@ def __init__(self, **kwargs): """lock for fh counter increment and handle dict writes""" self.nlinks = kwargs.get('nlinks', False) """whether to calculate the number of hardlinks for folders""" + self.uid = kwargs['uid'] + """sets the default uid""" + self.gid = kwargs['gid'] + """sets the default gid""" + self.umask = 
kwargs['umask'] + """sets the default umask""" self.destroyed = autosync.keywords['stop'] """:type: multiprocessing.Event""" @@ -411,19 +420,30 @@ def getattr(self, path, fh=None) -> dict: size = self.wp.length(node.id, fh) if not size: size = node.size - times = dict(st_atime=time(), + try: uid = self._getxattr(node.id, _XATTR_UID_OVERRIDE_NAME) + except: uid = self.uid + + try: gid = self._getxattr(node.id, _XATTR_GID_OVERRIDE_NAME) + except: gid = self.gid + + attrs = dict(st_atime=time(), st_mtime=mtime, - st_ctime=node.created.timestamp()) + st_ctime=node.created.timestamp(), + st_uid=uid, + st_gid=gid) + + try: mode = stat.S_IMODE(self._getxattr(node.id, _XATTR_MODE_OVERRIDE_NAME)) + except: mode = None if node.is_folder: - return dict(st_mode=stat.S_IFDIR | 0o0777, + return dict(st_mode=stat.S_IFDIR | (mode if mode else 0o0777 & ~self.umask), st_nlink=self.cache.num_children(node.id) if self.nlinks else 1, - **times) + **attrs) elif node.is_file: - return dict(st_mode=stat.S_IFREG | 0o0666, + return dict(st_mode=stat.S_IFREG | (mode if mode else 0o0666 & ~self.umask), st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, st_size=size, - **times) + **attrs) def listxattr(self, path): node = self.cache.resolve(path) @@ -450,7 +470,7 @@ def _getxattr(self, node_id, name): with self.xattr_cache_lock: try: ret = self.xattr_cache[node_id][name] - if ret: + if ret is not None: return ret except: raise FuseOSError(errno.ENODATA) # should be ENOATTR @@ -548,9 +568,7 @@ def statfs(self, path) -> dict: ) def mkdir(self, path, mode): - """Creates a directory at ``path`` (see :manpage:`mkdir(2)`). - - :param mode: not used""" + """Creates a directory at ``path`` (see :manpage:`mkdir(2)`).""" name = os.path.basename(path) ppath = os.path.dirname(path) @@ -564,6 +582,8 @@ def mkdir(self, path, mode): FuseOSError.convert(e) else: self.cache.insert_node(r) + node = self.cache.get_node(r['id']) + self._chmod(node, mode) def _trash(self, path): logger.debug('trash %s' % path) @@ -593,7 +613,6 @@ def unlink(self, path): def create(self, path, mode) -> int: """Creates an empty file at ``path``. 
- :param mode: not used :returns int: file handle""" name = os.path.basename(path) @@ -620,6 +639,8 @@ def create(self, path, mode) -> int: # self._rename(prior_node_id, prior_node_cache.name) FuseOSError.convert(e) + self._chmod(node, mode) + with self.fh_lock: self.fh += 1 self.handles[self.fh] = node @@ -791,12 +812,27 @@ def utimens(self, path, times=None): return 0 def chmod(self, path, mode): - """Not implemented.""" - pass + node = self.cache.resolve(path) + if not node: + raise FuseOSError(errno.ENOENT) + return self._chmod(node, mode) + + def _chmod(self, node, mode): + self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, mode) + self._xattr_write_and_sync() + return 0 def chown(self, path, uid, gid): - """Not implemented.""" - pass + node = self.cache.resolve(path) + if not node: + raise FuseOSError(errno.ENOENT) + return self._chown(node, uid, gid) + + def _chown(self, node, uid, gid): + if uid != -1: self._setxattr(node.id, _XATTR_UID_OVERRIDE_NAME, uid) + if gid != -1: self._setxattr(node.id, _XATTR_GID_OVERRIDE_NAME, gid) + self._xattr_write_and_sync() + return 0 def mount(path: str, args: dict, **kwargs) -> 'Union[int, None]': From cf3cbb721c9d3406303bbbf6432fa187c0fe888f Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 11 Jan 2017 21:19:19 -0500 Subject: [PATCH 28/63] blksize, blocks --- acdcli/acd_fuse.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 93eff1a..af85b46 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -378,6 +378,8 @@ def __init__(self, **kwargs): """sets the default gid""" self.umask = kwargs['umask'] """sets the default umask""" + self.blksize = self.acd_client._conf.getint('transfer', 'fs_chunk_size') + """size of the filesystem blocks for stat queries""" self.destroyed = autosync.keywords['stop'] """:type: multiprocessing.Event""" @@ -443,6 +445,8 @@ def getattr(self, path, fh=None) -> dict: return dict(st_mode=stat.S_IFREG | (mode if mode else 0o0666 & ~self.umask), st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, st_size=size, + st_blksize=self.blksize, + st_blocks=(node.size + 511) // 512, **attrs) def listxattr(self, path): @@ -558,12 +562,11 @@ def read(self, path, length, offset, fh) -> bytes: def statfs(self, path) -> dict: """Gets some filesystem statistics as specified in :manpage:`stat(2)`.""" - bs = 512 * 1024 # no effect? 
- return dict(f_bsize=bs, - f_frsize=bs, - f_blocks=self.total // bs, # total no of blocks - f_bfree=self.free // bs, # free blocks - f_bavail=self.free // bs, + return dict(f_bsize=self.blksize, + f_frsize=self.blksize, + f_blocks=self.total // self.blksize, # total no of blocks + f_bfree=self.free // self.blksize, # free blocks + f_bavail=self.free // self.blksize, f_namemax=256 ) From be9a0a206253e9f5ef4ce14a31cd570e0a2962ad Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 18 Jan 2017 21:45:27 -0500 Subject: [PATCH 29/63] symlink, readlink --- acdcli/acd_fuse.py | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index af85b46..9f5232a 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -415,7 +415,9 @@ def getattr(self, path, fh=None) -> dict: node = self.cache.resolve(path) if not node: raise FuseOSError(errno.ENOENT) + return self._getattr(node, fh) + def _getattr(self, node, fh=None) -> dict: try: mtime = self._getxattr(node.id, _XATTR_MTIME_OVERRIDE_NAME) except: mtime = node.modified.timestamp() @@ -434,15 +436,23 @@ def getattr(self, path, fh=None) -> dict: st_uid=uid, st_gid=gid) - try: mode = stat.S_IMODE(self._getxattr(node.id, _XATTR_MODE_OVERRIDE_NAME)) + try: mode = self._getxattr(node.id, _XATTR_MODE_OVERRIDE_NAME) except: mode = None if node.is_folder: - return dict(st_mode=stat.S_IFDIR | (mode if mode else 0o0777 & ~self.umask), + # directory + mode = stat.S_IFDIR | (stat.S_IMODE(mode) if mode else 0o0777 & ~self.umask) + + return dict(st_mode=mode, st_nlink=self.cache.num_children(node.id) if self.nlinks else 1, **attrs) elif node.is_file: - return dict(st_mode=stat.S_IFREG | (mode if mode else 0o0666 & ~self.umask), + # symlink + if mode and stat.S_ISLNK(stat.S_IFMT(mode)): mode = stat.S_IFLNK | 0o0777 + # file + else: mode = stat.S_IFREG | (stat.S_IMODE(mode) if mode else 0o0666 & ~self.umask) + + return dict(st_mode=mode, st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, st_size=size, st_blksize=self.blksize, @@ -536,7 +546,7 @@ def _xattr_write_and_sync(self): logger.debug('_xattr_write_and_sync: node: %s xattrs: %s: ' % (node_id, xattrs_str)) self.xattr_dirty.clear() - def read(self, path, length, offset, fh) -> bytes: + def read(self, path, length, offset, fh=None) -> bytes: """Read ```length`` bytes from ``path`` at ``offset``.""" if fh: @@ -642,7 +652,8 @@ def create(self, path, mode) -> int: # self._rename(prior_node_id, prior_node_cache.name) FuseOSError.convert(e) - self._chmod(node, mode) + if mode is not None: + self._chmod(node, mode) with self.fh_lock: self.fh += 1 @@ -821,7 +832,9 @@ def chmod(self, path, mode): return self._chmod(node, mode) def _chmod(self, node, mode): - self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, mode) + mode_perms = stat.S_IMODE(mode) + mode_type = stat.S_IFMT(self._getattr(node)['st_mode']) + self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, mode_type | mode_perms) self._xattr_write_and_sync() return 0 @@ -837,6 +850,23 @@ def _chown(self, node, uid, gid): self._xattr_write_and_sync() return 0 + def symlink(self, target, source): + fh = self.create(target, None) + node = self.handles[fh] + self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) + # While it may be tempting to store the link's source in xattr space, note that encrypting file + # systems like gocryptfs pass xattrs straight through to the native file system; so amazon would + # have a look at unencrypted file names via links. 
So we must place this in the contents. + #TODO: have a cache of node -> link source somewhere in sql and memory so we don't need to read from amazon + self.write(target, source.encode('utf-8'), 0, fh) + self.release(target, fh) + return 0 + + def readlink(self, path): + attr = self.getattr(path) + source = self.read(path, attr['st_size'], 0).decode('utf-8') + return source + def mount(path: str, args: dict, **kwargs) -> 'Union[int, None]': """Fusermounts Amazon Cloud Drive to specified mountpoint. From b2b6d4bfbd8d92e0e15b27ca07341cef44af7d9c Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 18 Jan 2017 22:27:41 -0500 Subject: [PATCH 30/63] caching of symlink targts --- acdcli/acd_fuse.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 9f5232a..026a530 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -55,6 +55,7 @@ def find_library(*args): _XATTR_MODE_OVERRIDE_NAME = 'fuse.mode' _XATTR_UID_OVERRIDE_NAME = 'fuse.uid' _XATTR_GID_OVERRIDE_NAME = 'fuse.gid' +_XATTR_SYMLINK_OVERRIDE_NAME = 'fuse.symlink' _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5) @@ -854,17 +855,15 @@ def symlink(self, target, source): fh = self.create(target, None) node = self.handles[fh] self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) - # While it may be tempting to store the link's source in xattr space, note that encrypting file - # systems like gocryptfs pass xattrs straight through to the native file system; so amazon would - # have a look at unencrypted file names via links. So we must place this in the contents. - #TODO: have a cache of node -> link source somewhere in sql and memory so we don't need to read from amazon - self.write(target, source.encode('utf-8'), 0, fh) + self._setxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME, source) self.release(target, fh) return 0 def readlink(self, path): - attr = self.getattr(path) - source = self.read(path, attr['st_size'], 0).decode('utf-8') + node = self.cache.resolve(path) + if not node: + raise FuseOSError(errno.ENOENT) + source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) return source From 879d652072229b05dfc2519e5b474ca395c2a4d7 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 21 Jan 2017 00:41:06 -0500 Subject: [PATCH 31/63] invalidate the path resolving cache less to help with rsync --partial file moves --- acdcli/acd_fuse.py | 29 ++++++++++++++++++----------- acdcli/cache/sync.py | 22 +++++++++++++++------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 026a530..4f76370 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -263,7 +263,7 @@ def _write_and_sync(self, buffer: WriteBuffer, node_id: str): except (RequestError, IOError) as e: logger.error('Error writing node "%s". 
%s' % (node_id, str(e))) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=False) def read(self, node_id, fh, offset, length: int): b = self.buffers.get(node_id) @@ -595,7 +595,7 @@ def mkdir(self, path, mode): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=False) node = self.cache.get_node(r['id']) self._chmod(node, mode) @@ -614,7 +614,11 @@ def _trash(self, path): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + if node.is_file: + self.cache.insert_node(r, flush_cache=False) + self.cache.cache_del(path) + else: + self.cache.insert_node(r) def rmdir(self, path): """Moves a directory into ACD trash.""" @@ -637,7 +641,7 @@ def create(self, path, mode) -> int: try: r = self.acd_client.create_file(name, p.id) - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=False) node = self.cache.get_node(r['id']) except RequestError as e: # file all ready exists, see what we know about it since the @@ -688,30 +692,33 @@ def rename(self, old, new): raise FuseOSError(errno.EEXIST) if new_bn != old_bn: - self._rename(node.id, new_bn) + self._rename(node.id, new_bn, not node.is_file) if new_dn != old_dn: # odir_id = self.cache.resolve_path(old_dn, False) ndir = self.cache.resolve(new_dn, False) if not ndir: raise FuseOSError(errno.ENOTDIR) - self._move(node.id, ndir.id) + self._move(node.id, ndir.id, not node.is_file) - def _rename(self, id, name): + if node.is_file: + self.cache.cache_del(old) + + def _rename(self, id, name, flush_cache:bool=True): try: r = self.acd_client.rename_node(id, name) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=flush_cache) - def _move(self, id, new_folder): + def _move(self, id, new_folder, flush_cache:bool=True): try: r = self.acd_client.move_node(id, new_folder) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=flush_cache) def open(self, path, flags) -> int: """Opens a file. @@ -768,7 +775,7 @@ def truncate(self, path, length, fh=None): except RequestError as e: raise FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_cache=False) """No good way to deal with positive lengths at the moment; since we can only do something about it in the middle of writing, this means the only use case we can diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index daacc90..c473d45 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -47,13 +47,21 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' 
% len(purged)) - def insert_nodes(self, nodes: list, partial=True): - """Inserts mixed list of files and folders into cache.""" - - """Flush the path cache since these new nodes may be deletes, moves, or renames - that affect the path cache, or overwrites that would invalidate the data in it.""" + def cache_flush(self): with self.path_to_node_cache_lock: self.path_to_node_cache.clear() + logger.warning("path_to_node_cache flushed") + + def cache_del(self, path:str): + with self.path_to_node_cache_lock: + try: del self.path_to_node_cache[path] + except: pass + + def insert_nodes(self, nodes: list, partial:bool=True, flush_cache:bool=True): + """Inserts mixed list of files and folders into cache.""" + + if flush_cache: + self.cache_flush() files = [] folders = [] @@ -81,11 +89,11 @@ def insert_nodes(self, nodes: list, partial=True): self.insert_parentage(files + folders, partial) self.insert_properties(files + folders) - def insert_node(self, node: dict): + def insert_node(self, node:dict, flush_cache:bool=True): """Inserts single file or folder into cache.""" if not node: return - self.insert_nodes([node]) + self.insert_nodes([node], flush_cache=flush_cache) def insert_folders(self, folders: list): """ Inserts list of folders into cache. Sets 'update' column to current date. From bb1bec8f2847eff5360db70202dc02ae79f76484 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 21 Jan 2017 17:42:16 -0500 Subject: [PATCH 32/63] clean log spam --- acdcli/api/content.py | 8 +++++--- acdcli/cache/sync.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/acdcli/api/content.py b/acdcli/api/content.py index 565227a..e31c455 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -222,13 +222,15 @@ def overwrite_tempfile(self, node_id: str, file, :param file: readable and seekable object""" + # If we're writing 0 bytes, clear instead + file.seek(0, os.SEEK_END) + if file.tell() == 0: + return self.clear_file(node_id) + while True: # logger.debug('OVERWRITE: node_id: %s' % node_id) file.seek(0) - if _stream_is_empty(file): - return self.clear_file(node_id) - metadata = {} import uuid boundary = uuid.uuid4().hex diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index c473d45..f7d9317 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -50,7 +50,6 @@ def remove_purged(self, purged: list): def cache_flush(self): with self.path_to_node_cache_lock: self.path_to_node_cache.clear() - logger.warning("path_to_node_cache flushed") def cache_del(self, path:str): with self.path_to_node_cache_lock: From 1ef5657d8c5192c81dc4bde8a1f02c74951b7bce Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sun, 22 Jan 2017 12:07:07 -0500 Subject: [PATCH 33/63] add elements to the resolve cache on creation --- acdcli/acd_fuse.py | 11 +++++------ acdcli/cache/query.py | 4 ++++ acdcli/cache/sync.py | 6 +++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 4f76370..1c4d21b 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -597,6 +597,7 @@ def mkdir(self, path, mode): else: self.cache.insert_node(r, flush_cache=False) node = self.cache.get_node(r['id']) + self.cache.resolve_cache_add(path, node) self._chmod(node, mode) def _trash(self, path): @@ -614,11 +615,8 @@ def _trash(self, path): except RequestError as e: FuseOSError.convert(e) else: - if node.is_file: - self.cache.insert_node(r, flush_cache=False) - self.cache.cache_del(path) - else: - self.cache.insert_node(r) + self.cache.insert_node(r, not node.is_file) + 
self.cache.resolve_cache_del(path) def rmdir(self, path): """Moves a directory into ACD trash.""" @@ -643,6 +641,7 @@ def create(self, path, mode) -> int: r = self.acd_client.create_file(name, p.id) self.cache.insert_node(r, flush_cache=False) node = self.cache.get_node(r['id']) + self.cache.resolve_cache_add(path, node) except RequestError as e: # file all ready exists, see what we know about it since the # cache may be out of sync or amazon missed a rename @@ -702,7 +701,7 @@ def rename(self, old, new): self._move(node.id, ndir.id, not node.is_file) if node.is_file: - self.cache.cache_del(old) + self.cache.resolve_cache_del(old) def _rename(self, id, name, flush_cache:bool=True): try: diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 795e37f..487982c 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -142,6 +142,10 @@ def simple_name(self): class QueryMixin(object): + def resolve_cache_add(self, path:str, node:Node): + with self.path_to_node_cache_lock: + self.path_to_node_cache[path] = node + def get_node(self, id) -> 'Union[Node|None]': with cursor(self._conn) as c: c.execute(NODE_BY_ID_SQL, [id]) diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index f7d9317..c2a3f6d 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -47,11 +47,11 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' % len(purged)) - def cache_flush(self): + def resolve_cache_flush(self): with self.path_to_node_cache_lock: self.path_to_node_cache.clear() - def cache_del(self, path:str): + def resolve_cache_del(self, path:str): with self.path_to_node_cache_lock: try: del self.path_to_node_cache[path] except: pass @@ -60,7 +60,7 @@ def insert_nodes(self, nodes: list, partial:bool=True, flush_cache:bool=True): """Inserts mixed list of files and folders into cache.""" if flush_cache: - self.cache_flush() + self.resolve_cache_flush() files = [] folders = [] From 067a8872cec740896804f965b6e6d1679ca039a0 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 24 Jan 2017 17:48:52 -0500 Subject: [PATCH 34/63] The entirety of resolve caching can be removed if we apply gerph's PR. Thanks gerph! --- acdcli/acd_fuse.py | 28 +++++++++++----------------- acdcli/cache/db.py | 5 ----- acdcli/cache/query.py | 23 ----------------------- acdcli/cache/schema.py | 18 +++++++++++++++--- acdcli/cache/sync.py | 18 +++--------------- docs/contributors.rst | 2 ++ 6 files changed, 31 insertions(+), 63 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 1c4d21b..026a530 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -263,7 +263,7 @@ def _write_and_sync(self, buffer: WriteBuffer, node_id: str): except (RequestError, IOError) as e: logger.error('Error writing node "%s". 
%s' % (node_id, str(e))) else: - self.cache.insert_node(r, flush_cache=False) + self.cache.insert_node(r) def read(self, node_id, fh, offset, length: int): b = self.buffers.get(node_id) @@ -595,9 +595,8 @@ def mkdir(self, path, mode): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r, flush_cache=False) + self.cache.insert_node(r) node = self.cache.get_node(r['id']) - self.cache.resolve_cache_add(path, node) self._chmod(node, mode) def _trash(self, path): @@ -615,8 +614,7 @@ def _trash(self, path): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r, not node.is_file) - self.cache.resolve_cache_del(path) + self.cache.insert_node(r) def rmdir(self, path): """Moves a directory into ACD trash.""" @@ -639,9 +637,8 @@ def create(self, path, mode) -> int: try: r = self.acd_client.create_file(name, p.id) - self.cache.insert_node(r, flush_cache=False) + self.cache.insert_node(r) node = self.cache.get_node(r['id']) - self.cache.resolve_cache_add(path, node) except RequestError as e: # file all ready exists, see what we know about it since the # cache may be out of sync or amazon missed a rename @@ -691,33 +688,30 @@ def rename(self, old, new): raise FuseOSError(errno.EEXIST) if new_bn != old_bn: - self._rename(node.id, new_bn, not node.is_file) + self._rename(node.id, new_bn) if new_dn != old_dn: # odir_id = self.cache.resolve_path(old_dn, False) ndir = self.cache.resolve(new_dn, False) if not ndir: raise FuseOSError(errno.ENOTDIR) - self._move(node.id, ndir.id, not node.is_file) + self._move(node.id, ndir.id) - if node.is_file: - self.cache.resolve_cache_del(old) - - def _rename(self, id, name, flush_cache:bool=True): + def _rename(self, id, name): try: r = self.acd_client.rename_node(id, name) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r, flush_cache=flush_cache) + self.cache.insert_node(r) - def _move(self, id, new_folder, flush_cache:bool=True): + def _move(self, id, new_folder): try: r = self.acd_client.move_node(id, new_folder) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r, flush_cache=flush_cache) + self.cache.insert_node(r) def open(self, path, flags) -> int: """Opens a file. 
@@ -774,7 +768,7 @@ def truncate(self, path, length, fh=None): except RequestError as e: raise FuseOSError.convert(e) else: - self.cache.insert_node(r, flush_cache=False) + self.cache.insert_node(r) """No good way to deal with positive lengths at the moment; since we can only do something about it in the middle of writing, this means the only use case we can diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index b7af974..2f78ddb 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ -69,11 +69,6 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) - self.path_to_node_cache = {} - self.path_to_node_cache_lock = Lock() - """There are a huge number of repeated path lookups, - so cache results and invalidate on new nodes.""" - with cursor(self._conn) as c: c.execute(_ROOT_ID_SQL) row = c.fetchone() diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 487982c..e79c51b 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -142,10 +142,6 @@ def simple_name(self): class QueryMixin(object): - def resolve_cache_add(self, path:str, node:Node): - with self.path_to_node_cache_lock: - self.path_to_node_cache[path] = node - def get_node(self, id) -> 'Union[Node|None]': with cursor(self._conn) as c: c.execute(NODE_BY_ID_SQL, [id]) @@ -165,17 +161,6 @@ def get_conflicting_node(self, name: str, parent_id: str): return Node(r) def resolve(self, path: str, trash=False) -> 'Union[Node|None]': - """Gets a node from a path""" - with self.path_to_node_cache_lock: - try: return self.path_to_node_cache[path] - except: pass - n = self._resolve(path,trash) - if n: - self.path_to_node_cache[path] = n - return n - return None - - def _resolve(self, path: str, trash=False) -> 'Union[Node|None]': segments = list(filter(bool, path.split('/'))) if not segments: if not self.root_id: @@ -281,14 +266,6 @@ def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List folders.append(node) node = c.fetchone() - """If the caller provides the folder_path, we can add all the children to the - path->node_id cache for faster lookup after a directory listing""" - if folder_path: - children = folders + files - with self.path_to_node_cache_lock: - for c in children: - self.path_to_node_cache[folder_path + '/' + c.name] = c - return folders, files def list_trashed_children(self, folder_id) -> 'Tuple[List[Node], List[Node]]': diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index 9939af1..71b5c5e 100644 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -61,8 +61,9 @@ FOREIGN KEY(child) REFERENCES nodes (id) ); + CREATE INDEX ix_parentage_child ON parentage(child); CREATE INDEX ix_nodes_names ON nodes(name); - PRAGMA user_version = 2; + PRAGMA user_version = 3; """ _GEN_DROP_TABLES_SQL = \ @@ -88,12 +89,23 @@ def _1_to_2(conn): conn.commit() -_migrations = [_0_to_1, _1_to_2] +def _2_to_3(conn): + conn.executescript( + 'CREATE INDEX IF NOT EXISTS ix_parentage_child ON parentage(child);' + # Having changed the schema, the queries can be optimised differently. + # In order to be aware of that, re-analyze the type of data and indexes, + # allowing SQLite3 to make better decisions. 
+ 'ANALYZE;' + 'PRAGMA user_version = 3;' + ) + conn.commit() + +_migrations = [_0_to_1, _1_to_2, _2_to_3] """list of all migrations from index -> index+1""" class SchemaMixin(object): - _DB_SCHEMA_VER = 2 + _DB_SCHEMA_VER = 3 def init(self): try: diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index c2a3f6d..393c551 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -47,21 +47,9 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' % len(purged)) - def resolve_cache_flush(self): - with self.path_to_node_cache_lock: - self.path_to_node_cache.clear() - - def resolve_cache_del(self, path:str): - with self.path_to_node_cache_lock: - try: del self.path_to_node_cache[path] - except: pass - - def insert_nodes(self, nodes: list, partial:bool=True, flush_cache:bool=True): + def insert_nodes(self, nodes: list, partial:bool=True): """Inserts mixed list of files and folders into cache.""" - if flush_cache: - self.resolve_cache_flush() - files = [] folders = [] for node in nodes: @@ -88,11 +76,11 @@ def insert_nodes(self, nodes: list, partial:bool=True, flush_cache:bool=True): self.insert_parentage(files + folders, partial) self.insert_properties(files + folders) - def insert_node(self, node:dict, flush_cache:bool=True): + def insert_node(self, node:dict): """Inserts single file or folder into cache.""" if not node: return - self.insert_nodes([node], flush_cache=flush_cache) + self.insert_nodes([node]) def insert_folders(self, folders: list): """ Inserts list of folders into cache. Sets 'update' column to current date. diff --git a/docs/contributors.rst b/docs/contributors.rst index ce1944a..89cd829 100644 --- a/docs/contributors.rst +++ b/docs/contributors.rst @@ -23,6 +23,8 @@ Thanks to - `memoz `_ for amending proxy documentation +- `gerph `_ for making file searches faster, particularly on large repositories + Also thanks to - `fibersnet `_ for pointing out a possible deadlock in ACDFuse. From cc478117b5bd516974eebf264ed568745a211672 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 24 Jan 2017 22:32:13 -0500 Subject: [PATCH 35/63] turns out it's faster with both --- acdcli/acd_fuse.py | 29 ++++++++++++++++------------- acdcli/cache/db.py | 5 +++++ acdcli/cache/query.py | 20 ++++++++++++++++++++ acdcli/cache/sync.py | 15 ++++++++++++--- 4 files changed, 53 insertions(+), 16 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 026a530..738b64f 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -263,7 +263,7 @@ def _write_and_sync(self, buffer: WriteBuffer, node_id: str): except (RequestError, IOError) as e: logger.error('Error writing node "%s". 
%s' % (node_id, str(e))) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=False) def read(self, node_id, fh, offset, length: int): b = self.buffers.get(node_id) @@ -595,7 +595,7 @@ def mkdir(self, path, mode): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=False) node = self.cache.get_node(r['id']) self._chmod(node, mode) @@ -614,7 +614,8 @@ def _trash(self, path): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=node.is_folder) + self.cache.resolve_cache_del(path) def rmdir(self, path): """Moves a directory into ACD trash.""" @@ -637,7 +638,7 @@ def create(self, path, mode) -> int: try: r = self.acd_client.create_file(name, p.id) - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=False) node = self.cache.get_node(r['id']) except RequestError as e: # file all ready exists, see what we know about it since the @@ -687,31 +688,33 @@ def rename(self, old, new): else: raise FuseOSError(errno.EEXIST) + self.cache.resolve_cache_del(old) + if new_bn != old_bn: - self._rename(node.id, new_bn) + self._rename(node, new_bn) if new_dn != old_dn: # odir_id = self.cache.resolve_path(old_dn, False) ndir = self.cache.resolve(new_dn, False) if not ndir: raise FuseOSError(errno.ENOTDIR) - self._move(node.id, ndir.id) + self._move(node, ndir.id) - def _rename(self, id, name): + def _rename(self, node, name): try: - r = self.acd_client.rename_node(id, name) + r = self.acd_client.rename_node(node.id, name) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=node.is_folder) - def _move(self, id, new_folder): + def _move(self, node, new_folder): try: - r = self.acd_client.move_node(id, new_folder) + r = self.acd_client.move_node(node.id, new_folder) except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=node.is_folder) def open(self, path, flags) -> int: """Opens a file. 
@@ -768,7 +771,7 @@ def truncate(self, path, length, fh=None): except RequestError as e: raise FuseOSError.convert(e) else: - self.cache.insert_node(r) + self.cache.insert_node(r, flush_resolve_cache=False) """No good way to deal with positive lengths at the moment; since we can only do something about it in the middle of writing, this means the only use case we can diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index 2f78ddb..b7af974 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ -69,6 +69,11 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) + self.path_to_node_cache = {} + self.path_to_node_cache_lock = Lock() + """There are a huge number of repeated path lookups, + so cache results and invalidate on new nodes.""" + with cursor(self._conn) as c: c.execute(_ROOT_ID_SQL) row = c.fetchone() diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index e79c51b..1955dd6 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -161,6 +161,19 @@ def get_conflicting_node(self, name: str, parent_id: str): return Node(r) def resolve(self, path: str, trash=False) -> 'Union[Node|None]': + """Gets a node from a path""" + with self.path_to_node_cache_lock: + try: + return self.path_to_node_cache[path] + except: + pass + n = self._resolve(path, trash) + if n: + self.path_to_node_cache[path] = n + return n + return None + + def _resolve(self, path: str, trash=False) -> 'Union[Node|None]': segments = list(filter(bool, path.split('/'))) if not segments: if not self.root_id: @@ -266,6 +279,13 @@ def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List folders.append(node) node = c.fetchone() + """If the caller provides the folder_path, we can add all the children to the + path->node_id cache for faster lookup after a directory listing""" + if folder_path: + with self.path_to_node_cache_lock: + for c in folders + files: + self.path_to_node_cache[folder_path + '/' + c.name] = c + return folders, files def list_trashed_children(self, folder_id) -> 'Tuple[List[Node], List[Node]]': diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index 393c551..7de6b2d 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -47,9 +47,18 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' % len(purged)) - def insert_nodes(self, nodes: list, partial:bool=True): + def resolve_cache_del(self, path:str): + with self.path_to_node_cache_lock: + try: del self.path_to_node_cache[path] + except:pass + + def insert_nodes(self, nodes: list, partial:bool=True, flush_resolve_cache:bool=False): """Inserts mixed list of files and folders into cache.""" + if flush_resolve_cache: + with self.path_to_node_cache_lock: + self.path_to_node_cache.clear() + files = [] folders = [] for node in nodes: @@ -76,11 +85,11 @@ def insert_nodes(self, nodes: list, partial:bool=True): self.insert_parentage(files + folders, partial) self.insert_properties(files + folders) - def insert_node(self, node:dict): + def insert_node(self, node:dict, flush_resolve_cache:bool=False): """Inserts single file or folder into cache.""" if not node: return - self.insert_nodes([node]) + self.insert_nodes([node], flush_resolve_cache=flush_resolve_cache) def insert_folders(self, folders: list): """ Inserts list of folders into cache. Sets 'update' column to current date. 
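The path resolution cache reinstated in this patch is, in essence, a lock-guarded memo table sitting in front of the SQL lookup: hits are served from the dict, misses fall through to _resolve() and are recorded, and entries are dropped selectively (resolve_cache_del) or wholesale (the flush_resolve_cache flag) when nodes change. A minimal standalone sketch of that pattern follows; the class and parameter names (ResolveCache, backing_resolve) are illustrative only and not part of the codebase:

    from threading import Lock

    class ResolveCache:
        """Memoizes path -> node lookups in front of a slower backing store."""

        def __init__(self, backing_resolve):
            self._backing_resolve = backing_resolve   # e.g. the SQL-based lookup
            self._cache = {}
            self._lock = Lock()

        def resolve(self, path):
            with self._lock:
                if path in self._cache:
                    return self._cache[path]
            node = self._backing_resolve(path)        # slow path, outside the lock
            if node is not None:
                with self._lock:
                    self._cache[path] = node
            return node

        def invalidate(self, path):
            """Drop a single entry, e.g. after a rename or trash."""
            with self._lock:
                self._cache.pop(path, None)

        def flush(self):
            """Drop everything, e.g. after a bulk sync inserts nodes with unknown paths."""
            with self._lock:
                self._cache.clear()

Keeping the slow lookup outside the lock means two threads can race to fill the same entry; that is harmless here because both would cache the same result.
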
From 2b75256756bfb2c5d33efbe709ce42ffd2d02af3 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 28 Jan 2017 21:48:31 -0500 Subject: [PATCH 36/63] cache node ids instead of nodes to fix 0 file size issues --- acdcli/acd_fuse.py | 141 ++++++++++++++++++++++-------------------- acdcli/cache/db.py | 6 +- acdcli/cache/query.py | 23 +++++-- acdcli/cache/sync.py | 8 +-- 4 files changed, 98 insertions(+), 80 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 738b64f..d9eaef8 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -308,7 +308,7 @@ def __call__(self, op, path, *args): elif op == 'chmod': targs = (oct(args[0]),) + args[1:] elif op == 'setxattr': - targs = (len(args[0]),) + args[1:] + targs = (args[0],) + (len(args[1]),) logger.debug('-> %s %s %s', op, path, repr(args if not targs else targs)) @@ -365,8 +365,8 @@ def __init__(self, **kwargs): """manually calculated available disk space""" self.fh = 1 """file handle counter\n\n :type: int""" - self.handles = {} - """map fh->node\n\n :type: dict""" + self.fh_to_node = {} + """map fh->node_id\n\n :type: dict""" self.node_to_fh = defaultdict(lambda: set()) """map node_id to list of interested file handles""" self.fh_lock = Lock() @@ -411,7 +411,8 @@ def getattr(self, path, fh=None) -> dict: Calculates correct number of links for folders if :attr:`nlinks` is set.""" if fh: - node = self.handles[fh] + node_id = self.fh_to_node[fh] + node = self.cache.get_node(node_id) else: node = self.cache.resolve(path) if not node: @@ -457,14 +458,14 @@ def _getattr(self, node, fh=None) -> dict: st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, st_size=size, st_blksize=self.blksize, - st_blocks=(node.size + 511) // 512, + st_blocks=(size + 511) // 512, **attrs) def listxattr(self, path): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - return self._listxattr(node.id) + return self._listxattr(node_id) def _listxattr(self, node_id): self._xattr_load(node_id) @@ -475,10 +476,10 @@ def _listxattr(self, node_id): return [] def getxattr(self, path, name, position=0): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - return self._getxattr_bytes(node.id, name) + return self._getxattr_bytes(node_id, name) def _getxattr(self, node_id, name): self._xattr_load(node_id) @@ -496,10 +497,10 @@ def _getxattr_bytes(self, node_id, name): return binascii.a2b_base64(self._getxattr(node_id, name)) def removexattr(self, path, name): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - self._removexattr(node.id, name) + self._removexattr(node_id, name) def _removexattr(self, node_id, name): self._xattr_load(node_id) @@ -509,10 +510,10 @@ def _removexattr(self, node_id, name): self.properties_dirty.add(node_id) def setxattr(self, path, name, value, options, position=0): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - self._setxattr_bytes(node.id, name, value) + self._setxattr_bytes(node_id, name, value) def _setxattr(self, node_id, name, value): self._xattr_load(node_id) @@ -551,21 +552,25 @@ def read(self, path, length, offset, fh=None) -> bytes: """Read ```length`` bytes from ``path`` at ``offset``.""" if fh: - node = self.handles[fh] + node_id = self.fh_to_node[fh] + node = 
self.cache.get_node(node_id) else: node = self.cache.resolve(path, trash=False) if not node: raise FuseOSError(errno.ENOENT) - if node.size <= offset: + size = self.wp.length(node.id, fh) + if size is None: size = node.size + + if size <= offset: return b'' - if node.size < offset + length: - length = node.size - offset + if size < offset + length: + length = size - offset """If we attempt to read something we just wrote, give it back""" ret = self.wp.read(node.id, fh, offset, length) - if ret and len(ret) == length: + if ret is not None: return ret return self.rp.get(node.id, offset, length, node.size) @@ -586,12 +591,12 @@ def mkdir(self, path, mode): name = os.path.basename(path) ppath = os.path.dirname(path) - p = self.cache.resolve(ppath) - if not p: + p_id = self.cache.resolve_id(ppath) + if not p_id: raise FuseOSError(errno.ENOTDIR) try: - r = self.acd_client.create_folder(name, p.id) + r = self.acd_client.create_folder(name, p_id) except RequestError as e: FuseOSError.convert(e) else: @@ -632,14 +637,14 @@ def create(self, path, mode) -> int: name = os.path.basename(path) ppath = os.path.dirname(path) - p = self.cache.resolve(ppath, False) - if not p: + p_id = self.cache.resolve_id(ppath, False) + if not p_id: raise FuseOSError(errno.ENOTDIR) try: - r = self.acd_client.create_file(name, p.id) + r = self.acd_client.create_file(name, p_id) self.cache.insert_node(r, flush_resolve_cache=False) - node = self.cache.get_node(r['id']) + node_id = r['id'] except RequestError as e: # file all ready exists, see what we know about it since the # cache may be out of sync or amazon missed a rename @@ -655,12 +660,12 @@ def create(self, path, mode) -> int: FuseOSError.convert(e) if mode is not None: - self._chmod(node, mode) + self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFREG | (stat.S_IMODE(mode))) with self.fh_lock: self.fh += 1 - self.handles[self.fh] = node - self.node_to_fh[node.id].add(self.fh) + self.fh_to_node[self.fh] = node_id + self.node_to_fh[node_id].add(self.fh) return self.fh def rename(self, old, new): @@ -725,13 +730,13 @@ def open(self, path, flags) -> int: if (flags & os.O_APPEND) == os.O_APPEND: raise FuseOSError(errno.EFAULT) - node = self.cache.resolve(path, False) - if not node: + node_id = self.cache.resolve_id(path, False) + if not node_id: raise FuseOSError(errno.ENOENT) with self.fh_lock: self.fh += 1 - self.handles[self.fh] = node - self.node_to_fh[node.id].add(self.fh) + self.fh_to_node[self.fh] = node_id + self.node_to_fh[node_id].add(self.fh) return self.fh def write(self, path, data, offset, fh) -> int: @@ -739,18 +744,18 @@ def write(self, path, data, offset, fh) -> int: :returns: number of bytes written""" - node_id = self.handles[fh].id + node_id = self.fh_to_node[fh] self.wp.write(node_id, fh, offset, data) return len(data) def flush(self, path, fh): if fh: - node = self.handles[fh] + node_id = self.fh_to_node[fh] else: - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - self.wp.flush(node.id, fh) + self.wp.flush(node_id, fh) def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. 
clears content if ``length``==0 or does nothing @@ -759,15 +764,15 @@ def truncate(self, path, length, fh=None): :raises FuseOSError: if pseudo-truncation to length is not supported""" if fh: - node = self.handles[fh] + node_id = self.fh_to_node[fh] else: - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) if length == 0: try: - r = self.acd_client.clear_file(node.id) + r = self.acd_client.clear_file(node_id) except RequestError as e: raise FuseOSError.convert(e) else: @@ -782,24 +787,24 @@ def release(self, path, fh): """Releases an open ``path``.""" if fh: - node = self.handles[fh] + node_id = self.fh_to_node[fh] else: - node = self.cache.resolve(path, trash=False) - if node: - self.rp.release(node.id) + node_id = self.cache.resolve_id(path) + if node_id: + self.rp.release(node_id) with self.fh_lock: """release the writer if there's no more interest. This allows many file handles to write to a single node provided they do it in order, enabling sequential writes using mmap. """ - interest = self.node_to_fh.get(node.id) + interest = self.node_to_fh.get(node_id) if interest: interest.discard(fh) if not interest: - self.wp.release(node.id, fh) + self.wp.release(node_id, fh) self._xattr_write_and_sync() - del self.node_to_fh[node.id] - del self.handles[fh] + del self.node_to_fh[node_id] + del self.fh_to_node[fh] else: raise FuseOSError(errno.ENOENT) @@ -810,8 +815,8 @@ def utimens(self, path, times=None): :param times: [atime, mtime]""" - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) if times: @@ -822,7 +827,7 @@ def utimens(self, path, times=None): mtime = time() try: - self._setxattr(node.id, _XATTR_MTIME_OVERRIDE_NAME, mtime) + self._setxattr(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) self._xattr_write_and_sync() except: raise FuseOSError(errno.ENOTSUP) @@ -843,30 +848,30 @@ def _chmod(self, node, mode): return 0 def chown(self, path, uid, gid): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - return self._chown(node, uid, gid) + return self._chown(node_id, uid, gid) - def _chown(self, node, uid, gid): - if uid != -1: self._setxattr(node.id, _XATTR_UID_OVERRIDE_NAME, uid) - if gid != -1: self._setxattr(node.id, _XATTR_GID_OVERRIDE_NAME, gid) + def _chown(self, node_id, uid, gid): + if uid != -1: self._setxattr(node_id, _XATTR_UID_OVERRIDE_NAME, uid) + if gid != -1: self._setxattr(node_id, _XATTR_GID_OVERRIDE_NAME, gid) self._xattr_write_and_sync() return 0 def symlink(self, target, source): fh = self.create(target, None) - node = self.handles[fh] - self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) - self._setxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME, source) + node_id = self.fh_to_node[fh] + self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) + self._setxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME, source) self.release(target, fh) return 0 def readlink(self, path): - node = self.cache.resolve(path) - if not node: + node_id = self.cache.resolve_id(path) + if not node_id: raise FuseOSError(errno.ENOENT) - source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) + source = self._getxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME) return source diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index b7af974..304d60b 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ 
-69,10 +69,10 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) - self.path_to_node_cache = {} - self.path_to_node_cache_lock = Lock() + self.path_to_node_id_cache = {} + self.path_to_node_id_cache_lock = Lock() """There are a huge number of repeated path lookups, - so cache results and invalidate on new nodes.""" + so cache results and selectively invalidate.""" with cursor(self._conn) as c: c.execute(_ROOT_ID_SQL) diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 1955dd6..6a9eb7d 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -160,16 +160,29 @@ def get_conflicting_node(self, name: str, parent_id: str): if r: return Node(r) + def resolve_id(self, path: str, trash=False) -> str: + with self.path_to_node_id_cache_lock: + try: + return self.path_to_node_id_cache[path] + except: + pass + n = self._resolve(path, trash) + if n: + self.path_to_node_id_cache[path] = n.id + return n.id + return None + def resolve(self, path: str, trash=False) -> 'Union[Node|None]': """Gets a node from a path""" - with self.path_to_node_cache_lock: + with self.path_to_node_id_cache_lock: try: - return self.path_to_node_cache[path] + node_id = self.path_to_node_id_cache[path] + return self.get_node(node_id) except: pass n = self._resolve(path, trash) if n: - self.path_to_node_cache[path] = n + self.path_to_node_id_cache[path] = n.id return n return None @@ -282,9 +295,9 @@ def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List """If the caller provides the folder_path, we can add all the children to the path->node_id cache for faster lookup after a directory listing""" if folder_path: - with self.path_to_node_cache_lock: + with self.path_to_node_id_cache_lock: for c in folders + files: - self.path_to_node_cache[folder_path + '/' + c.name] = c + self.path_to_node_id_cache[folder_path + '/' + c.name] = c.id return folders, files diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index 7de6b2d..12a9e9c 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -48,16 +48,16 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' 
% len(purged)) def resolve_cache_del(self, path:str): - with self.path_to_node_cache_lock: - try: del self.path_to_node_cache[path] + with self.path_to_node_id_cache_lock: + try: del self.path_to_node_id_cache[path] except:pass def insert_nodes(self, nodes: list, partial:bool=True, flush_resolve_cache:bool=False): """Inserts mixed list of files and folders into cache.""" if flush_resolve_cache: - with self.path_to_node_cache_lock: - self.path_to_node_cache.clear() + with self.path_to_node_id_cache_lock: + self.path_to_node_id_cache.clear() files = [] folders = [] From 4c191846ab8ec8d21e1fa655aa3c002bf876d76d Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 3 Feb 2017 20:14:46 -0500 Subject: [PATCH 37/63] cache write buffer length so long uploads won't hold up getattr calls --- acdcli/acd_fuse.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index d9eaef8..2b389a7 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -225,6 +225,7 @@ def __init__(self, buffer_size): self.f = tempfile.SpooledTemporaryFile(max_size=buffer_size) self.lock = Lock() self.dirty = True + self.len = 0 def read(self, offset, length: int): with self.lock: @@ -234,19 +235,17 @@ def read(self, offset, length: int): def write(self, offset, bytes_: bytes): with self.lock: self.dirty = True - self.f.seek(0, os.SEEK_END) - old_len = self.f.tell() - if offset > old_len: + if offset > self.len: logger.error('Wrong offset for writing to buffer; writing gap detected') raise FuseOSError(errno.ESPIPE) self.f.seek(offset) - self.f.write(bytes_) - return old_len + ret = self.f.write(bytes_) + self.f.seek(0, os.SEEK_END) + self.len = self.f.tell() + return ret def length(self): - with self.lock: - self.f.seek(0, os.SEEK_END) - return self.f.tell() + return self.len def get_file(self): """Return the file for direct access. Be sure to lock from the outside when doing so""" @@ -424,7 +423,7 @@ def _getattr(self, node, fh=None) -> dict: except: mtime = node.modified.timestamp() size = self.wp.length(node.id, fh) - if not size: size = node.size + if size is None: size = node.size try: uid = self._getxattr(node.id, _XATTR_UID_OVERRIDE_NAME) except: uid = self.uid From 3b5044bfcd7e38d09513aba40b77ebc9d8025818 Mon Sep 17 00:00:00 2001 From: Ben Date: Mon, 6 Feb 2017 00:10:13 -0500 Subject: [PATCH 38/63] hit sqlite less --- acdcli/acd_fuse.py | 11 ++++++++--- acdcli/api/common.py | 1 + acdcli/cache/sync.py | 4 ++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 2b389a7..6c81506 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -544,7 +544,6 @@ def _xattr_write_and_sync(self): logger.error('Error writing node xattrs "%s". 
%s' % (node_id, str(e))) else: self.cache.insert_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) - logger.debug('_xattr_write_and_sync: node: %s xattrs: %s: ' % (node_id, xattrs_str)) self.xattr_dirty.clear() def read(self, path, length, offset, fh=None) -> bytes: @@ -600,8 +599,11 @@ def mkdir(self, path, mode): FuseOSError.convert(e) else: self.cache.insert_node(r, flush_resolve_cache=False) - node = self.cache.get_node(r['id']) - self._chmod(node, mode) + node_id = r['id'] + self.cache.resolve_cache_add(path, node_id) + if mode is not None: + self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFDIR | (stat.S_IMODE(mode))) + self._xattr_write_and_sync() def _trash(self, path): logger.debug('trash %s' % path) @@ -644,6 +646,7 @@ def create(self, path, mode) -> int: r = self.acd_client.create_file(name, p_id) self.cache.insert_node(r, flush_resolve_cache=False) node_id = r['id'] + self.cache.resolve_cache_add(path, node_id) except RequestError as e: # file all ready exists, see what we know about it since the # cache may be out of sync or amazon missed a rename @@ -704,6 +707,8 @@ def rename(self, old, new): raise FuseOSError(errno.ENOTDIR) self._move(node, ndir.id) + self.cache.resolve_cache_add(new, node.id) + def _rename(self, node, name): try: r = self.acd_client.rename_node(node.id, name) diff --git a/acdcli/api/common.py b/acdcli/api/common.py index b0ddd53..d6e2246 100644 --- a/acdcli/api/common.py +++ b/acdcli/api/common.py @@ -16,6 +16,7 @@ class ReadTimeoutError(Exception): OK_CODES = [requests.codes.OK] RETRY_CODES = [requests.codes.server_error, requests.codes.gateway_timeout, + requests.codes.request_timeout, requests.codes.bad_request, requests.codes.service_unavailable] diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index 12a9e9c..f1fdcf4 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -47,6 +47,10 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' % len(purged)) + def resolve_cache_add(self, path:str, node_id:str): + with self.path_to_node_id_cache_lock: + self.path_to_node_id_cache[path] = node_id + def resolve_cache_del(self, path:str): with self.path_to_node_id_cache_lock: try: del self.path_to_node_id_cache[path] From 8c0e506a9dcc7133c831fae6876c5a5c53b24a73 Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 10 Feb 2017 12:02:39 -0500 Subject: [PATCH 39/63] tidy and make flush a noop again --- acdcli/acd_fuse.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 6c81506..b4ef701 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -488,9 +488,8 @@ def _getxattr(self, node_id, name): if ret is not None: return ret except: - raise FuseOSError(errno.ENODATA) # should be ENOATTR - else: - raise FuseOSError(errno.ENODATA) # should be ENOATTR + pass + raise FuseOSError(errno.ENODATA) # should be ENOATTR def _getxattr_bytes(self, node_id, name): return binascii.a2b_base64(self._getxattr(node_id, name)) @@ -748,18 +747,15 @@ def write(self, path, data, offset, fh) -> int: :returns: number of bytes written""" - node_id = self.fh_to_node[fh] - self.wp.write(node_id, fh, offset, data) - return len(data) - - def flush(self, path, fh): if fh: node_id = self.fh_to_node[fh] - else: - node_id = self.cache.resolve_id(path) + # This is not resolving by path on purpose, since flushing to + # amazon is done on closing all interested file handles. 
if not node_id: raise FuseOSError(errno.ENOENT) - self.wp.flush(node_id, fh) + + self.wp.write(node_id, fh, offset, data) + return len(data) def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing @@ -798,8 +794,7 @@ def release(self, path, fh): self.rp.release(node_id) with self.fh_lock: """release the writer if there's no more interest. This allows many file - handles to write to a single node provided they do it in order, enabling - sequential writes using mmap. + handles to write to a single node provided they do it in order. """ interest = self.node_to_fh.get(node_id) if interest: From c5401ac76384e5cb46dd378a4e5f8368e41fdd85 Mon Sep 17 00:00:00 2001 From: Ben Date: Fri, 10 Feb 2017 12:02:48 -0500 Subject: [PATCH 40/63] credit --- docs/contributors.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/contributors.rst b/docs/contributors.rst index 89cd829..b922a66 100644 --- a/docs/contributors.rst +++ b/docs/contributors.rst @@ -25,6 +25,8 @@ Thanks to - `gerph `_ for making file searches faster, particularly on large repositories +- `bgemmill `_ for fuse write-back caching, xattrs, symlinks, and rsync support + Also thanks to - `fibersnet `_ for pointing out a possible deadlock in ACDFuse. From a167351c292dda74c0b4045a8dc94b04cc36b131 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 14 Feb 2017 12:51:17 -0500 Subject: [PATCH 41/63] cache nodes by id for faster getattr/getxattr calls in large directories --- acdcli/cache/db.py | 3 +- acdcli/cache/query.py | 50 +++++++++++++----------- acdcli/cache/sync.py | 88 +++++++++++++++++++++++++++++++------------ 3 files changed, 94 insertions(+), 47 deletions(-) diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index 304d60b..d03f92e 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ -69,8 +69,9 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self._conn.create_function('REGEXP', _regex_match.__code__.co_argcount, _regex_match) + self.node_id_to_node_cache = {} self.path_to_node_id_cache = {} - self.path_to_node_id_cache_lock = Lock() + self.node_cache_lock = Lock() """There are a huge number of repeated path lookups, so cache results and selectively invalidate.""" diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index 6a9eb7d..d3ea90e 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -128,11 +128,15 @@ def is_trashed(self): @property def created(self): - return datetime_from_string(self.cre) + if isinstance(self.cre, str): + self.cre = datetime_from_string(self.cre) + return self.cre @property def modified(self): - return datetime_from_string(self.mod) + if isinstance(self.mod, str): + self.mod = datetime_from_string(self.mod) + return self.mod @property def simple_name(self): @@ -143,11 +147,19 @@ def simple_name(self): class QueryMixin(object): def get_node(self, id) -> 'Union[Node|None]': - with cursor(self._conn) as c: - c.execute(NODE_BY_ID_SQL, [id]) - r = c.fetchone() - if r: - return Node(r) + with self.node_cache_lock: + try: + return self.node_id_to_node_cache[id] + except: + pass + with cursor(self._conn) as c: + c.execute(NODE_BY_ID_SQL, [id]) + r = c.fetchone() + if r: + n = Node(r) + if n.is_available: + self.node_id_to_node_cache[n.id] = n + return n def get_root_node(self): return self.get_node(self.root_id) @@ -161,30 +173,22 @@ def get_conflicting_node(self, name: str, parent_id: str): return Node(r) def resolve_id(self, path: str, trash=False) -> str: - with 
self.path_to_node_id_cache_lock: + with self.node_cache_lock: try: return self.path_to_node_id_cache[path] except: pass n = self._resolve(path, trash) if n: + self.node_id_to_node_cache[n.id] = n self.path_to_node_id_cache[path] = n.id return n.id return None def resolve(self, path: str, trash=False) -> 'Union[Node|None]': """Gets a node from a path""" - with self.path_to_node_id_cache_lock: - try: - node_id = self.path_to_node_id_cache[path] - return self.get_node(node_id) - except: - pass - n = self._resolve(path, trash) - if n: - self.path_to_node_id_cache[path] = n.id - return n - return None + id = self.resolve_id(path=path, trash=trash) + return self.get_node(id=id) if id else None def _resolve(self, path: str, trash=False) -> 'Union[Node|None]': segments = list(filter(bool, path.split('/'))) @@ -294,9 +298,11 @@ def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List """If the caller provides the folder_path, we can add all the children to the path->node_id cache for faster lookup after a directory listing""" - if folder_path: - with self.path_to_node_id_cache_lock: - for c in folders + files: + with self.node_cache_lock: + for c in folders + files: + if c.is_available: + self.node_id_to_node_cache[c.id] = c + if folder_path: self.path_to_node_id_cache[folder_path + '/' + c.name] = c.id return folders, files diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index f1fdcf4..a1756dc 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -5,6 +5,8 @@ import logging from datetime import datetime from itertools import islice + +from acdcli.cache.query import Node from .cursors import mod_cursor import dateutil.parser as iso_date @@ -48,11 +50,11 @@ def remove_purged(self, purged: list): logger.info('Purged %i node(s).' 
% len(purged)) def resolve_cache_add(self, path:str, node_id:str): - with self.path_to_node_id_cache_lock: + with self.node_cache_lock: self.path_to_node_id_cache[path] = node_id def resolve_cache_del(self, path:str): - with self.path_to_node_id_cache_lock: + with self.node_cache_lock: try: del self.path_to_node_id_cache[path] except:pass @@ -60,7 +62,7 @@ def insert_nodes(self, nodes: list, partial:bool=True, flush_resolve_cache:bool= """Inserts mixed list of files and folders into cache.""" if flush_resolve_cache: - with self.path_to_node_id_cache_lock: + with self.node_cache_lock: self.path_to_node_id_cache.clear() files = [] @@ -105,14 +107,32 @@ def insert_folders(self, folders: list): with mod_cursor(self._conn) as c: for f in folders: + n = Node(dict(id=f['id'], + type="folder", + name=f.get('name'), + description=f.get('description'), + created=iso_date.parse(f['createdDate']), + modified=iso_date.parse(f['modifiedDate']), + updated=datetime.utcnow(), + status=f['status'], + md5=None, + size=0, + )) + + with self.node_cache_lock: + if n.is_available: + self.node_id_to_node_cache[n.id] = n + else: + self.node_id_to_node_cache.clear() + c.execute( 'INSERT OR REPLACE INTO nodes ' '(id, type, name, description, created, modified, updated, status) ' - 'VALUES (?, "folder", ?, ?, ?, ?, ?, ?)', - [f['id'], f.get('name'), f.get('description'), - iso_date.parse(f['createdDate']), iso_date.parse(f['modifiedDate']), - datetime.utcnow(), - f['status'] + 'VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + [n.id, n.type, n.name, n.description, + n.created, n.modified, + n.updated, + n.status ] ) @@ -124,22 +144,42 @@ def insert_files(self, files: list): with mod_cursor(self._conn) as c: for f in files: - c.execute('INSERT OR REPLACE INTO nodes ' - '(id, type, name, description, created, modified, updated, status)' - 'VALUES (?, "file", ?, ?, ?, ?, ?, ?)', - [f['id'], f.get('name'), f.get('description'), - iso_date.parse(f['createdDate']), iso_date.parse(f['modifiedDate']), - datetime.utcnow(), - f['status'] - ] - ) - c.execute('INSERT OR REPLACE INTO files (id, md5, size) VALUES (?, ?, ?)', - [f['id'], - f.get('contentProperties', {}).get('md5', - 'd41d8cd98f00b204e9800998ecf8427e'), - f.get('contentProperties', {}).get('size', 0) - ] - ) + n = Node(dict(id=f['id'], + type="file", + name=f.get('name'), + description=f.get('description'), + created=iso_date.parse(f['createdDate']), + modified=iso_date.parse(f['modifiedDate']), + updated=datetime.utcnow(), + status=f['status'], + md5=f.get('contentProperties', {}).get('md5', 'd41d8cd98f00b204e9800998ecf8427e'), + size=f.get('contentProperties', {}).get('size', 0), + )) + + with self.node_cache_lock: + if n.is_available: + self.node_id_to_node_cache[n.id] = n + else: + try: del self.node_id_to_node_cache[n.id] + except: pass + + c.execute( + 'INSERT OR REPLACE INTO nodes ' + '(id, type, name, description, created, modified, updated, status) ' + 'VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + [n.id, n.type, n.name, n.description, + n.created, n.modified, + n.updated, + n.status + ] + ) + c.execute( + 'INSERT OR REPLACE INTO files (id, md5, size) VALUES (?, ?, ?)', + [n.id, + n.md5, + n.size + ] + ) logger.info('Inserted/updated %d file(s).' 
% len(files)) From 885b4d0b73d2b9a8b0dcbad20b5c29938dc97439 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 15 Feb 2017 15:10:31 -0500 Subject: [PATCH 42/63] store symlinks targets in file contents to ultimately allow for longer targets than amazon's max xattr size --- acdcli/acd_fuse.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index b4ef701..e02493b 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -863,14 +863,19 @@ def symlink(self, target, source): node_id = self.fh_to_node[fh] self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) self._setxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME, source) + self.write(target, source.encode('utf-8'), 0, fh) self.release(target, fh) return 0 def readlink(self, path): - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path) + if not node: raise FuseOSError(errno.ENOENT) - source = self._getxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME) + source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) + if source is None: + size = self.wp.length(node.id, None) + if size is None: size = node.size + source = self.read(path, size, 0).decode('utf-8') return source From dc47fe2d873692ddd29bdef08b42d85fa0d436ba Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 18 Feb 2017 21:09:57 -0500 Subject: [PATCH 43/63] cache files' content, starting with small files and symlinks --- acdcli/acd_fuse.py | 73 +++++++++++++++++++++++++++++++----------- acdcli/api/content.py | 4 +-- acdcli/cache/query.py | 16 +++++++++ acdcli/cache/schema.py | 45 ++++++++++++++++++++++++-- acdcli/cache/sync.py | 30 ++++++++++++++--- 5 files changed, 140 insertions(+), 28 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index e02493b..bfacbd5 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -56,9 +56,10 @@ def find_library(*args): _XATTR_UID_OVERRIDE_NAME = 'fuse.uid' _XATTR_GID_OVERRIDE_NAME = 'fuse.gid' _XATTR_SYMLINK_OVERRIDE_NAME = 'fuse.symlink' +_FS_BLOCK_SIZE = 4096 # for stat and statfs calls. 
This could be anything as long as it's consistent _def_conf = configparser.ConfigParser() -_def_conf['read'] = dict(open_chunk_limit=10, timeout=5) +_def_conf['read'] = dict(open_chunk_limit=10, timeout=5, cache_small_file_size=1024) _def_conf['write'] = dict(buffer_size=int(1e9), timeout=30) @@ -378,8 +379,8 @@ def __init__(self, **kwargs): """sets the default gid""" self.umask = kwargs['umask'] """sets the default umask""" - self.blksize = self.acd_client._conf.getint('transfer', 'fs_chunk_size') - """size of the filesystem blocks for stat queries""" + self.cache_small_file_size = conf.getint('read', 'cache_small_file_size') + """size of files under which we cache the contents automatically""" self.destroyed = autosync.keywords['stop'] """:type: multiprocessing.Event""" @@ -456,8 +457,8 @@ def _getattr(self, node, fh=None) -> dict: return dict(st_mode=mode, st_nlink=self.cache.num_parents(node.id) if self.nlinks else 1, st_size=size, - st_blksize=self.blksize, - st_blocks=(size + 511) // 512, + st_blksize=_FS_BLOCK_SIZE, + st_blocks=(size + 511) // 512, # this field always expects a 512 block size **attrs) def listxattr(self, path): @@ -505,7 +506,7 @@ def _removexattr(self, node_id, name): with self.xattr_cache_lock: if name in self.xattr_cache[node_id]: del self.xattr_cache[node_id][name] - self.properties_dirty.add(node_id) + self.xattr_dirty.add(node_id) def setxattr(self, path, name, value, options, position=0): node_id = self.cache.resolve_id(path) @@ -570,17 +571,29 @@ def read(self, path, length, offset, fh=None) -> bytes: if ret is not None: return ret + """Next, check our local cache""" + content = self.cache.get_content(node.id, node.version) + if content is not None: + return content[offset:offset+length] + + """For small files, read and cache the whole file""" + if node.size <= self.cache_small_file_size: + content = self.acd_client.download_chunk(node.id, 0, node.size) + self.cache.insert_content(node.id, node.version, content) + return content[offset:offset+length] + + """For all other files, stream from amazon""" return self.rp.get(node.id, offset, length, node.size) def statfs(self, path) -> dict: - """Gets some filesystem statistics as specified in :manpage:`stat(2)`.""" - - return dict(f_bsize=self.blksize, - f_frsize=self.blksize, - f_blocks=self.total // self.blksize, # total no of blocks - f_bfree=self.free // self.blksize, # free blocks - f_bavail=self.free // self.blksize, - f_namemax=256 + """Gets some filesystem statistics as specified in :manpage:`statfs(2)`.""" + + return dict(f_bsize=_FS_BLOCK_SIZE, + f_frsize=_FS_BLOCK_SIZE, + f_blocks=self.total // _FS_BLOCK_SIZE, # total no of blocks + f_bfree=self.free // _FS_BLOCK_SIZE, # free blocks + f_bavail=self.free // _FS_BLOCK_SIZE, + f_namemax=256 # from amazon's spec ) def mkdir(self, path, mode): @@ -619,7 +632,7 @@ def _trash(self, path): except RequestError as e: FuseOSError.convert(e) else: - self.cache.insert_node(r, flush_resolve_cache=node.is_folder) + self.cache.insert_node(r, flush_resolve_cache=False) self.cache.resolve_cache_del(path) def rmdir(self, path): @@ -859,11 +872,12 @@ def _chown(self, node_id, uid, gid): return 0 def symlink(self, target, source): + source_bytes = source.encode('utf-8') fh = self.create(target, None) node_id = self.fh_to_node[fh] self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) - self._setxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME, source) - self.write(target, source.encode('utf-8'), 0, fh) + # self._setxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME, 
source) + self.write(target, source_bytes, 0, fh) self.release(target, fh) return 0 @@ -871,11 +885,32 @@ def readlink(self, path): node = self.cache.resolve(path) if not node: raise FuseOSError(errno.ENOENT) - source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) + + source = None + + # amazon reduced property size (all our xattr space) to 500 characters or less, + # so we're moving symlinks to file bodies. + try: source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) + except: pass + if source is not None: + logger.debug("readlink: upgrading node: %s path: %s" % (node.id, path)) + source_bytes = source.encode('utf-8') + fh = self.open(path, 0) + self.write(path, source_bytes, 0, fh) + self.release(path, fh) + self._removexattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) + + if source is None: + source_bytes = self.cache.get_content(node.id, node.version) + if source_bytes is not None: + source = source_bytes.decode('utf-8') + if source is None: size = self.wp.length(node.id, None) if size is None: size = node.size - source = self.read(path, size, 0).decode('utf-8') + source_bytes = self.read(path, size, 0) + source = source_bytes.decode('utf-8') + self.cache.insert_content(node.id, node.version, source_bytes) return source diff --git a/acdcli/api/content.py b/acdcli/api/content.py index e31c455..710b13a 100644 --- a/acdcli/api/content.py +++ b/acdcli/api/content.py @@ -416,7 +416,7 @@ def response_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> Re raise RequestError(r.status_code, r.text) return r - def download_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> bytearray: + def download_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> bytes: """Load a file chunk into memory. :param length: the length of the download chunk""" @@ -432,7 +432,7 @@ def download_chunk(self, node_id: str, offset: int, length: int, **kwargs) -> by buffer.extend(chunk) finally: r.close() - return buffer + return bytes(buffer) def download_thumbnail(self, node_id: str, file_name: str, max_dim=128): """Download a movie's or picture's thumbnail into a file. diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index d3ea90e..d2a5c39 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -53,6 +53,9 @@ def datetime_from_string(dt: str) -> datetime: PROPERTY_BY_ID_SQL = """SELECT * FROM properties WHERE id=? AND owner=? AND key=?""" +CONTENT_BY_ID_SQL = """SELECT * FROM content WHERE id=? AND version=?""" +CONTENT_ACCESSED_SQL = """UPDATE content SET accessed=? 
WHERE id=?""" + USAGE_SQL = 'SELECT SUM(size) FROM files' FIND_BY_NAME_SQL = """SELECT n.*, f.* FROM nodes n @@ -100,6 +103,10 @@ def __init__(self, row): self.size = row['size'] except IndexError: self.size = 0 + try: + self.version = row['version'] + except IndexError: + self.version = 0 def __lt__(self, other): return self.name < other.name @@ -368,3 +375,12 @@ def get_property(self, node_id, owner_id, key) -> 'Union[str|None]': if r: return r['value'] return None + + def get_content(self, node_id:str, version:int) -> 'Union[bytes|None]': + if version == 0: return None + with cursor(self._conn) as c: + c.execute(CONTENT_ACCESSED_SQL, [datetime.utcnow(), node_id]) + c.execute(CONTENT_BY_ID_SQL, [node_id, version]) + r = c.fetchone() + if r: + return r['value'] diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index 71b5c5e..5bf6a4c 100644 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -48,6 +48,7 @@ id VARCHAR(50) NOT NULL, md5 VARCHAR(32), size BIGINT, + version BIGINT, PRIMARY KEY (id), UNIQUE (id), FOREIGN KEY(id) REFERENCES nodes (id) @@ -61,9 +62,22 @@ FOREIGN KEY(child) REFERENCES nodes (id) ); + CREATE TABLE content ( + id VARCHAR(50) NOT NULL, + value BLOB, + size BIGINT, + version BIGINT, + accessed DATETIME, + PRIMARY KEY (id), + UNIQUE (id), + FOREIGN KEY(id) REFERENCES nodes (id) + ); + + CREATE INDEX ix_content_size ON content(size); + CREATE INDEX ix_content_accessed ON content(accessed); CREATE INDEX ix_parentage_child ON parentage(child); CREATE INDEX ix_nodes_names ON nodes(name); - PRAGMA user_version = 3; + PRAGMA user_version = 4; """ _GEN_DROP_TABLES_SQL = \ @@ -91,6 +105,12 @@ def _1_to_2(conn): def _2_to_3(conn): conn.executescript( + # For people upgrading from the main branch to PR374, this line should make the db queries work. + # The user would also need to old-sync if they had multiple databases *and* were all ready using + # properties in some of them. It's not clear how to do that from here aside from dropping all data. + 'CREATE TABLE IF NOT EXISTS properties (id VARCHAR(50) NOT NULL, owner TEXT NOT NULL, ' + 'key TEXT NOT NULL, value TEXT, PRIMARY KEY (id), FOREIGN KEY(id) REFERENCES nodes (id));' + 'CREATE INDEX IF NOT EXISTS ix_parentage_child ON parentage(child);' # Having changed the schema, the queries can be optimised differently. # In order to be aware of that, re-analyze the type of data and indexes, @@ -100,12 +120,31 @@ def _2_to_3(conn): ) conn.commit() -_migrations = [_0_to_1, _1_to_2, _2_to_3] + +def _3_to_4(conn): + conn.executescript( + 'ALTER TABLE files ADD version BIGINT;' + + 'DROP TABLE IF EXISTS content;' + 'CREATE TABLE content (id VARCHAR(50) NOT NULL, value BLOB, size BIGINT, version BIGINT, accessed DATETIME,' + 'PRIMARY KEY (id), UNIQUE (id), FOREIGN KEY(id) REFERENCES nodes (id)); ' + + 'CREATE INDEX IF NOT EXISTS ix_content_size ON content(size);' + 'CREATE INDEX IF NOT EXISTS ix_content_accessed ON content(accessed);' + # Having changed the schema, the queries can be optimised differently. + # In order to be aware of that, re-analyze the type of data and indexes, + # allowing SQLite3 to make better decisions. 
+ 'ANALYZE;' + 'PRAGMA user_version = 4;' + ) + conn.commit() + +_migrations = [_0_to_1, _1_to_2, _2_to_3, _3_to_4] """list of all migrations from index -> index+1""" class SchemaMixin(object): - _DB_SCHEMA_VER = 3 + _DB_SCHEMA_VER = 4 def init(self): try: diff --git a/acdcli/cache/sync.py b/acdcli/cache/sync.py index a1756dc..c9cfae1 100644 --- a/acdcli/cache/sync.py +++ b/acdcli/cache/sync.py @@ -42,6 +42,7 @@ def remove_purged(self, purged: list): with mod_cursor(self._conn) as c: c.execute('DELETE FROM nodes WHERE id IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM files WHERE id IN %s' % placeholders(slice_), slice_) + c.execute('DELETE FROM content WHERE id IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM parentage WHERE parent IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM parentage WHERE child IN %s' % placeholders(slice_), slice_) c.execute('DELETE FROM properties WHERE id IN %s' % placeholders(slice_), slice_) @@ -117,13 +118,15 @@ def insert_folders(self, folders: list): status=f['status'], md5=None, size=0, + version=0, )) with self.node_cache_lock: if n.is_available: self.node_id_to_node_cache[n.id] = n else: - self.node_id_to_node_cache.clear() + try: del self.node_id_to_node_cache[n.id] + except: pass c.execute( 'INSERT OR REPLACE INTO nodes ' @@ -154,6 +157,7 @@ def insert_files(self, files: list): status=f['status'], md5=f.get('contentProperties', {}).get('md5', 'd41d8cd98f00b204e9800998ecf8427e'), size=f.get('contentProperties', {}).get('size', 0), + version=f.get('contentProperties', {}).get('version', 0), )) with self.node_cache_lock: @@ -163,6 +167,9 @@ def insert_files(self, files: list): try: del self.node_id_to_node_cache[n.id] except: pass + if not n.is_available: + self.remove_content(n.id) + c.execute( 'INSERT OR REPLACE INTO nodes ' '(id, type, name, description, created, modified, updated, status) ' @@ -174,10 +181,11 @@ def insert_files(self, files: list): ] ) c.execute( - 'INSERT OR REPLACE INTO files (id, md5, size) VALUES (?, ?, ?)', + 'INSERT OR REPLACE INTO files (id, md5, size, version) VALUES (?, ?, ?, ?)', [n.id, n.md5, - n.size + n.size, + n.version, ] ) @@ -225,4 +233,18 @@ def insert_property(self, node_id, owner_id, key, value): '(id, owner, key, value) ' 'VALUES (?, ?, ?, ?)', [node_id, owner_id, key, value] - ) \ No newline at end of file + ) + + def insert_content(self, node_id:str, version:int, value:bytes): + with mod_cursor(self._conn) as c: + c.execute('INSERT OR REPLACE INTO content ' + '(id, value, size, version, accessed) ' + 'VALUES (?, ?, ?, ?, ?)', + [node_id, value, len(value), version, datetime.utcnow()] + ) + + def remove_content(self, node_id:str): + with mod_cursor(self._conn) as c: + c.execute('DELETE FROM content WHERE id=?', + [node_id] + ) From d42854490e490f41d0f69e46d52eba17ba0adb53 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 21 Feb 2017 12:28:57 -0500 Subject: [PATCH 44/63] retry on 429 rate throttling errors --- acdcli/api/common.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/acdcli/api/common.py b/acdcli/api/common.py index d6e2246..505f9a3 100644 --- a/acdcli/api/common.py +++ b/acdcli/api/common.py @@ -14,11 +14,14 @@ class ReadTimeoutError(Exception): # status codes that indicate request success OK_CODES = [requests.codes.OK] -RETRY_CODES = [requests.codes.server_error, - requests.codes.gateway_timeout, - requests.codes.request_timeout, - requests.codes.bad_request, - requests.codes.service_unavailable] +RETRY_CODES = 
[requests.codes.server_error, # 500 + requests.codes.service_unavailable, # 503 + requests.codes.gateway_timeout, # 504 + requests.codes.bad_request, # 400 + requests.codes.request_timeout, # 408 + requests.codes.too_many_requests, # 429 + ] + class RequestError(Exception): """Catch-all exception class for various connection and ACD server errors.""" From 3a88b053e810a8c42abf24b3f91175bf9be0e076 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 22 Feb 2017 21:52:06 -0500 Subject: [PATCH 45/63] release file handle lock during file writes to amazon --- acdcli/acd_fuse.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index bfacbd5..0290ae9 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -294,6 +294,9 @@ def release(self, node_id, fh): self._write_and_sync(b, node_id) del self.buffers[node_id] + def remove(self, node_id, fh): + try: del self.buffers[node_id] + except: pass class LoggingMixIn(object): """Modified fusepy LoggingMixIn that does not log read or written bytes @@ -803,22 +806,32 @@ def release(self, path, fh): node_id = self.fh_to_node[fh] else: node_id = self.cache.resolve_id(path) - if node_id: + if not node_id: + raise FuseOSError(errno.ENOENT) + + flush = False + with self.fh_lock: + """release the writer if there's no more interest. This allows many file + handles to write to a single node provided they do it in order. + """ + interest = self.node_to_fh.get(node_id) + if interest: + interest.discard(fh) + if not interest: + flush = True + del self.node_to_fh[node_id] + del self.fh_to_node[fh] + + if flush: self.rp.release(node_id) + self.wp.flush(node_id, None) + self._xattr_write_and_sync() + """make sure no additional file handles showed interest before we get rid of the write buffer""" with self.fh_lock: - """release the writer if there's no more interest. This allows many file - handles to write to a single node provided they do it in order. - """ interest = self.node_to_fh.get(node_id) - if interest: - interest.discard(fh) if not interest: - self.wp.release(node_id, fh) - self._xattr_write_and_sync() - del self.node_to_fh[node_id] - del self.fh_to_node[fh] - else: - raise FuseOSError(errno.ENOENT) + self.wp.remove(node_id, None) + return 0 def utimens(self, path, times=None): """Should set node atime and mtime to values as passed in ``times`` From f760f46b07dbed0ff03b404beed6abeb2cd07be3 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 25 Feb 2017 22:41:31 -0500 Subject: [PATCH 46/63] rely on rsync's later chmod to save an amazon call per file/folder create. --- acdcli/acd_fuse.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 0290ae9..0acdd3b 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -616,7 +616,10 @@ def mkdir(self, path, mode): self.cache.insert_node(r, flush_resolve_cache=False) node_id = r['id'] self.cache.resolve_cache_add(path, node_id) - if mode is not None: + + # TODO: Set properties in the node creation call. Doing it here means we call amazon twice; + # and if we're rsyncing chmod does it a third time. + if False and mode is not None: self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFDIR | (stat.S_IMODE(mode))) self._xattr_write_and_sync() @@ -676,7 +679,9 @@ def create(self, path, mode) -> int: # self._rename(prior_node_id, prior_node_cache.name) FuseOSError.convert(e) - if mode is not None: + # TODO: Set properties in the node creation call. 
Doing it here means we call amazon twice; + # and if we're rsyncing chmod does it a third time. + if False and mode is not None: self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFREG | (stat.S_IMODE(mode))) with self.fh_lock: From 6098ebcc7358a903814ff930c7d26bf95c0251e4 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 25 Feb 2017 23:01:45 -0500 Subject: [PATCH 47/63] the folders table no longer exists --- acdcli/cache/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index 5bf6a4c..ca26d51 100644 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -58,7 +58,7 @@ parent VARCHAR(50) NOT NULL, child VARCHAR(50) NOT NULL, PRIMARY KEY (parent, child), - FOREIGN KEY(parent) REFERENCES folders (id), + FOREIGN KEY(parent) REFERENCES nodes (id), FOREIGN KEY(child) REFERENCES nodes (id) ); From a3e291e7d62120e0c5fadc4e398e8facd295ac80 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Mon, 27 Feb 2017 13:33:15 -0500 Subject: [PATCH 48/63] turns out rsync needs this for proper change detection in some cases. --- acdcli/acd_fuse.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 0acdd3b..0290ae9 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -616,10 +616,7 @@ def mkdir(self, path, mode): self.cache.insert_node(r, flush_resolve_cache=False) node_id = r['id'] self.cache.resolve_cache_add(path, node_id) - - # TODO: Set properties in the node creation call. Doing it here means we call amazon twice; - # and if we're rsyncing chmod does it a third time. - if False and mode is not None: + if mode is not None: self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFDIR | (stat.S_IMODE(mode))) self._xattr_write_and_sync() @@ -679,9 +676,7 @@ def create(self, path, mode) -> int: # self._rename(prior_node_id, prior_node_cache.name) FuseOSError.convert(e) - # TODO: Set properties in the node creation call. Doing it here means we call amazon twice; - # and if we're rsyncing chmod does it a third time. - if False and mode is not None: + if mode is not None: self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFREG | (stat.S_IMODE(mode))) with self.fh_lock: From ac80bec9d252170ab88954ebef3af5565cbbf04b Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 28 Feb 2017 19:41:25 -0500 Subject: [PATCH 49/63] releasing the fh lock can lead to 409 concurrent modification errors in huge directory trees with rsync --- acdcli/acd_fuse.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 0290ae9..3862f6e 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -294,9 +294,6 @@ def release(self, node_id, fh): self._write_and_sync(b, node_id) del self.buffers[node_id] - def remove(self, node_id, fh): - try: del self.buffers[node_id] - except: pass class LoggingMixIn(object): """Modified fusepy LoggingMixIn that does not log read or written bytes @@ -809,7 +806,6 @@ def release(self, path, fh): if not node_id: raise FuseOSError(errno.ENOENT) - flush = False with self.fh_lock: """release the writer if there's no more interest. This allows many file handles to write to a single node provided they do it in order. 
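The release path tightened up in this patch relies on a simple interest count: each open() or create() maps a new file handle to a node id in fh_to_node and adds it to the node_to_fh set, and only the handle that empties that set triggers the final flush of the write buffer. A reduced sketch of that bookkeeping, assuming the caller flushes to the remote side when release() reports the last handle (HandleTracker and its method names are illustrative, not the module's API):

    from collections import defaultdict
    from threading import Lock

    class HandleTracker:
        """Maps file handles to node ids and reports when the last handle
        on a node is released, so its write buffer is flushed exactly once."""

        def __init__(self):
            self._lock = Lock()
            self._next_fh = 0
            self._fh_to_node = {}
            self._node_to_fh = defaultdict(set)

        def open(self, node_id) -> int:
            with self._lock:
                self._next_fh += 1
                fh = self._next_fh
                self._fh_to_node[fh] = node_id
                self._node_to_fh[node_id].add(fh)
                return fh

        def release(self, fh) -> bool:
            """Return True if this was the last open handle on the node."""
            with self._lock:
                node_id = self._fh_to_node.pop(fh)
                interest = self._node_to_fh[node_id]
                interest.discard(fh)
                if not interest:
                    del self._node_to_fh[node_id]
                    return True
                return False

Performing the flush while still holding the lock, as the hunk below reverts to, serializes releases on the same node and avoids the 409 concurrent-modification errors mentioned in the commit message, at the cost of blocking other handle operations during the upload.
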
@@ -818,19 +814,11 @@ def release(self, path, fh): if interest: interest.discard(fh) if not interest: - flush = True + self.rp.release(node_id) + self.wp.release(node_id, None) + self._xattr_write_and_sync() del self.node_to_fh[node_id] del self.fh_to_node[fh] - - if flush: - self.rp.release(node_id) - self.wp.flush(node_id, None) - self._xattr_write_and_sync() - """make sure no additional file handles showed interest before we get rid of the write buffer""" - with self.fh_lock: - interest = self.node_to_fh.get(node_id) - if not interest: - self.wp.remove(node_id, None) return 0 def utimens(self, path, times=None): From c9c07193b6a5b9016ad83c8689681e325e4b899d Mon Sep 17 00:00:00 2001 From: bgemmill Date: Fri, 3 Mar 2017 16:16:55 -0500 Subject: [PATCH 50/63] sparse file support --- acdcli/acd_fuse.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 3862f6e..ee512ef 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -236,9 +236,6 @@ def read(self, offset, length: int): def write(self, offset, bytes_: bytes): with self.lock: self.dirty = True - if offset > self.len: - logger.error('Wrong offset for writing to buffer; writing gap detected') - raise FuseOSError(errno.ESPIPE) self.f.seek(offset) ret = self.f.write(bytes_) self.f.seek(0, os.SEEK_END) From 3d2c26a8565a2fc79780078554edc4c318c6bfd7 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 4 Mar 2017 00:59:19 -0500 Subject: [PATCH 51/63] sparse file support at ends of files too --- acdcli/acd_fuse.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index ee512ef..2ab3100 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -768,30 +768,40 @@ def write(self, path, data, offset, fh) -> int: return len(data) def truncate(self, path, length, fh=None): - """Pseudo-truncates a file, i.e. clears content if ``length``==0 or does nothing - if ``length`` is positive. + """Pseudo-truncates a file, i.e. clears content if ``length``==0 or grows + newly created nodes if ``length`` is greater than the write-back cache size. 
:raises FuseOSError: if pseudo-truncation to length is not supported""" if fh: node_id = self.fh_to_node[fh] + node = self.cache.get_node(node_id) else: - node_id = self.cache.resolve_id(path) - if not node_id: + node = self.cache.resolve(path, trash=False) + if not node: raise FuseOSError(errno.ENOENT) - if length == 0: + # cut file size to 0 + if length == 0 and node.size: try: - r = self.acd_client.clear_file(node_id) + r = self.acd_client.clear_file(node.id) except RequestError as e: raise FuseOSError.convert(e) else: self.cache.insert_node(r, flush_resolve_cache=False) + return 0 - """No good way to deal with positive lengths at the moment; since we can only do - something about it in the middle of writing, this means the only use case we can - capture is when a program over-writes and then truncates back.""" - return 0 + # grow newly created files + if node.size == 0 and length: + size = self.wp.length(node.id, fh) + if size is None: size = node.size + if length > size: + # amazon doesn't understand sparse files, so we send zeros + self.wp.write(node.id, fh, size, bytes(length - size)) + return 0 + + # throw until there's an api for modifying existing files' length + raise FuseOSError(errno.ENOSYS) def release(self, path, fh): """Releases an open ``path``.""" From 96735d5dac20de37c59daf3a890d6c0230d33d93 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sun, 5 Mar 2017 20:25:52 -0500 Subject: [PATCH 52/63] put flush back to try to solve plex issues --- acdcli/acd_fuse.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 2ab3100..7adf5ee 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -767,6 +767,15 @@ def write(self, path, data, offset, fh) -> int: self.wp.write(node_id, fh, offset, data) return len(data) + def flush(self, path, fh): + if fh: + node_id = self.fh_to_node[fh] + else: + node_id = self.cache.resolve_id(path) + if not node_id: + raise FuseOSError(errno.ENOENT) + self.wp.flush(node_id, fh) + def truncate(self, path, length, fh=None): """Pseudo-truncates a file, i.e. clears content if ``length``==0 or grows newly created nodes if ``length`` is greater than the write-back cache size. 
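The two patches above settle on one convention for writes that fall outside what a program has actually sent: growth from truncate() is materialized as literal zero bytes (the in-diff comment notes the remote side has no sparse-file concept), and flush() pushes whatever a node has buffered. A minimal, self-contained sketch of that idea follows; the class and the upload callable are illustrative stand-ins under those assumptions, not the module's WriteProxy API.

import io
from threading import Lock

class NodeWriteBuffer:
    """Toy per-node write-back buffer (a stand-in for the real tempfile-backed one)."""
    def __init__(self):
        self.f = io.BytesIO()
        self.lock = Lock()
        self.dirty = False

    def write(self, offset: int, data: bytes) -> int:
        with self.lock:
            self.dirty = True
            self.f.seek(offset)  # BytesIO zero-fills any gap when writing past EOF
            return self.f.write(data)

    def grow(self, length: int):
        # Emulate an upward truncate(): the upload path sends literal bytes and the
        # remote end has no sparse-file notion, so move EOF by writing one zero byte.
        with self.lock:
            end = self.f.seek(0, io.SEEK_END)
            if length > end:
                self.f.seek(length - 1)
                self.f.write(b'\0')
                self.dirty = True

    def flush(self, upload):
        # 'upload' is any callable taking the full payload, e.g. an overwrite request.
        with self.lock:
            if self.dirty:
                upload(self.f.getvalue())
                self.dirty = False

buf = NodeWriteBuffer()
buf.write(0, b'header')
buf.grow(4096)  # zero-fill out to 4 KiB, the way an rsync-style truncate expects
buf.flush(lambda payload: print('uploading %d bytes' % len(payload)))

Keeping the flush decision per node rather than per file handle is the same bookkeeping the release()/flush() changes above are juggling.
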
From 26325db0f3df4d4b5adf12d35c33ed5496079d64 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Mon, 6 Mar 2017 20:41:06 -0500 Subject: [PATCH 53/63] fix hanging write buffers full of 0s after a truncate if no fh's are open --- acdcli/acd_fuse.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 7adf5ee..284d5fe 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -291,6 +291,11 @@ def release(self, node_id, fh): self._write_and_sync(b, node_id) del self.buffers[node_id] + def remove(self, node_id, fh): + b = self.buffers.get(node_id) + if b: + del self.buffers[node_id] + class LoggingMixIn(object): """Modified fusepy LoggingMixIn that does not log read or written bytes @@ -746,6 +751,9 @@ def open(self, path, flags) -> int: node_id = self.cache.resolve_id(path, False) if not node_id: raise FuseOSError(errno.ENOENT) + return self._open(node_id) + + def _open(self, node_id): with self.fh_lock: self.fh += 1 self.fh_to_node[self.fh] = node_id @@ -791,14 +799,16 @@ def truncate(self, path, length, fh=None): raise FuseOSError(errno.ENOENT) # cut file size to 0 - if length == 0 and node.size: - try: - r = self.acd_client.clear_file(node.id) - except RequestError as e: - raise FuseOSError.convert(e) - else: - self.cache.insert_node(r, flush_resolve_cache=False) - return 0 + if length == 0: + if node.size: + try: + r = self.acd_client.clear_file(node.id) + except RequestError as e: + raise FuseOSError.convert(e) + else: + self.cache.insert_node(r, flush_resolve_cache=False) + self.wp.remove(node.id, None) + return 0 # grow newly created files if node.size == 0 and length: @@ -806,7 +816,9 @@ def truncate(self, path, length, fh=None): if size is None: size = node.size if length > size: # amazon doesn't understand sparse files, so we send zeros - self.wp.write(node.id, fh, size, bytes(length - size)) + internal_fh = self._open(node.id) + self.wp.write(node.id, fh, length-1, bytes(1)) + self.release(path, internal_fh) return 0 # throw until there's an api for modifying existing files' length From 89c33e362368d3fd0c7f70768de63d14bcc31efc Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 7 Mar 2017 18:36:35 -0500 Subject: [PATCH 54/63] lazy xattr writing and general cleanup --- acdcli/acd_fuse.py | 71 +++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 284d5fe..e8f01d4 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -11,8 +11,8 @@ from collections import deque, defaultdict from multiprocessing import Process -from threading import Thread, Lock -from time import time +from threading import Lock, Thread +from time import time, sleep import ctypes.util import binascii @@ -21,6 +21,10 @@ from acdcli.cache.db import CacheConsts +from fuse import FUSE, FuseOSError as FuseError, Operations +from acdcli.api.common import RequestError +from acdcli.utils.conf import get_conf + ctypes.util.__find_library = ctypes.util.find_library def find_library(*args): @@ -32,12 +36,6 @@ def find_library(*args): return ctypes.util.__find_library(*args) ctypes.util.find_library = find_library - -from fuse import FUSE, FuseOSError as FuseError, Operations -from acdcli.api.common import RequestError -from acdcli.utils.conf import get_conf -from acdcli.utils.time import * - logger = logging.getLogger(__name__) try: @@ -56,7 +54,8 @@ def find_library(*args): _XATTR_UID_OVERRIDE_NAME = 'fuse.uid' _XATTR_GID_OVERRIDE_NAME = 
'fuse.gid' _XATTR_SYMLINK_OVERRIDE_NAME = 'fuse.symlink' -_FS_BLOCK_SIZE = 4096 # for stat and statfs calls. This could be anything as long as it's consistent +_XATTR_DELAY = 2 # seconds to wait for additional xattr changes before flushing to amazon +_FS_BLOCK_SIZE = 4096 # for stat and statfs calls. Needs to be consistent and may affect read sizes from fuse _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5, cache_small_file_size=1024) @@ -391,7 +390,6 @@ def __init__(self, **kwargs): p.start() def destroy(self, path): - self._xattr_write_and_sync() self.destroyed.set() def readdir(self, path, fh) -> 'List[str]': @@ -535,9 +533,18 @@ def _xattr_load(self, node_id): try: self.xattr_cache[node_id] = json.loads(xattrs_str) except: self.xattr_cache[node_id] = {} - def _xattr_write_and_sync(self): + def _xattr_flush(self, node_id): + # collect all xattr changes while any fh's are open so we talk to amazon less + with self.fh_lock: + if self.node_to_fh.get(node_id): + return + Thread(target=self._xattr_write_and_sync, args=(node_id,)).start() + + def _xattr_write_and_sync(self, node_id): + # try to collect many xattr changes at once so we talk to amazon less + sleep(_XATTR_DELAY) with self.xattr_cache_lock: - for node_id in self.xattr_dirty: + if node_id in self.xattr_dirty: try: xattrs_str = json.dumps(self.xattr_cache[node_id]) self.acd_client.add_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, @@ -546,7 +553,7 @@ def _xattr_write_and_sync(self): logger.error('Error writing node xattrs "%s". %s' % (node_id, str(e))) else: self.cache.insert_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) - self.xattr_dirty.clear() + self.xattr_dirty.discard(node_id) def read(self, path, length, offset, fh=None) -> bytes: """Read ```length`` bytes from ``path`` at ``offset``.""" @@ -555,7 +562,7 @@ def read(self, path, length, offset, fh=None) -> bytes: node_id = self.fh_to_node[fh] node = self.cache.get_node(node_id) else: - node = self.cache.resolve(path, trash=False) + node = self.cache.resolve(path) if not node: raise FuseOSError(errno.ENOENT) @@ -617,7 +624,7 @@ def mkdir(self, path, mode): self.cache.resolve_cache_add(path, node_id) if mode is not None: self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFDIR | (stat.S_IMODE(mode))) - self._xattr_write_and_sync() + self._xattr_flush(node_id) def _trash(self, path): logger.debug('trash %s' % path) @@ -645,9 +652,11 @@ def unlink(self, path): """Moves a file into ACD trash.""" self._trash(path) - def create(self, path, mode) -> int: - """Creates an empty file at ``path``. + def create(self, path, mode, **kwargs) -> int: + """Creates an empty file at ``path`` with access ``mode``. + :param mode: + :param path: :returns int: file handle""" name = os.path.basename(path) @@ -742,6 +751,7 @@ def _move(self, node, new_folder): def open(self, path, flags) -> int: """Opens a file. + :param path: :param flags: flags defined as in :manpage:`open(2)` :returns: file handle""" @@ -767,8 +777,10 @@ def write(self, path, data, offset, fh) -> int: if fh: node_id = self.fh_to_node[fh] - # This is not resolving by path on purpose, since flushing to - # amazon is done on closing all interested file handles. + else: + # This is not resolving by path on purpose, since flushing to + # amazon is done on closing all interested file handles. 
+ node_id = None if not node_id: raise FuseOSError(errno.ENOENT) @@ -794,7 +806,7 @@ def truncate(self, path, length, fh=None): node_id = self.fh_to_node[fh] node = self.cache.get_node(node_id) else: - node = self.cache.resolve(path, trash=False) + node = self.cache.resolve(path) if not node: raise FuseOSError(errno.ENOENT) @@ -834,6 +846,7 @@ def release(self, path, fh): if not node_id: raise FuseOSError(errno.ENOENT) + last_handle = False with self.fh_lock: """release the writer if there's no more interest. This allows many file handles to write to a single node provided they do it in order. @@ -842,11 +855,15 @@ def release(self, path, fh): if interest: interest.discard(fh) if not interest: - self.rp.release(node_id) - self.wp.release(node_id, None) - self._xattr_write_and_sync() + last_handle = True del self.node_to_fh[node_id] del self.fh_to_node[fh] + + if last_handle: + self.rp.release(node_id) + self.wp.release(node_id, None) + self._xattr_flush(node_id) + return 0 def utimens(self, path, times=None): @@ -854,6 +871,7 @@ def utimens(self, path, times=None): or current time (see :manpage:`utimensat(2)`). Note that this is only implemented for modified time. + :param path: :param times: [atime, mtime]""" node_id = self.cache.resolve_id(path) @@ -869,7 +887,7 @@ def utimens(self, path, times=None): try: self._setxattr(node_id, _XATTR_MTIME_OVERRIDE_NAME, mtime) - self._xattr_write_and_sync() + self._xattr_flush(node_id) except: raise FuseOSError(errno.ENOTSUP) @@ -885,7 +903,7 @@ def _chmod(self, node, mode): mode_perms = stat.S_IMODE(mode) mode_type = stat.S_IFMT(self._getattr(node)['st_mode']) self._setxattr(node.id, _XATTR_MODE_OVERRIDE_NAME, mode_type | mode_perms) - self._xattr_write_and_sync() + self._xattr_flush(node.id) return 0 def chown(self, path, uid, gid): @@ -897,7 +915,7 @@ def chown(self, path, uid, gid): def _chown(self, node_id, uid, gid): if uid != -1: self._setxattr(node_id, _XATTR_UID_OVERRIDE_NAME, uid) if gid != -1: self._setxattr(node_id, _XATTR_GID_OVERRIDE_NAME, gid) - self._xattr_write_and_sync() + self._xattr_flush(node_id) return 0 def symlink(self, target, source): @@ -946,6 +964,7 @@ def readlink(self, path): def mount(path: str, args: dict, **kwargs) -> 'Union[int, None]': """Fusermounts Amazon Cloud Drive to specified mountpoint. + :param path: :raises: RuntimeError :param args: args to pass on to ACDFuse init :param kwargs: fuse mount options as described in :manpage:`fuse(8)`""" From b084226a7b3f1428580feac84cc2395e6c58be73 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 15 Mar 2017 18:49:08 -0400 Subject: [PATCH 55/63] tidy --- acdcli/acd_fuse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index e8f01d4..272c948 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -2,6 +2,7 @@ import configparser import errno +import io import json import logging import os @@ -55,7 +56,7 @@ def find_library(*args): _XATTR_GID_OVERRIDE_NAME = 'fuse.gid' _XATTR_SYMLINK_OVERRIDE_NAME = 'fuse.symlink' _XATTR_DELAY = 2 # seconds to wait for additional xattr changes before flushing to amazon -_FS_BLOCK_SIZE = 4096 # for stat and statfs calls. Needs to be consistent and may affect read sizes from fuse +_FS_BLOCK_SIZE = io.DEFAULT_BUFFER_SIZE # for stat and statfs calls. 
Needs to be consistent and may affect read sizes from fuse _def_conf = configparser.ConfigParser() _def_conf['read'] = dict(open_chunk_limit=10, timeout=5, cache_small_file_size=1024) @@ -450,7 +451,7 @@ def _getattr(self, node, fh=None) -> dict: **attrs) elif node.is_file: # symlink - if mode and stat.S_ISLNK(stat.S_IFMT(mode)): mode = stat.S_IFLNK | 0o0777 + if mode and stat.S_ISLNK(mode): mode = stat.S_IFLNK | 0o0777 # file else: mode = stat.S_IFREG | (stat.S_IMODE(mode) if mode else 0o0666 & ~self.umask) From 1a4b91e6092b272b7f29e61c7bdb03c2d8592bc6 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 15 Mar 2017 22:07:44 -0400 Subject: [PATCH 56/63] speed up smaller syncs by not using the disk --- acdcli/api/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acdcli/api/metadata.py b/acdcli/api/metadata.py index 7645473..c5edcd3 100644 --- a/acdcli/api/metadata.py +++ b/acdcli/api/metadata.py @@ -53,7 +53,7 @@ def get_changes(self, checkpoint='', include_purged=False, silent=True, file=Non if file: tmp = open(file, 'w+b') else: - tmp = tempfile.TemporaryFile('w+b') + tmp = tempfile.SpooledTemporaryFile(max_size=1e9, mode='w+b') try: for line in r.iter_lines(chunk_size=10 * 1024 ** 2, decode_unicode=False): if line: From 1d68ecfc22e2e5b2aa6a310e7ece6fde800beda7 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 15 Mar 2017 23:17:59 -0400 Subject: [PATCH 57/63] make resolve recursive to cache intermediate results --- acdcli/cache/db.py | 4 +- acdcli/cache/query.py | 125 +++++++++++++++++++++++++----------------- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/acdcli/cache/db.py b/acdcli/cache/db.py index d03f92e..d72dbae 100644 --- a/acdcli/cache/db.py +++ b/acdcli/cache/db.py @@ -4,7 +4,7 @@ import re import sqlite3 import sys -from threading import local, Lock +from threading import local, RLock from acdcli.utils.conf import get_conf @@ -71,7 +71,7 @@ def __init__(self, cache_path: str='', settings_path='', check=IntegrityCheckTyp self.node_id_to_node_cache = {} self.path_to_node_id_cache = {} - self.node_cache_lock = Lock() + self.node_cache_lock = RLock() """There are a huge number of repeated path lookups, so cache results and selectively invalidate.""" diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index d2a5c39..b3bffba 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -1,4 +1,5 @@ import logging +import os from datetime import datetime from .cursors import cursor @@ -25,6 +26,12 @@ def datetime_from_string(dt: str) -> datetime: WHERE p.parent = (?) ORDER BY n.name""" +PARENTS_SQL = """SELECT n.*, f.* FROM nodes n + JOIN parentage p ON n.id = p.parent + LEFT OUTER JOIN files f ON n.id = f.id + WHERE p.child = (?) + ORDER BY n.name""" + CHILDRENS_NAMES_SQL = """SELECT n.name FROM nodes n JOIN parentage p ON n.id = p.child WHERE p.parent = (?) 
AND n.status == 'AVAILABLE' @@ -159,14 +166,15 @@ def get_node(self, id) -> 'Union[Node|None]': return self.node_id_to_node_cache[id] except: pass - with cursor(self._conn) as c: - c.execute(NODE_BY_ID_SQL, [id]) - r = c.fetchone() - if r: - n = Node(r) - if n.is_available: + with cursor(self._conn) as c: + c.execute(NODE_BY_ID_SQL, [id]) + r = c.fetchone() + if r: + n = Node(r) + if n.is_available: + with self.node_cache_lock: self.node_id_to_node_cache[n.id] = n - return n + return n def get_root_node(self): return self.get_node(self.root_id) @@ -179,59 +187,50 @@ def get_conflicting_node(self, name: str, parent_id: str): if r: return Node(r) - def resolve_id(self, path: str, trash=False) -> str: + def resolve_id(self, path: str, trash=False) -> 'Union[str|None]': + n = self.resolve(path, trash) + if n: + return n.id + + def resolve(self, path: str, trash=False) -> 'Union[Node|None]': with self.node_cache_lock: try: - return self.path_to_node_id_cache[path] + return self.get_node(self.path_to_node_id_cache[path]) except: pass - n = self._resolve(path, trash) - if n: - self.node_id_to_node_cache[n.id] = n - self.path_to_node_id_cache[path] = n.id - return n.id - return None - def resolve(self, path: str, trash=False) -> 'Union[Node|None]': - """Gets a node from a path""" - id = self.resolve_id(path=path, trash=trash) - return self.get_node(id=id) if id else None - - def _resolve(self, path: str, trash=False) -> 'Union[Node|None]': - segments = list(filter(bool, path.split('/'))) - if not segments: - if not self.root_id: - return - with cursor(self._conn) as c: - c.execute(NODE_BY_ID_SQL, [self.root_id]) - r = c.fetchone() - return Node(r) + parent_path, name = os.path.split(path) + if not name: + r = self.get_root_node() + with self.node_cache_lock: + self.node_id_to_node_cache[r.id] = r + self.path_to_node_id_cache[path] = r.id + return r - parent = self.root_id - for i, segment in enumerate(segments): - with cursor(self._conn) as c: - c.execute(CHILD_OF_SQL, [segment, parent]) - r = c.fetchone() - r2 = c.fetchone() + parent = self.resolve(parent_path, trash=trash) + if not parent: + return - if not r: - return - r = Node(r) + with cursor(self._conn) as c: + c.execute(CHILD_OF_SQL, [name, parent.id]) + r = c.fetchone() + r2 = c.fetchone() + if not r: + return + r = Node(r) - if not r.is_available: - if not trash: - return - if r2: - logger.debug('None-unique trash name "%s" in %s.' % (segment, parent)) - return - if i + 1 == len(segments): - return r - if r.is_folder: - parent = r.id - continue - else: + if not r.is_available: + if not trash: + return + if r2: + logger.debug('None-unique trash name "%s" in %s.' 
% (name, parent)) return + with self.node_cache_lock: + self.node_id_to_node_cache[r.id] = r + self.path_to_node_id_cache[path] = r.id + return r + def childrens_names(self, folder_id) -> 'List[str]': with cursor(self._conn) as c: c.execute(CHILDRENS_NAMES_SQL, [folder_id]) @@ -331,6 +330,29 @@ def first_path(self, node_id: str) -> str: return node.simple_name return self.first_path(node.id) + node.name + '/' + def all_path(self, node_id: str, path_suffix=None) -> 'List[str]': + if node_id == self.root_id: + return ["/" + path_suffix] + + n = self.get_node(node_id) + if not n: + return [] + if path_suffix: + path_suffix = os.path.join(n.name, path_suffix) + else: + path_suffix = n.name + + ret = [] + with cursor(self._conn) as c: + c.execute(PARENTS_SQL, [n.id]) + parent = c.fetchone() + while parent: + parent = Node(parent) + if parent.is_available: + ret += self.all_path(parent.id, path_suffix) + parent = c.fetchone() + return ret + def find_by_name(self, name: str) -> 'List[Node]': nodes = [] with cursor(self._conn) as c: @@ -379,7 +401,8 @@ def get_property(self, node_id, owner_id, key) -> 'Union[str|None]': def get_content(self, node_id:str, version:int) -> 'Union[bytes|None]': if version == 0: return None with cursor(self._conn) as c: - c.execute(CONTENT_ACCESSED_SQL, [datetime.utcnow(), node_id]) + # Uncomment if/when we want to purge the cache based on LRU. Until then reduce the db load. + # c.execute(CONTENT_ACCESSED_SQL, [datetime.utcnow(), node_id]) c.execute(CONTENT_BY_ID_SQL, [node_id, version]) r = c.fetchone() if r: From a5fa452f474e55044acb415e519fa35ff7284b95 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 15 Mar 2017 23:18:55 -0400 Subject: [PATCH 58/63] fix schema for properties table --- acdcli/cache/schema.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index ca26d51..0a8686f 100644 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -33,7 +33,7 @@ owner TEXT NOT NULL, key TEXT NOT NULL, value TEXT, - PRIMARY KEY (id), + PRIMARY KEY (id, owner, key), FOREIGN KEY(id) REFERENCES nodes (id) ); @@ -76,6 +76,7 @@ CREATE INDEX ix_content_size ON content(size); CREATE INDEX ix_content_accessed ON content(accessed); CREATE INDEX ix_parentage_child ON parentage(child); + CREATE INDEX ix_parentage_parent ON parentage(parent); CREATE INDEX ix_nodes_names ON nodes(name); PRAGMA user_version = 4; """ @@ -109,9 +110,10 @@ def _2_to_3(conn): # The user would also need to old-sync if they had multiple databases *and* were all ready using # properties in some of them. It's not clear how to do that from here aside from dropping all data. 'CREATE TABLE IF NOT EXISTS properties (id VARCHAR(50) NOT NULL, owner TEXT NOT NULL, ' - 'key TEXT NOT NULL, value TEXT, PRIMARY KEY (id), FOREIGN KEY(id) REFERENCES nodes (id));' + 'key TEXT NOT NULL, value TEXT, PRIMARY KEY (id, owner, key), FOREIGN KEY(id) REFERENCES nodes (id));' 'CREATE INDEX IF NOT EXISTS ix_parentage_child ON parentage(child);' + 'CREATE INDEX IF NOT EXISTS ix_parentage_parent ON parentage(parent);' # Having changed the schema, the queries can be optimised differently. # In order to be aware of that, re-analyze the type of data and indexes, # allowing SQLite3 to make better decisions. 
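The primary-key change above is the substantive part of the schema fix: with PRIMARY KEY (id) the properties table could hold exactly one row per node, so a second (owner, key) pair for the same node hit a constraint violation. A throwaway sqlite3 session makes the difference visible; the table mirrors the schema above, while the UPSERT helper is only for illustration and assumes SQLite 3.24 or newer.

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE properties ('
             'id VARCHAR(50) NOT NULL, owner TEXT NOT NULL, '
             'key TEXT NOT NULL, value TEXT, '
             'PRIMARY KEY (id, owner, key))')

def set_property(node_id, owner, key, value):
    # Idempotent write: one row per (id, owner, key), updated in place on conflict.
    conn.execute('INSERT INTO properties (id, owner, key, value) VALUES (?, ?, ?, ?) '
                 'ON CONFLICT (id, owner, key) DO UPDATE SET value = excluded.value',
                 (node_id, owner, key, value))

set_property('node-1', 'owner-a', 'xattrs', '{"fuse.mode": 33188}')
set_property('node-1', 'owner-a', 'xattrs', '{"fuse.mode": 33261}')  # overwrite, no IntegrityError
set_property('node-1', 'owner-a', 'another-key', 'fits too')  # impossible under the old single-column key
print(conn.execute("SELECT key, value FROM properties WHERE id = 'node-1'").fetchall())
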
From f60ba358f91c0fb0da0f90d9c6b5b516f86ec898 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Tue, 21 Mar 2017 17:36:35 -0400 Subject: [PATCH 59/63] prevent xattr cache from falling out of sync on failed acd calls --- acdcli/acd_fuse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 272c948..10e03c6 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -552,6 +552,8 @@ def _xattr_write_and_sync(self, node_id): xattrs_str) except (RequestError, IOError) as e: logger.error('Error writing node xattrs "%s". %s' % (node_id, str(e))) + try: del self.xattr_cache[node_id] + except: pass else: self.cache.insert_property(node_id, self.acd_client_owner, _XATTR_PROPERTY_NAME, xattrs_str) self.xattr_dirty.discard(node_id) From 2583cb0909b49f229cdf2697525b77ec475a81ef Mon Sep 17 00:00:00 2001 From: bgemmill Date: Wed, 29 Mar 2017 16:40:55 -0400 Subject: [PATCH 60/63] the fuse subdir module sometimes leaves trailing slashes on directory paths --- acdcli/cache/query.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/acdcli/cache/query.py b/acdcli/cache/query.py index b3bffba..b7c66fe 100644 --- a/acdcli/cache/query.py +++ b/acdcli/cache/query.py @@ -193,6 +193,7 @@ def resolve_id(self, path: str, trash=False) -> 'Union[str|None]': return n.id def resolve(self, path: str, trash=False) -> 'Union[Node|None]': + path = path.rstrip('/') with self.node_cache_lock: try: return self.get_node(self.path_to_node_id_cache[path]) @@ -304,6 +305,8 @@ def list_children(self, folder_id, trash=False, folder_path=None) -> 'Tuple[List """If the caller provides the folder_path, we can add all the children to the path->node_id cache for faster lookup after a directory listing""" + if folder_path: + folder_path = folder_path.rstrip('/') with self.node_cache_lock: for c in folders + files: if c.is_available: @@ -332,7 +335,7 @@ def first_path(self, node_id: str) -> str: def all_path(self, node_id: str, path_suffix=None) -> 'List[str]': if node_id == self.root_id: - return ["/" + path_suffix] + return ['/' + path_suffix] n = self.get_node(node_id) if not n: From e96644071f88e893350f889e7e256b4e6714dc03 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Fri, 26 May 2017 19:12:46 -0400 Subject: [PATCH 61/63] Merge branch 'master' of https://github.com/yadayada/acd_cli # Conflicts: # acdcli/cache/schema.py # docs/contributors.rst --- acdcli/cache/schema.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/acdcli/cache/schema.py b/acdcli/cache/schema.py index be5cf27..6c278c1 100755 --- a/acdcli/cache/schema.py +++ b/acdcli/cache/schema.py @@ -133,25 +133,16 @@ def _3_to_4(conn): # properties in some of them. It's not clear how to do that from here aside from dropping all data. 'CREATE TABLE IF NOT EXISTS properties (id VARCHAR(50) NOT NULL, owner TEXT NOT NULL, ' 'key TEXT NOT NULL, value TEXT, PRIMARY KEY (id, owner, key), FOREIGN KEY(id) REFERENCES nodes (id));' - - 'CREATE INDEX IF NOT EXISTS ix_parentage_child ON parentage(child);' - 'CREATE INDEX IF NOT EXISTS ix_parentage_parent ON parentage(parent);' - # Having changed the schema, the queries can be optimised differently. - # In order to be aware of that, re-analyze the type of data and indexes, - # allowing SQLite3 to make better decisions. 
- 'ANALYZE;' - 'PRAGMA user_version = 3;' - ) - - conn.executescript( + 'ALTER TABLE files ADD version BIGINT;' - 'DROP TABLE IF EXISTS content;' 'CREATE TABLE content (id VARCHAR(50) NOT NULL, value BLOB, size BIGINT, version BIGINT, accessed DATETIME,' 'PRIMARY KEY (id), UNIQUE (id), FOREIGN KEY(id) REFERENCES nodes (id)); ' 'CREATE INDEX IF NOT EXISTS ix_content_size ON content(size);' 'CREATE INDEX IF NOT EXISTS ix_content_accessed ON content(accessed);' + 'CREATE INDEX IF NOT EXISTS ix_parentage_parent ON parentage(parent);' + # Having changed the schema, the queries can be optimised differently. # In order to be aware of that, re-analyze the type of data and indexes, # allowing SQLite3 to make better decisions. From 3f3bed85ea8227297bc0bbb53c4405344d9121db Mon Sep 17 00:00:00 2001 From: bgemmill Date: Sat, 27 May 2017 08:58:28 -0400 Subject: [PATCH 62/63] remove legacy symlink handling since amazon purged properties during acdcli's ban --- acdcli/acd_fuse.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/acdcli/acd_fuse.py b/acdcli/acd_fuse.py index 10e03c6..9aef97a 100644 --- a/acdcli/acd_fuse.py +++ b/acdcli/acd_fuse.py @@ -926,7 +926,6 @@ def symlink(self, target, source): fh = self.create(target, None) node_id = self.fh_to_node[fh] self._setxattr(node_id, _XATTR_MODE_OVERRIDE_NAME, stat.S_IFLNK | 0o0777) - # self._setxattr(node_id, _XATTR_SYMLINK_OVERRIDE_NAME, source) self.write(target, source_bytes, 0, fh) self.release(target, fh) return 0 @@ -938,18 +937,6 @@ def readlink(self, path): source = None - # amazon reduced property size (all our xattr space) to 500 characters or less, - # so we're moving symlinks to file bodies. - try: source = self._getxattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) - except: pass - if source is not None: - logger.debug("readlink: upgrading node: %s path: %s" % (node.id, path)) - source_bytes = source.encode('utf-8') - fh = self.open(path, 0) - self.write(path, source_bytes, 0, fh) - self.release(path, fh) - self._removexattr(node.id, _XATTR_SYMLINK_OVERRIDE_NAME) - if source is None: source_bytes = self.cache.get_content(node.id, node.version) if source_bytes is not None: From e2554a0210c2f2530e9fedc07b6bbb3274d5e018 Mon Sep 17 00:00:00 2001 From: bgemmill Date: Mon, 29 May 2017 23:02:06 -0400 Subject: [PATCH 63/63] docs --- docs/configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 4d8658b..1f8540b 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -81,7 +81,7 @@ fuse.ini :: [fs] ;block size used for size info - block_size = 512 + block_size = io.DEFAULT_BUFFER_SIZE [read] ;maximal number of simultaneously opened chunks per file
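On the block_size setting the final docs patch touches: inside the FUSE layer that value is used for the stat and statfs replies (and, per the comment on _FS_BLOCK_SIZE earlier in the series, may influence the read sizes FUSE issues), so the arithmetic involved is small. The sketch below shows the usual derivation of those fields; the helper names are placeholders rather than acd_fuse functions, and note that st_blocks is defined in 512-byte units regardless of the advertised block size.

import io

BLOCK_SIZE = io.DEFAULT_BUFFER_SIZE  # typically 8192 on CPython

def stat_blocks(size: int) -> int:
    # st_blocks counts 512-byte units, independent of st_blksize (see stat(2)).
    return -(-size // 512)  # ceiling division

def statfs_fields(total_bytes: int, free_bytes: int, block_size: int = BLOCK_SIZE) -> dict:
    # statvfs reports capacity in units of f_frsize; keep f_bsize consistent with it.
    return dict(f_bsize=block_size,
                f_frsize=block_size,
                f_blocks=total_bytes // block_size,
                f_bfree=free_bytes // block_size,
                f_bavail=free_bytes // block_size)

print(stat_blocks(10000))  # 20 blocks of 512 bytes
print(statfs_fields(100 * 2 ** 30, 40 * 2 ** 30))  # a 100 GiB volume with 40 GiB free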