# -*- coding: utf-8 -*- """Check database consistency, with some ability to fix problems. This should be able to fix cases where a database gets corrupted due to unexpected system shutdown, and detect other cases that may cause NilmDB to return errors when trying to manipulate the database.""" import nilmdb.utils import nilmdb.server import nilmdb.client.numpyclient from nilmdb.utils.interval import IntervalError from nilmdb.server.interval import Interval, IntervalSet from nilmdb.utils.printf import printf, fprintf, sprintf from collections import defaultdict import sqlite3 import os import sys import progressbar import re import shutil import pickle import numpy class FsckError(Exception): def __init__(self, msg="", *args): if args: msg = sprintf(msg, *args) Exception.__init__(self, msg) class FixableFsckError(FsckError): def __init__(self, msg=""): FsckError.__init__(self, f'{msg}\nThis may be fixable with "--fix".') class RetryFsck(FsckError): pass class FsckFormatError(FsckError): pass def log(format, *args): printf(format, *args) def err(format, *args): fprintf(sys.stderr, format, *args) # Decorator that retries a function if it returns a specific value def retry_if_raised(exc, message=None, max_retries=1000): def f1(func): def f2(*args, **kwargs): for n in range(max_retries): try: return func(*args, **kwargs) except exc: if message: log(f"{message} ({n+1})\n\n") raise Exception("Max number of retries (%d) exceeded; giving up" % max_retries) return f2 return f1 class Progress(object): def __init__(self, maxval): if maxval == 0: maxval = 1 self.bar = progressbar.ProgressBar( maxval=maxval, widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]) self.bar.term_width = self.bar.term_width or 75 def __enter__(self): self.bar.start() self.last_update = 0 return self def __exit__(self, exc_type, exc_value, traceback): if exc_type is None: self.bar.finish() else: printf("\n") def update(self, val): self.bar.update(val) class Fsck(object): def __init__(self, path, fix=False): self.basepath = path self.sqlpath = os.path.join(path, "data.sql") self.bulkpath = os.path.join(path, "data") self.bulklock = os.path.join(path, "data.lock") self.fix = fix ### Main checks @retry_if_raised(RetryFsck, "Something was fixed: restarting fsck") def check(self, skip_data=False): self.bulk = None self.sql = None try: self.check_paths() self.check_sql() self.check_streams() self.check_intervals() if skip_data: log("skipped data check\n") else: self.check_data() finally: if self.bulk: self.bulk.close() if self.sql: # pragma: no cover # (coverage doesn't handle finally clauses correctly; # both branches here are tested) self.sql.commit() self.sql.close() log("ok\n") ### Check basic path structure def check_paths(self): log("checking paths\n") if self.bulk: self.bulk.close() if not os.path.isfile(self.sqlpath): raise FsckError("SQL database missing (%s)", self.sqlpath) if not os.path.isdir(self.bulkpath): raise FsckError("Bulk data directory missing (%s)", self.bulkpath) with open(self.bulklock, "w") as lockfile: if not nilmdb.utils.lock.exclusive_lock(lockfile): raise FsckError('Database already locked by another process\n' 'Make sure all other processes that might be ' 'using the database are stopped.\n' 'Restarting apache will cause it to unlock ' 'the db until a request is received.') # unlocked immediately self.bulk = nilmdb.server.bulkdata.BulkData(self.basepath) ### Check SQL database health def check_sql(self): log("checking sqlite database\n") self.sql = sqlite3.connect(self.sqlpath) with self.sql: cur = self.sql.cursor() ver = cur.execute("PRAGMA user_version").fetchone()[0] good = max(nilmdb.server.nilmdb._sql_schema_updates.keys()) if ver != good: raise FsckError("database version %d too old, should be %d", ver, good) self.stream_path = {} self.stream_layout = {} log(" loading paths\n") result = cur.execute("SELECT id, path, layout FROM streams") for r in result: if r[0] in self.stream_path: raise FsckError("duplicated ID %d in stream IDs", r[0]) self.stream_path[r[0]] = r[1] self.stream_layout[r[0]] = r[2] log(" loading intervals\n") self.stream_interval = defaultdict(list) result = cur.execute("SELECT stream_id, start_time, end_time, " "start_pos, end_pos FROM ranges " "ORDER BY start_time") for r in result: if r[0] not in self.stream_path: raise FsckError("interval ID %d not in streams", r[0]) self.stream_interval[r[0]].append((r[1], r[2], r[3], r[4])) log(" loading metadata\n") self.stream_meta = defaultdict(dict) result = cur.execute("SELECT stream_id, key, value FROM metadata") for r in result: if r[0] not in self.stream_path: raise FsckError("metadata ID %d not in streams", r[0]) if r[1] in self.stream_meta[r[0]]: raise FsckError( "duplicate metadata key '%s' for stream %d", r[1], r[0]) self.stream_meta[r[0]][r[1]] = r[2] ### Check streams and basic interval overlap def check_streams(self): ids = list(self.stream_path.keys()) log("checking %s streams\n", "{:,d}".format(len(ids))) with Progress(len(ids)) as pbar: for i, sid in enumerate(ids): pbar.update(i) path = self.stream_path[sid] # unique path, valid layout if list(self.stream_path.values()).count(path) != 1: raise FsckError("duplicated path %s", path) layout = self.stream_layout[sid].split('_')[0] if layout not in ('int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float32', 'float64'): raise FsckError("bad layout %s for %s", layout, path) count = int(self.stream_layout[sid].split('_')[1]) if count < 1 or count > 1024: raise FsckError("bad count %d for %s", count, path) # must exist in bulkdata bulk = self.bulkpath + path bulk = bulk.encode('utf-8') if not os.path.isdir(bulk): raise FsckError("%s: missing bulkdata dir", path) if not nilmdb.server.bulkdata.Table.exists(bulk): raise FsckError("%s: bad bulkdata table", path) # intervals don't overlap. Abuse IntervalSet to check # for intervals in file positions, too. timeiset = IntervalSet() posiset = IntervalSet() for (stime, etime, spos, epos) in self.stream_interval[sid]: new = Interval(stime, etime) try: timeiset += new except IntervalError: raise FsckError("%s: overlap in intervals:\n" "set: %s\nnew: %s", path, str(timeiset), str(new)) if spos != epos: new = Interval(spos, epos) try: posiset += new except IntervalError: self.fix_row_overlap(sid, path, posiset, new) try: # Check bulkdata self.check_bulkdata(sid, path, bulk) # Check that we can open bulkdata tab = nilmdb.server.bulkdata.Table(bulk) except FsckFormatError as e: # If there are no files except _format, try deleting # the entire stream; this may remove metadata, but # it's probably unimportant. files = list(os.listdir(bulk)) if len(files) > 1: raise FsckFormatError(f"{path}: can't load _format, " f"but data is also present") # Since the stream was empty, just remove it self.fix_remove_stream(sid, path, bulk, "empty, with corrupted format file") except FsckError as e: raise e except Exception as e: # pragma: no cover # No coverage because this is an unknown/unexpected error raise FsckError("%s: can't open bulkdata: %s", path, str(e)) tab.close() def fix_row_overlap(self, sid, path, existing, new): # If the file rows (spos, epos) overlap in the interval table, # and the overlapping ranges look like this: # A --------- C # B -------- D # Then we can try changing the first interval to go from # A to B instead. msg = (f"{path}: overlap in file offests:\n" f"existing ranges: {existing}\n" f"overlapping interval: {new}") if not self.fix: raise FixableFsckError(msg) err(f"\n{msg}\nSeeing if we can truncate one of them...\n") # See if there'e exactly one interval that overlaps the # conflicting one in the right way match = None for intv in self.stream_interval[sid]: (stime, etime, spos, epos) = intv if spos < new.start and epos > new.start: if match: err(f"no, more than one interval matched:\n" f"{intv}\n{match}\n") raise FsckError(f"{path}: unfixable overlap") match = intv if match is None: err(f"no intervals overlapped in the right way\n") raise FsckError(f"{path}: unfixable overlap") # Truncate the file position err(f"truncating {match}\n") with self.sql: cur = self.sql.cursor() cur.execute("UPDATE ranges SET end_pos=? " "WHERE stream_id=? AND start_time=? AND " "end_time=? AND start_pos=? AND end_pos=?", (new.start, sid, *match)) if cur.rowcount != 1: # pragma: no cover (shouldn't fail) raise FsckError("failed to fix SQL database") raise RetryFsck ### Check that bulkdata is good enough to be opened @retry_if_raised(RetryFsck) def check_bulkdata(self, sid, path, bulk): try: with open(os.path.join(bulk, b"_format"), "rb") as f: fmt = pickle.load(f) except Exception as e: raise FsckFormatError(f"{path}: can't load _format file ({e})") if fmt["version"] != 3: raise FsckFormatError("%s: bad or unsupported bulkdata version %d", path, fmt["version"]) rows_per_file = int(fmt["rows_per_file"]) if rows_per_file < 1: raise FsckFormatError(f"{path}: bad rows_per_file {rows_per_file}") files_per_dir = int(fmt["files_per_dir"]) if files_per_dir < 1: raise FsckFormatError(f"{path}: bad files_per_dir {files_per_dir}") layout = fmt["layout"] if layout != self.stream_layout[sid]: raise FsckFormatError("%s: layout mismatch %s != %s", path, layout, self.stream_layout[sid]) # Every file should have a size that's the multiple of the row size rkt = nilmdb.server.rocket.Rocket(layout, None) row_size = rkt.binary_size rkt.close() # Find all directories regex = re.compile(b"^[0-9a-f]{4,}$") subdirs = sorted(filter(regex.search, os.listdir(bulk)), key=lambda x: int(x, 16), reverse=True) for subdir in subdirs: # Find all files in that dir subpath = os.path.join(bulk, subdir) files = list(filter(regex.search, os.listdir(subpath))) if not files: self.fix_empty_subdir(subpath) # Verify that their size is a multiple of the row size for filename in files: filepath = os.path.join(subpath, filename) offset = os.path.getsize(filepath) if offset % row_size: self.fix_bad_filesize(path, filepath, offset, row_size) def fix_empty_subdir(self, subpath): msg = sprintf("bulkdata path %s is missing data files", subpath) if not self.fix: raise FixableFsckError(msg) # Try to fix it by just deleting whatever is present, # as long as it's only ".removed" files. err("\n%s\n", msg) for fn in os.listdir(subpath): if not fn.endswith(b".removed"): raise FsckError("can't fix automatically: please manually " "remove the file '%s' and try again", os.path.join(subpath, fn).decode( 'utf-8', errors='backslashreplace')) # Remove the whole thing err("Removing empty subpath\n") shutil.rmtree(subpath) raise RetryFsck def fix_bad_filesize(self, path, filepath, offset, row_size): extra = offset % row_size msg = sprintf("%s: size of file %s (%d) is not a multiple" + " of row size (%d): %d extra bytes present", path, filepath, offset, row_size, extra) if not self.fix: raise FixableFsckError(msg) # Try to fix it by just truncating the file err("\n%s\n", msg) newsize = offset - extra err("Truncating file to %d bytes and retrying\n", newsize) with open(filepath, "r+b") as f: f.truncate(newsize) raise RetryFsck def fix_remove_stream(self, sid, path, bulk, reason): msg = f"stream {path} is corrupted: {reason}" if not self.fix: raise FixableFsckError(msg) # Remove the stream from disk and the database err(f"\n{msg}\n") err(f"Removing stream {path} from disk and database\n") shutil.rmtree(bulk) with self.sql: cur = self.sql.cursor() cur.execute("DELETE FROM streams WHERE id=?", (sid,)) if cur.rowcount != 1: # pragma: no cover (shouldn't fail) raise FsckError("failed to remove stream") cur.execute("DELETE FROM ranges WHERE stream_id=?", (sid,)) cur.execute("DELETE FROM metadata WHERE stream_id=?", (sid,)) raise RetryFsck ### Check interval endpoints def check_intervals(self): total_ints = sum(len(x) for x in list(self.stream_interval.values())) log("checking %s intervals\n", "{:,d}".format(total_ints)) done = 0 with Progress(total_ints) as pbar: for sid in self.stream_interval: try: bulk = self.bulkpath + self.stream_path[sid] bulk = bulk.encode('utf-8') tab = nilmdb.server.bulkdata.Table(bulk) def update(x): pbar.update(done + x) ints = self.stream_interval[sid] done += self.check_table_intervals(sid, ints, tab, update) finally: tab.close() def check_table_intervals(self, sid, ints, tab, update): # look in the table to make sure we can pick out the interval's # endpoints path = self.stream_path[sid] # noqa: F841 unused tab.file_open.cache_remove_all() for (i, intv) in enumerate(ints): update(i) (stime, etime, spos, epos) = intv if spos == epos and spos >= 0 and spos <= tab.nrows: continue try: srow = tab[spos] # noqa: F841 unused erow = tab[epos-1] # noqa: F841 unused except Exception as e: self.fix_bad_interval(sid, intv, tab, str(e)) return len(ints) def fix_bad_interval(self, sid, intv, tab, msg): path = self.stream_path[sid] msg = sprintf("%s: interval %s error accessing rows: %s", path, str(intv), str(msg)) if not self.fix: raise FixableFsckError(msg) err("\n%s\n", msg) (stime, etime, spos, epos) = intv # If it's just that the end pos is more than the number of rows # in the table, lower end pos and truncate interval time too. if spos < tab.nrows and epos >= tab.nrows: err("end position is past endrows, but it can be truncated\n") err("old end: time %d, pos %d\n", etime, epos) new_epos = tab.nrows new_etime = tab[new_epos-1] + 1 err("new end: time %d, pos %d\n", new_etime, new_epos) if stime < new_etime: # Change it in SQL with self.sql: cur = self.sql.cursor() cur.execute("UPDATE ranges SET end_time=?, end_pos=? " "WHERE stream_id=? AND start_time=? AND " "end_time=? AND start_pos=? AND end_pos=?", (new_etime, new_epos, sid, stime, etime, spos, epos)) if cur.rowcount != 1: # pragma: no cover (shouldn't fail) raise FsckError("failed to fix SQL database") raise RetryFsck err("actually it can't be truncated; times are bad too\n") # Otherwise, the only hope is to delete the interval entirely. err("*** Deleting the entire interval from SQL.\n") err("This may leave stale data on disk. To fix that, copy all " "data from this stream to a new stream using nilm-copy, then\n") err("remove all data from and destroy %s.\n", path) with self.sql: cur = self.sql.cursor() cur.execute("DELETE FROM ranges WHERE " "stream_id=? AND start_time=? AND " "end_time=? AND start_pos=? AND end_pos=?", (sid, stime, etime, spos, epos)) if cur.rowcount != 1: # pragma: no cover (shouldn't fail) raise FsckError("failed to remove interval") raise RetryFsck ### Check data in each interval def check_data(self): total_rows = sum(sum((y[3] - y[2]) for y in x) for x in list(self.stream_interval.values())) log("checking %s rows of data\n", "{:,d}".format(total_rows)) done = 0 with Progress(total_rows) as pbar: for sid in self.stream_interval: try: bulk = self.bulkpath + self.stream_path[sid] bulk = bulk.encode('utf-8') tab = nilmdb.server.bulkdata.Table(bulk) def update(x): pbar.update(done + x) ints = self.stream_interval[sid] done += self.check_table_data(sid, ints, tab, update) finally: tab.close() def check_table_data(self, sid, ints, tab, update): # Pull out all of the interval's data and verify that it's # monotonic. maxrows = getattr(self, 'maxrows_override', 100000) path = self.stream_path[sid] layout = self.stream_layout[sid] dtype = nilmdb.client.numpyclient.layout_to_dtype(layout) tab.file_open.cache_remove_all() done = 0 for intv in ints: last_ts = None (stime, etime, spos, epos) = intv # Break interval into maxrows-sized chunks next_start = spos while next_start < epos: start = next_start stop = min(start + maxrows, epos) count = stop - start next_start = stop # Get raw data, convert to NumPy arary try: raw = tab.get_data(start, stop, binary=True) data = numpy.frombuffer(raw, dtype) except Exception as e: # pragma: no cover # No coverage because it's hard to trigger this -- earlier # checks check the ranges, so this would probably be a real # disk error, malloc failure, etc. raise FsckError( "%s: failed to grab rows %d through %d: %s", path, start, stop, repr(e)) ts = data['timestamp'] # Verify that all timestamps are in range. match = (ts < stime) | (ts >= etime) if match.any(): row = numpy.argmax(match) if ts[row] != 0: raise FsckError("%s: data timestamp %d at row %d " "outside interval range [%d,%d)", path, ts[row], row + start, stime, etime) # Timestamp is zero and out of the expected range; # assume file ends with zeroed data and just truncate it. self.fix_table_by_truncating( path, tab, row + start, "data timestamp is out of range, and zero") # Verify that timestamps are monotonic match = numpy.diff(ts) <= 0 if match.any(): row = numpy.argmax(match) if ts[row+1] != 0: raise FsckError("%s: non-monotonic timestamp (%d -> %d)" " at row %d", path, ts[row], ts[row+1], row + start) # Timestamp is zero and non-monotonic; # assume file ends with zeroed data and just truncate it. self.fix_table_by_truncating( path, tab, row + start + 1, "data timestamp is non-monotonic, and zero") first_ts = ts[0] if last_ts is not None and first_ts <= last_ts: raise FsckError("%s: first interval timestamp %d is not " "greater than the previous last interval " "timestamp %d, at row %d", path, first_ts, last_ts, start) last_ts = ts[-1] # The previous errors are fixable, by removing the # offending intervals, or changing the data # timestamps. But these are probably unlikely errors, # so it's not worth implementing that yet. # Done done += count update(done) return done def fix_table_by_truncating(self, path, tab, row, reason): # Simple fix for bad data: truncate the table at the given row. # On retry, fix_bad_interval will correct the database and timestamps # to account for this truncation. msg = f"{path}: bad data in table, starting at row {row}: {reason}" if not self.fix: raise FixableFsckError(msg) err(f"\n{msg}\nWill try truncating table\n") (subdir, fname, offs, count) = tab._offset_from_row(row) tab._remove_or_truncate_file(subdir, fname, offs) raise RetryFsck