|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484 |
- # -*- coding: utf-8 -*-
-
- """Check database consistency, with some ability to fix problems.
- This should be able to fix cases where a database gets corrupted due
- to unexpected system shutdown, and detect other cases that may cause
- NilmDB to return errors when trying to manipulate the database."""
-
# Standard library
import functools
import os
import pickle
import re
import shutil
import sqlite3
import sys
from collections import defaultdict

# Third-party
import numpy
import progressbar

# Local
import nilmdb.client.numpyclient
import nilmdb.server
import nilmdb.utils
from nilmdb.server.interval import Interval, IntervalSet
from nilmdb.utils.interval import IntervalError
from nilmdb.utils.printf import printf, fprintf, sprintf
-
-
class FsckError(Exception):
    """Base class for problems found while checking the database."""

    def __init__(self, msg="", *args):
        # printf-style formatting is applied only when extra args are given.
        if args:
            msg = sprintf(msg, *args)
        super().__init__(msg)
-
-
class FixableFsckError(FsckError):
    """A consistency problem that `--fix` may be able to repair."""

    def __init__(self, msg="", *args):
        formatted = sprintf(msg, *args) if args else msg
        FsckError.__init__(
            self, f'{formatted}\nThis may be fixable with "--fix".')
-
-
class RetryFsck(FsckError):
    """Raised after something on disk or in SQL was repaired, to request
    that the enclosing check be restarted from the beginning; caught by
    the retry_if_raised decorator."""
    pass
-
-
def log(format, *args):
    """Print a printf-style formatted progress message to stdout."""
    printf(format, *args)
-
-
def err(format, *args):
    """Print a printf-style formatted error message to stderr."""
    fprintf(sys.stderr, format, *args)
-
-
# Decorator that retries a function if it raises a specific exception
def retry_if_raised(exc, message=None, max_retries=100):
    """Return a decorator that re-invokes the wrapped function whenever
    it raises `exc`, up to `max_retries` attempts.

    If `message` is given, it is logged before each retry.  If the
    function still raises `exc` after `max_retries` attempts, a plain
    Exception is raised (fsck is probably stuck in a fix/retry loop).
    """
    def f1(func):
        @functools.wraps(func)
        def f2(*args, **kwargs):
            for _ in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except exc:
                    if message:
                        log("%s\n\n", message)
            # Bug fix: the original message had an unfilled "%d"
            # placeholder; format in the actual retry count.
            raise Exception("Max number of retries (%d) exceeded; giving up"
                            % max_retries)
        return f2
    return f1
-
-
class Progress(object):
    """Context manager around a progressbar.ProgressBar.

    On clean exit the bar is finished normally; on an exception we just
    emit a newline so subsequent output starts on a fresh line.
    """

    def __init__(self, maxval):
        # progressbar can't handle a zero-length bar; use 1 instead.
        self.bar = progressbar.ProgressBar(
            maxval=(maxval if maxval != 0 else 1),
            widgets=[progressbar.Percentage(), ' ',
                     progressbar.Bar(), ' ',
                     progressbar.ETA()])
        # Fall back to a fixed width if the terminal width is unknown.
        if self.bar.term_width == 0:
            self.bar.term_width = 75

    def __enter__(self):
        self.last_update = 0
        self.bar.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is not None:
            printf("\n")
        else:
            self.bar.finish()

    def update(self, val):
        self.bar.update(val)
-
-
class Fsck(object):
    """Check a NilmDB database directory for consistency, optionally
    attempting to repair problems that are found."""

    def __init__(self, path, fix=False):
        """path: base directory of the NilmDB database.
        fix: if True, attempt automatic repairs instead of only reporting."""
        self.basepath = path
        self.fix = fix
        self.sqlpath = os.path.join(path, "data.sql")
        self.bulkpath = os.path.join(path, "data")
        self.bulklock = os.path.join(path, "data.lock")
-
- ### Main checks
-
    @retry_if_raised(RetryFsck, "Something was fixed: restarting fsck")
    def check(self, skip_data=False):
        """Run all consistency checks, in dependency order.

        skip_data: if True, skip the slow per-row data check.
        The whole pass restarts from scratch (via RetryFsck) whenever a
        repair is applied.
        """
        self.bulk = None
        self.sql = None
        try:
            self.check_paths()
            self.check_sql()
            self.check_streams()
            self.check_intervals()
            if skip_data:
                log("skipped data check\n")
            else:
                self.check_data()
        finally:
            # Always release resources, even on error or RetryFsck;
            # commit any SQL fixes applied before the failure.
            if self.bulk:
                self.bulk.close()
            if self.sql:
                self.sql.commit()
                self.sql.close()
        log("ok\n")
-
- ### Check basic path structure
-
- def check_paths(self):
- log("checking paths\n")
- if self.bulk:
- self.bulk.close()
- if not os.path.isfile(self.sqlpath):
- raise FsckError("SQL database missing (%s)", self.sqlpath)
- if not os.path.isdir(self.bulkpath):
- raise FsckError("Bulk data directory missing (%s)", self.bulkpath)
- with open(self.bulklock, "w") as lockfile:
- if not nilmdb.utils.lock.exclusive_lock(lockfile):
- raise FsckError('Database already locked by another process\n'
- 'Make sure all other processes that might be '
- 'using the database are stopped.\n'
- 'Restarting apache will cause it to unlock '
- 'the db until a request is received.')
- # unlocked immediately
- self.bulk = nilmdb.server.bulkdata.BulkData(self.basepath)
-
- ### Check SQL database health
-
- def check_sql(self):
- log("checking sqlite database\n")
-
- self.sql = sqlite3.connect(self.sqlpath)
- with self.sql:
- cur = self.sql.cursor()
- ver = cur.execute("PRAGMA user_version").fetchone()[0]
- good = max(nilmdb.server.nilmdb._sql_schema_updates.keys())
- if ver != good:
- raise FsckError("database version %d too old, should be %d",
- ver, good)
- self.stream_path = {}
- self.stream_layout = {}
- log(" loading paths\n")
- result = cur.execute("SELECT id, path, layout FROM streams")
- for r in result:
- if r[0] in self.stream_path:
- raise FsckError("duplicated ID %d in stream IDs", r[0])
- self.stream_path[r[0]] = r[1]
- self.stream_layout[r[0]] = r[2]
-
- log(" loading intervals\n")
- self.stream_interval = defaultdict(list)
- result = cur.execute("SELECT stream_id, start_time, end_time, "
- "start_pos, end_pos FROM ranges "
- "ORDER BY start_time")
- for r in result:
- if r[0] not in self.stream_path:
- raise FsckError("interval ID %d not in streams", r[0])
- self.stream_interval[r[0]].append((r[1], r[2], r[3], r[4]))
-
- log(" loading metadata\n")
- self.stream_meta = defaultdict(dict)
- result = cur.execute("SELECT stream_id, key, value FROM metadata")
- for r in result:
- if r[0] not in self.stream_path:
- raise FsckError("metadata ID %d not in streams", r[0])
- if r[1] in self.stream_meta[r[0]]:
- raise FsckError(
- "duplicate metadata key '%s' for stream %d",
- r[1], r[0])
- self.stream_meta[r[0]][r[1]] = r[2]
-
- ### Check streams and basic interval overlap
-
- def check_streams(self):
- ids = list(self.stream_path.keys())
- log("checking %s streams\n", "{:,d}".format(len(ids)))
- with Progress(len(ids)) as pbar:
- for i, sid in enumerate(ids):
- pbar.update(i)
- path = self.stream_path[sid]
-
- # unique path, valid layout
- if list(self.stream_path.values()).count(path) != 1:
- raise FsckError("duplicated path %s", path)
- layout = self.stream_layout[sid].split('_')[0]
- if layout not in ('int8', 'int16', 'int32', 'int64',
- 'uint8', 'uint16', 'uint32', 'uint64',
- 'float32', 'float64'):
- raise FsckError("bad layout %s for %s", layout, path)
- count = int(self.stream_layout[sid].split('_')[1])
- if count < 1 or count > 1024:
- raise FsckError("bad count %d for %s", count, path)
-
- # must exist in bulkdata
- bulk = self.bulkpath + path
- if not os.path.isdir(bulk):
- raise FsckError("%s: missing bulkdata dir", path)
- if not nilmdb.server.bulkdata.Table.exists(bulk):
- raise FsckError("%s: bad bulkdata table", path)
-
- # intervals don't overlap. Abuse IntervalSet to check
- # for intervals in file positions, too.
- timeiset = IntervalSet()
- posiset = IntervalSet()
- for (stime, etime, spos, epos) in self.stream_interval[sid]:
- new = Interval(stime, etime)
- try:
- timeiset += new
- except IntervalError:
- raise FsckError("%s: overlap in intervals:\n"
- "set: %s\nnew: %s",
- path, str(timeiset), str(new))
- if spos != epos:
- new = Interval(spos, epos)
- try:
- posiset += new
- except IntervalError:
- raise FsckError("%s: overlap in file offsets:\n"
- "set: %s\nnew: %s",
- path, str(posiset), str(new))
-
- # check bulkdata
- self.check_bulkdata(sid, path, bulk)
-
- # Check that we can open bulkdata
- try:
- tab = None
- try:
- tab = nilmdb.server.bulkdata.Table(bulk)
- except Exception as e:
- raise FsckError("%s: can't open bulkdata: %s",
- path, str(e))
- finally:
- if tab:
- tab.close()
-
- ### Check that bulkdata is good enough to be opened
-
- @retry_if_raised(RetryFsck)
- def check_bulkdata(self, sid, path, bulk):
- with open(os.path.join(bulk, "_format"), "rb") as f:
- fmt = pickle.load(f)
- if fmt["version"] != 3:
- raise FsckError("%s: bad or unsupported bulkdata version %d",
- path, fmt["version"])
- row_per_file = int(fmt["rows_per_file"])
- if row_per_file < 1:
- raise FsckError(f"{path}: bad row_per_file {row_per_file}")
- files_per_dir = int(fmt["files_per_dir"])
- if files_per_dir < 1:
- raise FsckError(f"{path}: bad files_per_dir {files_per_dir}")
- layout = fmt["layout"]
- if layout != self.stream_layout[sid]:
- raise FsckError("%s: layout mismatch %s != %s", path,
- layout, self.stream_layout[sid])
-
- # Every file should have a size that's the multiple of the row size
- rkt = nilmdb.server.rocket.Rocket(layout, None)
- row_size = rkt.binary_size
- rkt.close()
-
- # Find all directories
- regex = re.compile("^[0-9a-f]{4,}$")
- subdirs = sorted(filter(regex.search, os.listdir(bulk)),
- key=lambda x: int(x, 16), reverse=True)
- for subdir in subdirs:
- # Find all files in that dir
- subpath = os.path.join(bulk, subdir)
- files = list(filter(regex.search, os.listdir(subpath)))
- if not files:
- self.fix_empty_subdir(subpath)
- raise RetryFsck
- # Verify that their size is a multiple of the row size
- for filename in files:
- filepath = os.path.join(subpath, filename)
- offset = os.path.getsize(filepath)
- if offset % row_size:
- self.fix_bad_filesize(path, filepath, offset, row_size)
-
- def fix_empty_subdir(self, subpath):
- msg = sprintf("bulkdata path %s is missing data files", subpath)
- if not self.fix:
- raise FixableFsckError(msg)
- # Try to fix it by just deleting whatever is present,
- # as long as it's only ".removed" files.
- err("\n%s\n", msg)
- for fn in os.listdir(subpath):
- if not fn.endswith(".removed"):
- raise FsckError("can't fix automatically: please manually "
- "remove the file %s and try again",
- os.path.join(subpath, fn))
- # Remove the whole thing
- err("Removing empty subpath\n")
- shutil.rmtree(subpath)
- raise RetryFsck
-
- def fix_bad_filesize(self, path, filepath, offset, row_size):
- extra = offset % row_size
- msg = sprintf("%s: size of file %s (%d) is not a multiple" +
- " of row size (%d): %d extra bytes present",
- path, filepath, offset, row_size, extra)
- if not self.fix:
- raise FixableFsckError(msg)
- # Try to fix it by just truncating the file
- err("\n%s\n", msg)
- newsize = offset - extra
- err("Truncating file to %d bytes and retrying\n", newsize)
- with open(filepath, "r+b") as f:
- f.truncate(newsize)
- raise RetryFsck
-
- ### Check interval endpoints
-
- def check_intervals(self):
- total_ints = sum(len(x) for x in list(self.stream_interval.values()))
- log("checking %s intervals\n", "{:,d}".format(total_ints))
- done = 0
- with Progress(total_ints) as pbar:
- for sid in self.stream_interval:
- try:
- bulk = self.bulkpath + self.stream_path[sid]
- tab = nilmdb.server.bulkdata.Table(bulk)
-
- def update(x):
- pbar.update(done + x)
-
- ints = self.stream_interval[sid]
- done += self.check_table_intervals(sid, ints, tab, update)
- finally:
- tab.close()
-
- def check_table_intervals(self, sid, ints, tab, update):
- # look in the table to make sure we can pick out the interval's
- # endpoints
- path = self.stream_path[sid] # noqa: F841 unused
- tab.file_open.cache_remove_all()
- for (i, intv) in enumerate(ints):
- update(i)
- (stime, etime, spos, epos) = intv
- if spos == epos and spos >= 0 and spos <= tab.nrows:
- continue
- try:
- srow = tab[spos] # noqa: F841 unused
- erow = tab[epos-1] # noqa: F841 unused
- except Exception as e:
- self.fix_bad_interval(sid, intv, tab, str(e))
- raise RetryFsck
- return len(ints)
-
    def fix_bad_interval(self, sid, intv, tab, msg):
        """Attempt to repair an interval whose endpoint rows could not
        be read.  First try truncating the interval's end to the actual
        table size; failing that, delete the interval from SQL
        entirely.  Both repairs raise RetryFsck to restart the check;
        without --fix, a FixableFsckError is raised instead."""
        path = self.stream_path[sid]
        msg = sprintf("%s: interval %s error accessing rows: %s",
                      path, str(intv), str(msg))
        if not self.fix:
            raise FixableFsckError(msg)
        err("\n%s\n", msg)

        (stime, etime, spos, epos) = intv
        # If it's just that the end pos is more than the number of rows
        # in the table, lower end pos and truncate interval time too.
        if spos < tab.nrows and epos >= tab.nrows:
            err("end position is past endrows, but it can be truncated\n")
            err("old end: time %d, pos %d\n", etime, epos)
            new_epos = tab.nrows
            # NOTE(review): tab[row] appears to yield that row's
            # timestamp (as in check_table_intervals), so the new end
            # time is one past the last row's timestamp — confirm.
            new_etime = tab[new_epos-1] + 1
            err("new end: time %d, pos %d\n", new_etime, new_epos)
            if stime < new_etime:
                # Change it in SQL
                with self.sql:
                    cur = self.sql.cursor()
                    cur.execute("UPDATE ranges SET end_time=?, end_pos=? "
                                "WHERE stream_id=? AND start_time=? AND "
                                "end_time=? AND start_pos=? AND end_pos=?",
                                (new_etime, new_epos, sid, stime, etime,
                                 spos, epos))
                    # Exactly one row must match the old interval tuple.
                    if cur.rowcount != 1:
                        raise FsckError("failed to fix SQL database")
                raise RetryFsck
            err("actually it can't be truncated; times are bad too")

        # Otherwise, the only hope is to delete the interval entirely.
        err("*** Deleting the entire interval from SQL.\n")
        err("This may leave stale data on disk. To fix that, copy all\n")
        err("data from this stream to a new stream, then remove all data\n")
        err("from and destroy %s.\n", path)
        with self.sql:
            cur = self.sql.cursor()
            cur.execute("DELETE FROM ranges WHERE "
                        "stream_id=? AND start_time=? AND "
                        "end_time=? AND start_pos=? AND end_pos=?",
                        (sid, stime, etime, spos, epos))
            if cur.rowcount != 1:
                raise FsckError("failed to remove interval")
        raise RetryFsck
-
- ### Check data in each interval
-
- def check_data(self):
- total_rows = sum(sum((y[3] - y[2]) for y in x)
- for x in list(self.stream_interval.values()))
- log("checking %s rows of data\n", "{:,d}".format(total_rows))
- done = 0
- with Progress(total_rows) as pbar:
- for sid in self.stream_interval:
- try:
- bulk = self.bulkpath + self.stream_path[sid]
- tab = nilmdb.server.bulkdata.Table(bulk)
-
- def update(x):
- pbar.update(done + x)
-
- ints = self.stream_interval[sid]
- done += self.check_table_data(sid, ints, tab, update)
- finally:
- tab.close()
-
- def check_table_data(self, sid, ints, tab, update):
- # Pull out all of the interval's data and verify that it's
- # monotonic.
- maxrows = 100000
- path = self.stream_path[sid]
- layout = self.stream_layout[sid]
- dtype = nilmdb.client.numpyclient.layout_to_dtype(layout)
- tab.file_open.cache_remove_all()
- done = 0
- for intv in ints:
- last_ts = None
- (stime, etime, spos, epos) = intv
-
- # Break interval into maxrows-sized chunks
- next_start = spos
- while next_start < epos:
- start = next_start
- stop = min(start + maxrows, epos)
- count = stop - start
- next_start = stop
-
- # Get raw data, convert to NumPy arary
- try:
- raw = tab.get_data(start, stop, binary=True)
- data = numpy.fromstring(raw, dtype)
- except Exception as e:
- raise FsckError(
- "%s: failed to grab rows %d through %d: %s",
- path, start, stop, repr(e))
-
- # Verify that timestamps are monotonic
- if (numpy.diff(data['timestamp']) <= 0).any():
- raise FsckError("%s: non-monotonic timestamp(s) in rows "
- "%d through %d", path, start, stop)
- first_ts = data['timestamp'][0]
- if last_ts is not None and first_ts <= last_ts:
- raise FsckError("%s: first interval timestamp %d is not "
- "greater than the previous last interval "
- "timestamp %d, at row %d",
- path, first_ts, last_ts, start)
- last_ts = data['timestamp'][-1]
-
- # These are probably fixable, by removing the offending
- # intervals. But I'm not going to bother implementing
- # that yet.
-
- # Done
- done += count
- update(done)
- return done
|