Browse Source

More fsck improvements. Fixed two problems on sharon so far.

tags/nilmdb-1.9.1
Jim Paris 10 years ago
parent
commit
422b1e2df2
1 changed file with 111 additions and 22 deletions
  1. +111
    -22
      nilmdb/fsck/fsck.py

+ 111
- 22
nilmdb/fsck/fsck.py View File

@@ -53,7 +53,11 @@ def retry_if_raised(exc, message = None):

class Progress(object):
def __init__(self, maxval):
    """Wrap a progressbar.ProgressBar for use as a context manager.

    maxval: total number of work units; pbar.update() is called with
    values in [0, maxval].
    """
    # Removed a stale duplicate construction of self.bar that was
    # immediately overwritten (dead assignment left over from a diff).
    self.bar = progressbar.ProgressBar(
        maxval = maxval,
        widgets = [ progressbar.Percentage(), ' ',
                    progressbar.Bar(), ' ',
                    progressbar.ETA() ])
    # Some environments (pipes, dumb terminals) report width 0;
    # force a usable default so the bar still renders.
    if self.bar.term_width == 0:
        self.bar.term_width = 75
def __enter__(self):
@@ -79,14 +83,26 @@ class Fsck(object):

@retry_if_raised(RetryFsck, "Something was fixed: restarting fsck")
def check(self):
    """Run all fsck passes in order, restarting from scratch whenever
    a pass repairs something (RetryFsck via the decorator).

    Resources (bulkdata handle, sqlite connection) are opened by the
    individual passes and always released in the finally block, even
    if a pass raises.
    """
    # Removed stale duplicate calls to check_paths/check_sql/
    # check_streams/check_intervals that ran before the try block
    # (leftover pre-diff lines): they executed every pass twice and
    # without cleanup on failure.
    self.bulk = None
    self.sql = None
    try:
        self.check_paths()
        self.check_sql()
        self.check_streams()
        self.check_intervals()
        self.check_data()
    finally:
        # Close/commit whatever the passes managed to open.
        if self.bulk:
            self.bulk.close()
        if self.sql:
            self.sql.commit()
            self.sql.close()
    log("ok\n")

def check_paths(self):
log("checking paths\n")
if self.bulk:
self.bulk.close()
if not os.path.isfile(self.sqlpath):
raise FsckError("SQL database missing (%s)", self.sqlpath)
if not os.path.isdir(self.bulkpath):
@@ -94,17 +110,16 @@ class Fsck(object):
with open(self.bulklock, "w") as lockfile:
if not nilmdb.utils.lock.exclusive_lock(lockfile):
raise FsckError('database already locked by another process')
# unlocked immediately
self.bulk = nilmdb.server.bulkdata.BulkData(self.basepath)
# override must_close warning
if "_must_close" in dir(self.bulk):
del self.bulk._must_close

def check_sql(self):
log("checking sqlite database\n")

self.sql = sqlite3.connect(self.sqlpath)
with self.sql as con:
ver = con.execute("PRAGMA user_version").fetchone()[0]
with self.sql:
cur = self.sql.cursor()
ver = cur.execute("PRAGMA user_version").fetchone()[0]
good = max(nilmdb.server.nilmdb._sql_schema_updates.keys())
if ver != good:
raise FsckError("database version %d too old, should be %d",
@@ -112,7 +127,7 @@ class Fsck(object):
self.stream_path = {}
self.stream_layout = {}
log(" loading paths\n")
result = con.execute("SELECT id, path, layout FROM streams")
result = cur.execute("SELECT id, path, layout FROM streams")
for r in result:
if r[0] in self.stream_path:
raise FsckError("duplicated ID %d in stream IDs", r[0])
@@ -121,7 +136,7 @@ class Fsck(object):

log(" loading intervals\n")
self.stream_interval = defaultdict(list)
result = con.execute("SELECT stream_id, start_time, end_time, "
result = cur.execute("SELECT stream_id, start_time, end_time, "
"start_pos, end_pos FROM ranges")
for r in result:
if r[0] not in self.stream_path:
@@ -130,7 +145,7 @@ class Fsck(object):

log(" loading metadata\n")
self.stream_meta = defaultdict(dict)
result = con.execute("SELECT stream_id, key, value FROM metadata")
result = cur.execute("SELECT stream_id, key, value FROM metadata")
for r in result:
if r[0] not in self.stream_path:
raise FsckError("metadata ID %d not in streams", k)
@@ -271,9 +286,8 @@ class Fsck(object):
if offset % row_size:
self.fix_bad_filesize(path, filepath, offset, row_size)

def check_intervals(self):
def _check_for_each_interval(self, checkfunc):
total_ints = sum(len(x) for x in self.stream_interval.values())
log("checking %d intervals\n", total_ints)
checked = 0
with Progress(total_ints) as pbar:
for sid in self.stream_interval:
@@ -283,25 +297,100 @@ class Fsck(object):
def update(x):
pbar.update(checked + x)
ints = self.stream_interval[sid]
path = self.stream_path[sid]
self.check_table_intervals(path, ints, tab, update)
checkfunc(sid, ints, tab, update)
checked += len(ints)
finally:
tab.close()

def check_table_intervals(self, path, ints, tab, update):
def check_intervals(self):
    """Verify that every recorded interval's endpoints can be read
    back from its stream's table."""
    interval_count = 0
    for interval_list in self.stream_interval.values():
        interval_count += len(interval_list)
    log("checking %d intervals\n", interval_count)
    self._check_for_each_interval(self.check_table_intervals)

def fix_bad_interval(self, sid, intv, tab, msg):
    """Attempt to repair an interval whose rows could not be read.

    sid:  stream ID
    intv: (start_time, end_time, start_pos, end_pos) tuple
    tab:  open bulkdata table for the stream
    msg:  description of the access error that triggered the fix

    Raises FixableFsckError when not running with --fix; otherwise
    either truncates the interval in SQL or deletes it entirely,
    then raises RetryFsck to restart the whole check.
    """
    path = self.stream_path[sid]
    msg = sprintf("%s: interval %s error accessing rows: %s",
                  path, str(intv), str(msg))
    if not self.fix:
        raise FixableFsckError(msg)
    err("\n%s\n", msg)

    (stime, etime, spos, epos) = intv
    # If it's just that the end pos is more than the number of rows
    # in the table, lower end pos and truncate interval time too.
    if spos < tab.nrows and epos >= tab.nrows:
        err("end position is past endrows, but it can be truncated\n")
        err("old end: time %d, pos %d\n", etime, epos)
        new_epos = tab.nrows
        # New end time is one past the last surviving row's timestamp
        # (intervals are half-open on the right).
        new_etime = tab[new_epos-1] + 1
        err("new end: time %d, pos %d\n", new_etime, new_epos)
        if stime < new_etime:
            # Change it in SQL
            with self.sql:
                cur = self.sql.cursor()
                cur.execute("UPDATE ranges SET end_time=?, end_pos=? "
                            "WHERE stream_id=? AND start_time=? AND "
                            "end_time=? AND start_pos=? AND end_pos=?",
                            (new_etime, new_epos, sid, stime, etime,
                             spos, epos))
                if cur.rowcount != 1:
                    raise FsckError("failed to fix SQL database")
            raise RetryFsck
        err("actually it can't be truncated; times are bad too")

    # Otherwise, the only hope is to delete the interval entirely.
    err("*** Deleting the entire interval from SQL.\n")
    err("This may leave stale data on disk. To fix that, copy all\n")
    err("data from this stream to a new stream, then remove all data\n")
    # Bug fix: the "%s" placeholder had no argument, so the stream
    # path was never substituted into the message.
    err("and destroy %s.\n", path)
    with self.sql:
        cur = self.sql.cursor()
        cur.execute("DELETE FROM ranges WHERE "
                    "stream_id=? AND start_time=? AND "
                    "end_time=? AND start_pos=? AND end_pos=?",
                    (sid, stime, etime, spos, epos))
        if cur.rowcount != 1:
            raise FsckError("failed to remove interval")
    raise RetryFsck

def check_table_intervals(self, sid, ints, tab, update):
    """Check that each interval's first and last rows are readable.

    sid:    stream ID
    ints:   list of (start_time, end_time, start_pos, end_pos)
    tab:    open bulkdata table
    update: progress callback, called with the interval index

    On a row-access failure, delegates to fix_bad_interval and then
    raises RetryFsck to restart the check from scratch.
    """
    # look in the table to make sure we can pick out the interval's
    # endpoints
    # (Removed stale pre-diff lines: a duplicate bare
    # `if spos == epos:` condition that broke the syntax, and a
    # `raise FsckError(msg)` that made the fix path unreachable.)
    path = self.stream_path[sid]
    tab.file_open.cache_remove_all()
    for (i, intv) in enumerate(ints):
        (stime, etime, spos, epos) = intv
        update(i)
        # Empty intervals are fine as long as the positions are sane.
        if spos == epos and spos >= 0 and spos <= tab.nrows:
            continue
        try:
            srow = tab[spos]
            erow = tab[epos-1]
        except Exception as e:
            self.fix_bad_interval(sid, intv, tab, str(e))
            raise RetryFsck

def check_data(self):
    """Walk every row of every interval and validate the timestamps."""
    total_rows = 0
    for interval_list in self.stream_interval.values():
        for (_stime, _etime, spos, epos) in interval_list:
            total_rows += epos - spos
    log("checking %d rows of data\n", total_rows)
    self._check_for_each_interval(self.check_table_data)

def check_table_data(self, sid, ints, tab, update):
    """Read every row in each interval and verify that timestamps are
    strictly increasing and fall within the interval's bounds.

    sid:    stream ID
    ints:   list of (start_time, end_time, start_pos, end_pos)
    tab:    open bulkdata table (tab[row] yields the row's timestamp)
    update: progress callback, called with the interval index

    Raises FsckError on non-monotonic or out-of-bound timestamps.
    """
    # look in the table to make sure we can pick out all of
    # the interval's data, and that the data is monotonic
    path = self.stream_path[sid]
    tab.file_open.cache_remove_all()
    for (i, intv) in enumerate(ints):
        (stime, etime, spos, epos) = intv
        update(i)
        last_ts = None
        for row in xrange(spos, epos):
            ts = tab[row]
            # Bug fix: guard against last_ts = None explicitly instead
            # of relying on Python 2's None ordering.
            if last_ts is not None and ts <= last_ts:
                raise FsckError("%s: interval %s has non-monotonic "
                                "timestamps: %d and then %d\n",
                                path, intv, last_ts, ts)
            if ts < stime or ts >= etime:
                # Bug fix: `ps` was an undefined name (NameError);
                # the stream path was intended.
                raise FsckError("%s: interval %s has out-of-bound "
                                "timestamp %d\n", path, intv, ts)
            # Bug fix: last_ts was never updated, so the monotonicity
            # check above could never trigger.
            last_ts = ts

Loading…
Cancel
Save