5 Commits

Author SHA1 Message Date
  Jim Paris 7538c6201b tests: new fsck tests for interval overlap 5 months ago
  Jim Paris 4d9a106ca1 fsck: add fix for file position (row) overlap in database 5 months ago
  Jim Paris e90a79ddad fsck: increase max restarts from 100 to 1000 5 months ago
  Jim Paris 7056c5b4ec tests: new fsck tests 5 months ago
  Jim Paris df4e7f0967 fsck: If data timestamps are unexpectedly zero, truncate data 5 months ago
23 changed files with 95 additions and 15 deletions
Split View
  1. +80
    -14
      nilmdb/fsck/fsck.py
  2. BIN
      tests/fsck-data/test2w1/data.sql
  3. BIN
      tests/fsck-data/test2w1/data/a/b/0000/0000
  4. BIN
      tests/fsck-data/test2w1/data/a/b/_format
  5. BIN
      tests/fsck-data/test2w2/data.sql
  6. BIN
      tests/fsck-data/test2w2/data/a/b/0000/0000
  7. BIN
      tests/fsck-data/test2w2/data/a/b/_format
  8. BIN
      tests/fsck-data/test2x1/data.sql
  9. BIN
      tests/fsck-data/test2x1/data.sql-shm
  10. BIN
      tests/fsck-data/test2x1/data.sql-wal
  11. BIN
      tests/fsck-data/test2x1/data/a/b/0000/0000
  12. BIN
      tests/fsck-data/test2x1/data/a/b/_format
  13. BIN
      tests/fsck-data/test2x2/data.sql
  14. BIN
      tests/fsck-data/test2x2/data.sql-shm
  15. BIN
      tests/fsck-data/test2x2/data.sql-wal
  16. BIN
      tests/fsck-data/test2x2/data/a/b/0000/0000
  17. BIN
      tests/fsck-data/test2x2/data/a/b/_format
  18. BIN
      tests/fsck-data/test2x3/data.sql
  19. BIN
      tests/fsck-data/test2x3/data.sql-shm
  20. BIN
      tests/fsck-data/test2x3/data.sql-wal
  21. BIN
      tests/fsck-data/test2x3/data/a/b/0000/0000
  22. BIN
      tests/fsck-data/test2x3/data/a/b/_format
  23. +15
    -1
      tests/test_fsck.py

+ 80
- 14
nilmdb/fsck/fsck.py View File

@@ -52,7 +52,7 @@ def err(format, *args):


# Decorator that retries a function if it raises a specific exception
def retry_if_raised(exc, message=None, max_retries=100):
def retry_if_raised(exc, message=None, max_retries=1000):
def f1(func):
def f2(*args, **kwargs):
for n in range(max_retries):
@@ -60,7 +60,7 @@ def retry_if_raised(exc, message=None, max_retries=100):
return func(*args, **kwargs)
except exc:
if message:
log("%s\n\n", message)
log(f"{message} ({n+1})\n\n")
raise Exception("Max number of retries (%d) exceeded; giving up" %
max_retries)
return f2
@@ -238,9 +238,7 @@ class Fsck(object):
try:
posiset += new
except IntervalError:
raise FsckError("%s: overlap in file offsets:\n"
"set: %s\nnew: %s",
path, str(posiset), str(new))
self.fix_row_overlap(sid, path, posiset, new)

try:
# Check bulkdata
@@ -268,6 +266,48 @@ class Fsck(object):
path, str(e))
tab.close()


def fix_row_overlap(self, sid, path, existing, new):
    """Try to repair overlapping file positions (rows) for one stream.

    If the file rows (spos, epos) overlap in the interval table, and
    the overlapping ranges look like this:
        A --------- C
             B -------- D
    then the first interval can be changed to go from A to B instead.

    sid: stream id; indexes self.stream_interval and selects the row
        in the SQL "ranges" table
    path: stream path, used only in messages
    existing: the ranges seen so far (only formatted into the message)
    new: the conflicting interval; new.start becomes the new end_pos

    Raises FixableFsckError if self.fix is false, FsckError if there
    is not exactly one interval that can be truncated (or the SQL
    update does not affect exactly one row), and RetryFsck once the
    database row has been updated.
    """
    problem = (f"{path}: overlap in file offsets:\n"
               f"existing ranges: {existing}\n"
               f"overlapping interval: {new}")
    if not self.fix:
        raise FixableFsckError(problem)
    err(f"\n{problem}\nSeeing if we can truncate one of them...\n")

    # See if there's exactly one interval that overlaps the
    # conflicting one in the right way, i.e. whose row range
    # strictly contains the start of the new interval.
    candidate = None
    for row in self.stream_interval[sid]:
        (_stime, _etime, spos, epos) = row
        if not (spos < new.start and epos > new.start):
            continue
        if candidate:
            err(f"no, more than one interval matched:\n"
                f"{row}\n{candidate}\n")
            raise FsckError(f"{path}: unfixable overlap")
        candidate = row
    if candidate is None:
        err("no intervals overlapped in the right way\n")
        raise FsckError(f"{path}: unfixable overlap")

    # Truncate the file position: the matched row is identified by
    # all of its current column values, and only end_pos is changed.
    err(f"truncating {candidate}\n")
    query = ("UPDATE ranges SET end_pos=? "
             "WHERE stream_id=? AND start_time=? AND "
             "end_time=? AND start_pos=? AND end_pos=?")
    with self.sql:
        cur = self.sql.cursor()
        cur.execute(query, (new.start, sid, *candidate))
        if cur.rowcount != 1:  # pragma: no cover (shouldn't fail)
            raise FsckError("failed to fix SQL database")
    # NOTE(review): the original indentation is not visible in this
    # view; RetryFsck is raised after the `with self.sql` block has
    # exited (presumably so the update is committed first) -- confirm.
    raise RetryFsck

### Check that bulkdata is good enough to be opened

@retry_if_raised(RetryFsck)
@@ -438,8 +478,8 @@ class Fsck(object):

# Otherwise, the only hope is to delete the interval entirely.
err("*** Deleting the entire interval from SQL.\n")
err("This may leave stale data on disk. To fix that, copy all\n")
err("data from this stream to a new stream using nilm-copy, then\n")
err("This may leave stale data on disk. To fix that, copy all "
"data from this stream to a new stream using nilm-copy, then\n")
err("remove all data from and destroy %s.\n", path)
with self.sql:
cur = self.sql.cursor()
@@ -512,18 +552,32 @@ class Fsck(object):
match = (ts < stime) | (ts >= etime)
if match.any():
row = numpy.argmax(match)
raise FsckError("%s: data timestamp %d at row %d "
"outside interval range [%d,%d)",
path, ts[row], row + start,
stime, etime)
if ts[row] != 0:
raise FsckError("%s: data timestamp %d at row %d "
"outside interval range [%d,%d)",
path, ts[row], row + start,
stime, etime)

# Timestamp is zero and out of the expected range;
# assume file ends with zeroed data and just truncate it.
self.fix_table_by_truncating(
path, tab, row + start,
"data timestamp is out of range, and zero")

# Verify that timestamps are monotonic
match = numpy.diff(ts) <= 0
if match.any():
row = numpy.argmax(match)
raise FsckError("%s: non-monotonic timestamp (%d -> %d)"
" at row %d", path, ts[row], ts[row+1],
row + start)
if ts[row+1] != 0:
raise FsckError("%s: non-monotonic timestamp (%d -> %d)"
" at row %d", path, ts[row], ts[row+1],
row + start)

# Timestamp is zero and non-monotonic;
# assume file ends with zeroed data and just truncate it.
self.fix_table_by_truncating(
path, tab, row + start + 1,
"data timestamp is non-monotonic, and zero")

first_ts = ts[0]
if last_ts is not None and first_ts <= last_ts:
@@ -542,3 +596,15 @@ class Fsck(object):
done += count
update(done)
return done

def fix_table_by_truncating(self, path, tab, row, reason):
    """Simple fix for bad data: truncate the table at the given row.

    path: stream path, used only in messages
    tab: table object providing _offset_from_row() and
        _remove_or_truncate_file()
    row: first bad row; the table is cut off here
    reason: short description of the problem, appended to the message

    Raises FixableFsckError if self.fix is false; otherwise always
    raises RetryFsck after truncating.  On retry, fix_bad_interval
    will correct the database and timestamps to account for this
    truncation.
    """
    problem = f"{path}: bad data in table, starting at row {row}: {reason}"
    if not self.fix:
        raise FixableFsckError(problem)
    err(f"\n{problem}\nWill try truncating table\n")

    # Translate the row number into a file location; the fourth
    # element of the result is not needed here.
    location = tab._offset_from_row(row)
    (subdir, fname, offs, _count) = location
    tab._remove_or_truncate_file(subdir, fname, offs)
    raise RetryFsck

BIN
tests/fsck-data/test2w1/data.sql View File


BIN
tests/fsck-data/test2w1/data/a/b/0000/0000 View File


BIN
tests/fsck-data/test2w1/data/a/b/_format View File


BIN
tests/fsck-data/test2w2/data.sql View File


BIN
tests/fsck-data/test2w2/data/a/b/0000/0000 View File


BIN
tests/fsck-data/test2w2/data/a/b/_format View File


BIN
tests/fsck-data/test2x1/data.sql View File


BIN
tests/fsck-data/test2x1/data.sql-shm View File


BIN
tests/fsck-data/test2x1/data.sql-wal View File


BIN
tests/fsck-data/test2x1/data/a/b/0000/0000 View File


BIN
tests/fsck-data/test2x1/data/a/b/_format View File


BIN
tests/fsck-data/test2x2/data.sql View File


BIN
tests/fsck-data/test2x2/data.sql-shm View File


BIN
tests/fsck-data/test2x2/data.sql-wal View File


BIN
tests/fsck-data/test2x2/data/a/b/0000/0000 View File


BIN
tests/fsck-data/test2x2/data/a/b/_format View File


BIN
tests/fsck-data/test2x3/data.sql View File


BIN
tests/fsck-data/test2x3/data.sql-shm View File


BIN
tests/fsck-data/test2x3/data.sql-wal View File


BIN
tests/fsck-data/test2x3/data/a/b/0000/0000 View File


BIN
tests/fsck-data/test2x3/data/a/b/_format View File


+ 15
- 1
tests/test_fsck.py View File

@@ -123,7 +123,7 @@ class TestFsck(object):
self.failmsg("test2h", "missing bulkdata dir")
self.failmsg("test2i", "bad bulkdata table")
self.failmsg("test2j", "overlap in intervals")
self.failmsg("test2k", "overlap in file offsets")
self.failmsg("test2k", "overlap in file offsets", fix=False)
self.ok("test2k1")
self.failmsg("test2l", "unsupported bulkdata version")
self.failmsg("test2m", "bad rows_per_file")
@@ -168,3 +168,17 @@ class TestFsck(object):
self.failmsg("test2v1", "bad bulkdata table")
self.failmsg("test2v2", "empty, with corrupted format file", fix=False)
self.okmsg("test2v2", "empty, with corrupted format file")

self.failmsg("test2w1", "out of range, and zero", fix=False)
self.okmsg("test2w1", "Will try truncating table")
self.contain("Deleting the entire interval")

self.failmsg("test2w2", "non-monotonic, and zero", fix=False)
self.okmsg("test2w2", "Will try truncating table")
self.contain("new end: time 237000001, pos 238")

self.failmsg("test2x1", "overlap in file offsets", fix=False)
self.okmsg("test2x1", "truncating")

self.failmsg("test2x2", "unfixable overlap")
self.failmsg("test2x3", "unfixable overlap")

Loading…
Cancel
Save