From 4d9a106ca176748c209d2f226f2c6393a48b7703 Mon Sep 17 00:00:00 2001
From: Jim Paris
Date: Tue, 18 Aug 2020 10:04:33 -0400
Subject: [PATCH] fsck: add fix for file position (row) overlap in database

The following sequence could lead to this corruption:
(1) Append new data to bulkdata
(2) Update interval file positions in SQL
(3) Flush (2)
(4) Crash before flushing (1)
(5) Reload database without running fsck
(6) Start writing new data to end of bulkdata and introduce new interval
---
 nilmdb/fsck/fsck.py | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/nilmdb/fsck/fsck.py b/nilmdb/fsck/fsck.py
index 687c317..ac96db8 100644
--- a/nilmdb/fsck/fsck.py
+++ b/nilmdb/fsck/fsck.py
@@ -238,9 +238,7 @@ class Fsck(object):
             try:
                 posiset += new
             except IntervalError:
-                raise FsckError("%s: overlap in file offsets:\n"
-                                "set: %s\nnew: %s",
-                                path, str(posiset), str(new))
+                self.fix_row_overlap(sid, path, posiset, new)
 
         try:
             # Check bulkdata
@@ -268,6 +266,48 @@ class Fsck(object):
                             path, str(e))
         tab.close()
 
+
+    def fix_row_overlap(self, sid, path, existing, new):
+        # If the file rows (spos, epos) overlap in the interval table,
+        # and the overlapping ranges look like this:
+        #    A --------- C
+        #           B -------- D
+        # Then we can try changing the first interval to go from
+        # A to B instead.
+        msg = (f"{path}: overlap in file offsets:\n"
+               f"existing ranges: {existing}\n"
+               f"overlapping interval: {new}")
+        if not self.fix:
+            raise FixableFsckError(msg)
+        err(f"\n{msg}\nSeeing if we can truncate one of them...\n")
+
+        # See if there's exactly one interval that overlaps the
+        # conflicting one in the right way
+        match = None
+        for intv in self.stream_interval[sid]:
+            (stime, etime, spos, epos) = intv
+            if spos < new.start and epos > new.start:
+                if match is not None:
+                    err(f"no, more than one interval matched:\n"
+                        f"{intv}\n{match}\n")
+                    raise FsckError(f"{path}: unfixable overlap")
+                match = intv
+        if match is None:
+            err("no intervals overlapped in the right way\n")
+            raise FsckError(f"{path}: unfixable overlap")
+
+        # Truncate the file position
+        err(f"truncating {match}\n")
+        with self.sql:
+            cur = self.sql.cursor()
+            cur.execute("UPDATE ranges SET end_pos=? "
+                        "WHERE stream_id=? AND start_time=? AND "
+                        "end_time=? AND start_pos=? AND end_pos=?",
+                        (new.start, sid, *match))
+            if cur.rowcount != 1:  # pragma: no cover (shouldn't fail)
+                raise FsckError("failed to fix SQL database")
+        raise RetryFsck
+
     ### Check that bulkdata is good enough to be opened
 
     @retry_if_raised(RetryFsck)
@@ -438,8 +478,8 @@ class Fsck(object):
 
         # Otherwise, the only hope is to delete the interval entirely.
         err("*** Deleting the entire interval from SQL.\n")
-        err("This may leave stale data on disk. To fix that, copy all\n")
-        err("data from this stream to a new stream using nilm-copy, then\n")
+        err("This may leave stale data on disk. To fix that, copy all "
+            "data from this stream to a new stream using nilm-copy, then\n")
         err("remove all data from and destroy %s.\n", path)
         with self.sql:
             cur = self.sql.cursor()
@@ -564,7 +604,7 @@ class Fsck(object):
         msg = f"{path}: bad data in table, starting at row {row}: {reason}"
         if not self.fix:
             raise FixableFsckError(msg)
-        err(f"\n{msg}\nWill try truncating table")
+        err(f"\n{msg}\nWill try truncating table\n")
         (subdir, fname, offs, count) = tab._offset_from_row(row)
         tab._remove_or_truncate_file(subdir, fname, offs)
         raise RetryFsck