Browse Source

More fsck tools, including fixes

tags/nilmdb-1.9.1
Jim Paris 10 years ago
parent
commit
0f745b3047
4 changed files with 134 additions and 21 deletions
  1. +2
    -2
      Makefile
  2. +0
    -1
      nilmdb/fsck/.#fsck.py
  3. +128
    -15
      nilmdb/fsck/fsck.py
  4. +4
    -3
      nilmdb/scripts/nilmdb_fsck.py

+ 2
- 2
Makefile View File

@@ -1,5 +1,5 @@
# By default, run the tests.
all: test
all: fscktest

version:
python setup.py version
@@ -25,7 +25,7 @@ lint:

fscktest:
# python -c "import nilmdb.fsck; nilmdb.fsck.Fsck('/home/jim/wsgi/db').check()"
python -c "import nilmdb.fsck; nilmdb.fsck.Fsck('/home/jim/mnt/bucket/mnt/sharon/data/db').check()"
python -c "import nilmdb.fsck; nilmdb.fsck.Fsck('/home/jim/mnt/bucket/mnt/sharon/data/db', True).check()"

test:
ifeq ($(INSIDE_EMACS), t)


+ 0
- 1
nilmdb/fsck/.#fsck.py View File

@@ -1 +0,0 @@
jim@pilot.lees.18066:1373305995

+ 128
- 15
nilmdb/fsck/fsck.py View File

@@ -12,13 +12,25 @@ from nilmdb.utils.time import timestamp_to_string
from collections import defaultdict
import sqlite3
import os
import sys
import progressbar
import re
import time
import shutil
import cPickle as pickle

class FsckError(Exception):
def __init__(self, format, *args):
Exception.__init__(self, sprintf(format, *args))
def __init__(self, msg = "", *args):
if args:
msg = sprintf(msg, *args)
Exception.__init__(self, msg)
class FixableFsckError(FsckError):
    """An fsck error whose message is suffixed with a hint about "-y".

    msg: message text, or a format string when extra args are given.
    args: optional values substituted into msg with sprintf.
    """
    def __init__(self, msg = "", *args):
        # Only treat msg as a format string when there is something to
        # substitute, so a plain message is passed through untouched.
        text = sprintf(msg, *args) if args else msg
        FsckError.__init__(self, "%s\nThis may be fixable with \"-y\".", text)
class RetryFsck(FsckError):
    """Raised by the fix_* helpers after a repair so the check is re-run."""

def log(format, *args):
    """Emit a printf-style message through printf."""
    printf(format, *args)
@@ -26,6 +38,19 @@ def log(format, *args):
def err(format, *args):
    """Emit a printf-style message to sys.stderr through fprintf."""
    fprintf(sys.stderr, format, *args)

# Decorator that retries a function for as long as it raises a
# specific exception
def retry_if_raised(exc, message = None):
    """Build a decorator that re-invokes a function while it raises exc.

    exc: exception class (or tuple of classes) that triggers a retry.
    message: if set, it is logged before each retry.

    The decorated function's return value is passed through unchanged.
    Exceptions that do not match exc propagate to the caller.  There is
    no retry limit: the loop only ends when the call returns or raises
    something else.
    """
    # Imported locally so this block does not depend on a file-level import
    import functools
    def f1(func):
        # Keep the wrapped function's name and docstring on the wrapper
        @functools.wraps(func)
        def f2(*args, **kwargs):
            while True:
                try:
                    return func(*args, **kwargs)
                except exc:
                    if message:
                        log("%s\n\n", message)
        return f2
    return f1

class Progress(object):
def __init__(self, maxval):
self.bar = progressbar.ProgressBar(maxval = maxval)
@@ -42,31 +67,30 @@ class Progress(object):
printf("\n")
def update(self, val):
self.bar.update(val)
#now = time.time()
#if now - self.last_update < 0.005:
# time.sleep(0.005)
#self.last_update = now

class Fsck(object):

def __init__(self, path):
def __init__(self, path, fix = False):
self.basepath = path
self.sqlpath = os.path.join(path, "data.sql")
self.bulkpath = os.path.join(path, "data")
self.bulklock = os.path.join(path, "data.lock")
self.fix = fix

@retry_if_raised(RetryFsck, "Something was fixed: restarting fsck")
def check(self):
    """Run every check stage in order, then log "ok".

    The decorator restarts the whole sequence whenever a stage raises
    RetryFsck.
    """
    stages = (self.check_paths,
              self.check_sql,
              self.check_streams,
              self.check_intervals)
    for stage in stages:
        stage()
    log("ok\n")

def check_paths(self):
log("checking paths\n")
if not os.path.isfile(self.sqlpath):
raise FsckError("SQL database missing")
raise FsckError("SQL database missing (%s)", self.sqlpath)
if not os.path.isdir(self.bulkpath):
raise FsckError("Bulk data directory missing")
raise FsckError("Bulk data directory missing (%s)", self.bulkpath)
with open(self.bulklock, "w") as lockfile:
if not nilmdb.utils.lock.exclusive_lock(lockfile):
raise FsckError('database already locked by another process')
@@ -116,8 +140,8 @@ class Fsck(object):
self.stream_meta[r[0]][r[1]] = r[2]

def check_streams(self):
log("checking streams\n")
ids = self.stream_path.keys()
log("checking %d streams\n", len(ids))
with Progress(len(ids)) as pbar:
for i, sid in enumerate(ids):
pbar.update(i)
@@ -152,7 +176,7 @@ class Fsck(object):
timeiset += new
except IntervalError:
raise FsckError("%s: overlap in intervals:\n"
"set: %s\nnew: %s\n",
"set: %s\nnew: %s",
path, str(timeiset), str(new))
if spos != epos:
new = Interval(spos, epos)
@@ -160,14 +184,13 @@ class Fsck(object):
posiset += new
except IntervalError:
raise FsckError("%s: overlap in file offsets:\n"
"set: %s\nnew: %s\n",
"set: %s\nnew: %s",
path, str(posiset), str(new))

# check bulkdata
self.check_bulkdata(sid, path, bulk)

continue
# verify we can can open it with bulkdata
# Check that we can open bulkdata
try:
tab = None
try:
@@ -175,11 +198,43 @@ class Fsck(object):
except Exception as e:
raise FsckError("%s: can't open bulkdata: %s",
path, str(e))
self.check_bulkdata(path, tab)
finally:
if tab:
tab.close()

def fix_empty_subdir(self, subpath):
    """Report, and optionally remove, a bulkdata subdirectory without data.

    subpath: directory that was found to contain no data files.

    Never returns normally:
      - raises FixableFsckError when fixing is disabled (self.fix false);
      - raises FsckError when an entry not ending in ".removed" is present;
      - otherwise deletes subpath and raises RetryFsck.
    """
    msg = sprintf("bulkdata path %s is missing data files", subpath)
    if not self.fix:
        raise FixableFsckError(msg)
    err("\n%s\n", msg)
    # Automatic repair is only attempted when everything left behind
    # is a ".removed" file; the first other entry aborts the fix.
    blockers = [name for name in os.listdir(subpath)
                if not name.endswith(".removed")]
    if blockers:
        raise FsckError("can't fix automatically: please manually "
                        "remove the file %s and try again",
                        os.path.join(subpath, blockers[0]))
    # Remove the whole thing
    err("Removing empty subpath\n")
    shutil.rmtree(subpath)
    raise RetryFsck

def fix_bad_filesize(self, path, filepath, offset, row_size):
    """Report, and optionally truncate, a file with a partial trailing row.

    path: stream path, used only in the message.
    filepath: file whose size is wrong.
    offset: current size of the file in bytes.
    row_size: size of one row in bytes.

    Never returns normally: raises FixableFsckError when fixing is
    disabled (self.fix false); otherwise truncates the file down to a
    whole number of rows and raises RetryFsck.
    """
    leftover = offset % row_size
    msg = sprintf("%s: size of file %s (%d) is not a multiple"
                  " of row size (%d): %d extra bytes present",
                  path, filepath, offset, row_size, leftover)
    if self.fix:
        # Try to fix it by just truncating the file
        err("\n%s\n", msg)
        truncated_size = offset - leftover
        err("Truncating file to %d bytes and retrying\n", truncated_size)
        with open(filepath, "r+b") as datafile:
            datafile.truncate(truncated_size)
        raise RetryFsck
    raise FixableFsckError(msg)

@retry_if_raised(RetryFsck)
def check_bulkdata(self, sid, path, bulk):
with open(os.path.join(bulk, "_format"), "rb") as f:
fmt = pickle.load(f)
@@ -192,3 +247,61 @@ class Fsck(object):
if layout != self.stream_layout[sid]:
raise FsckError("%s: layout mismatch %s != %s", path,
layout, self.stream_layout[sid])

# Every file should have a size that's the multiple of the row size
rkt = nilmdb.server.rocket.Rocket(layout, None)
row_size = rkt.binary_size
rkt.close()

# Find all directories
regex = re.compile("^[0-9a-f]{4,}$")
subdirs = sorted(filter(regex.search, os.listdir(bulk)),
key = lambda x: int(x, 16), reverse = True)
for subdir in subdirs:
# Find all files in that dir
subpath = os.path.join(bulk, subdir)
files = filter(regex.search, os.listdir(subpath))
if not files:
self.fix_empty_subdir(subpath)
raise RetryFsck
# Verify that their size is a multiple of the row size
for filename in files:
filepath = os.path.join(subpath, filename)
offset = os.path.getsize(filepath)
if offset % row_size:
self.fix_bad_filesize(path, filepath, offset, row_size)

def check_intervals(self):
    """Check the stored intervals of every stream against its bulk table.

    For each stream id in self.stream_interval, opens the stream's
    bulkdata table and hands its intervals to check_table_intervals,
    advancing a single progress bar across all streams.

    Errors from opening a table or from check_table_intervals propagate
    to the caller; each table that was opened is always closed.
    """
    total_ints = sum(len(x) for x in self.stream_interval.values())
    log("checking %d intervals\n", total_ints)
    checked = 0
    with Progress(total_ints) as pbar:
        for sid in self.stream_interval:
            path = self.stream_path[sid]
            ints = self.stream_interval[sid]
            # Open the table before entering the try block.  If opening
            # fails there is nothing to close, and the cleanup below must
            # not touch an unbound name or the previous stream's table.
            tab = nilmdb.server.bulkdata.Table(self.bulkpath + path)
            try:
                def update(x):
                    # x is the index within this stream; offset it by the
                    # number of intervals already finished.
                    pbar.update(checked + x)
                self.check_table_intervals(path, ints, tab, update)
                checked += len(ints)
            finally:
                tab.close()

def check_table_intervals(self, path, ints, tab, update):
    """Make sure the endpoint rows of every interval can be read.

    path: stream path, used only in the error message.
    ints: sequence of (stime, etime, spos, epos) tuples.
    tab: table object indexable by row position.
    update: callback invoked with the index of each interval.

    Raises FsckError if reading row spos or row epos-1 fails for an
    interval with spos != epos.
    """
    tab.file_open.cache_remove_all()
    for idx, interval in enumerate(ints):
        (_stime, _etime, first, last) = interval
        update(idx)
        if first != last:
            # An interval with first == last has no rows, so there is
            # nothing to read for it.
            try:
                for pos in (first, last - 1):
                    tab[pos]
            except Exception as e:
                raise FsckError(
                    sprintf("%s: interval %s error accessing rows: %s",
                            path, str(interval), str(e)))

+ 4
- 3
nilmdb/scripts/nilmdb_fsck.py View File

@@ -13,11 +13,12 @@ def main():
formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-V", "--version", action="version",
version = nilmdb.__version__)
parser.add_argument('-d', '--database', help = 'Database directory',
default = "./db")
parser.add_argument('-y', dest="fix", action="store_true",
default=False, help = 'Fix errors')
parser.add_argument('database', help = 'Database directory')
args = parser.parse_args()

nilmdb.fsck.Fsck(args.database).check()
# The fix flag is a constructor argument of Fsck; check() takes none.
nilmdb.fsck.Fsck(args.database, args.fix).check()

if __name__ == "__main__":
main()

Loading…
Cancel
Save