Fix nilmdb-fsck issue caused by 022b50950f

Use a pure-python version of bisect_left, to fix 32-bit issues
The default bisect module includes a fast C implementation, which requires that array indices fit within the system "long" type. For 32-bit systems, that's not acceptable, as the table indices for raw data can exceed 2^32 very quickly. A pure python version works fine.
2015-06-24 22:14:27 -04:00 · 2015-01-20 18:31:58 -05:00 · 2015-01-18 17:50:54 -05:00 · 2015-01-18 17:50:54 -05:00 · 2015-01-18 17:50:54 -05:00 · 2015-01-18 17:49:52 -05:00
8 changed files with 57 additions and 31 deletions
--- a/nilmdb/fsck/fsck.py
+++ b/nilmdb/fsck/fsck.py
@@ -425,11 +425,15 @@ class Fsck(object):
        for intv in ints:
            last_ts = None
            (stime, etime, spos, epos) = intv
-            if spos == epos:
+
-                continue
+            # Break interval into maxrows-sized chunks
-            for start in xrange(*slice(spos, epos, maxrows).indices(epos)):
+            next_start = spos
            while next_start < epos:
                start = next_start
                stop = min(start + maxrows, epos)
                count = stop - start
                next_start = stop
                # Get raw data, convert to NumPy array
                try:
                    raw = tab.get_data(start, stop, binary = True)
--- a/nilmdb/server/bulkdata.py
+++ b/nilmdb/server/bulkdata.py
@@ -43,6 +43,12 @@ class BulkData(object):
            # 32768 files per dir should work even on FAT32
            self.files_per_dir = 32768
        if "initial_nrows" in kwargs:
            self.initial_nrows = kwargs["initial_nrows"]
        else:
            # First row is 0
            self.initial_nrows = 0
        # Make root path
        if not os.path.isdir(self.root):
            os.mkdir(self.root)
@@ -254,7 +260,7 @@ class BulkData(object):
        path = self._encode_filename(unicodepath)
        elements = path.lstrip('/').split('/')
        ospath = os.path.join(self.root, *elements)
-        return Table(ospath)
+        return Table(ospath, self.initial_nrows)
@nilmdb.utils.must_close(wrap_verify = False)
 class Table(object):
@@ -291,9 +297,10 @@ class Table(object):
            pickle.dump(fmt, f, 2)
    # Normal methods
-    def __init__(self, root):
+    def __init__(self, root, initial_nrows = 0):
        """'root' is the full OS path to the directory of this table"""
        self.root = root
        self.initial_nrows = initial_nrows
        # Load the format
        with open(os.path.join(self.root, "_format"), "rb") as f:
@@ -353,8 +360,14 @@ class Table(object):
            # Convert to row number
            return self._row_from_offset(subdir, filename, offset)
-        # No files, so no data
+        # No files, so no data.  We typically start at row 0 in this
-        return 0
+        # case, although initial_nrows is specified during some tests
        # to exercise other parts of the code better.  Since we have
        # no files yet, round initial_nrows up so it points to a row
        # that would begin a new file.
        nrows = ((self.initial_nrows + (self.rows_per_file - 1)) //
                 self.rows_per_file) * self.rows_per_file
        return nrows
    def _offset_from_row(self, row):
        """Return a (subdir, filename, offset, count) tuple:
--- a/nilmdb/server/nilmdb.py
+++ b/nilmdb/server/nilmdb.py
@@ -23,7 +23,6 @@ from nilmdb.server.errors import NilmDBError, StreamError, OverlapError
 import sqlite3
 import os
 import errno
 import bisect
 # Note about performance and transactions:
 #
@@ -516,6 +515,17 @@ class NilmDB(object):
        # And that's all
        return
    def _bisect_left(self, a, x, lo, hi):
        # Like bisect.bisect_left, but doesn't choke on large indices on
        # 32-bit systems, like bisect's fast C implementation does.
        while lo < hi:
            mid = (lo + hi) / 2
            if a[mid] < x:
                lo = mid + 1
            else:
                hi = mid
        return lo
    def _find_start(self, table, dbinterval):
        """
        Given a DBInterval, find the row in the database that
@@ -526,10 +536,10 @@ class NilmDB(object):
        # Optimization for the common case where an interval wasn't truncated
        if dbinterval.start == dbinterval.db_start:
            return dbinterval.db_startpos
-        return bisect.bisect_left(table,
+        return self._bisect_left(table,
-                                  dbinterval.start,
+                                 dbinterval.start,
-                                  dbinterval.db_startpos,
+                                 dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+                                 dbinterval.db_endpos)
    def _find_end(self, table, dbinterval):
        """
@@ -545,10 +555,10 @@ class NilmDB(object):
        # want to include the given timestamp in the results.  This is
        # so that queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
        # non-overlapping data.
-        return bisect.bisect_left(table,
+        return self._bisect_left(table,
-                                  dbinterval.end,
+                                 dbinterval.end,
-                                  dbinterval.db_startpos,
+                                 dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+                                 dbinterval.db_endpos)
    def stream_extract(self, path, start = None, end = None,
                       count = False, markup = False, binary = False):
--- a/nilmdb/server/server.py
+++ b/nilmdb/server/server.py
@@ -429,7 +429,7 @@ class Server(object):
        cherrypy.config.update({
            'server.socket_host': host,
            'server.socket_port': port,
-            'engine.autoreload_on': False,
+            'engine.autoreload.on': False,
            'server.max_request_body_size': 8*1024*1024,
            })
        if self.embedded:
--- a/nilmdb/utils/time.py
+++ b/nilmdb/utils/time.py
@@ -87,7 +87,7 @@ def parse_time(toparse):
    try:
        return unix_to_timestamp(datetime_tz.datetime_tz.
                                 smartparse(toparse).totimestamp())
-    except (ValueError, OverflowError):
+    except (ValueError, OverflowError, TypeError):
        pass
    # If it's parseable as a float, treat it as a Unix or NILM
--- a/setup.py
+++ b/setup.py
@@ -6,15 +6,6 @@
 # Then just package it up:
 #   python setup.py sdist
 # This is supposed to be using Distribute:
 #
 #   distutils provides a "setup" method.
 #   setuptools is a set of monkeypatches on top of that.
 #   distribute is a particular version/implementation of setuptools.
 #
 # So we don't really know if this is using the old setuptools or the
 # Distribute-provided version of setuptools.
 import traceback
 import sys
 import os
@@ -109,7 +100,7 @@ setup(name='nilmdb',
                        'coverage',
                        'numpy',
                        ],
-      setup_requires = [ 'distribute',
+      setup_requires = [ 'setuptools',
                         ],
      install_requires = [ 'decorator',
                           'cherrypy >= 3.2',
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -834,9 +834,12 @@ class TestCmdline(object):
    def test_13_files(self):
        # Test BulkData's ability to split into multiple files,
        # by forcing the file size to be really small.
        # Also increase the initial nrows, so that start/end positions
        # in the database are very large (> 32 bit)
        server_stop()
        server_start(bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
                                       "initial_nrows" : 2**40 })
        # Fill data
        self.ok("create /newton/prep float32_8")
@@ -888,7 +891,8 @@ class TestCmdline(object):
        server_stop()
        server_start(max_removals = 4321,
                     bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
                                       "initial_nrows" : 2**40 })
        self.do_remove_files()
        self.ok("destroy -R /newton/prep") # destroy again
@@ -897,7 +901,8 @@ class TestCmdline(object):
        server_stop()
        server_start(max_int_removals = 1,
                     bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
                                       "initial_nrows" : 2**40 })
        self.do_remove_files()
    def do_remove_files(self):
--- a/tests/test_timestamper.py
+++ b/tests/test_timestamper.py
@@ -1,5 +1,6 @@
 import nilmdb
 from nilmdb.utils.printf import *
 from nilmdb.utils import datetime_tz
 from nose.tools import *
 from nose.tools import assert_raises
@@ -19,6 +20,8 @@ class TestTimestamper(object):
        def join(list):
            return "\n".join(list) + "\n"
        datetime_tz.localtz_set("America/New_York")
        start = nilmdb.utils.time.parse_time("03/24/2012")
        lines_in  = [ "hello", "world", "hello world", "# commented out" ]
        lines_out = [ "1332561600000000 hello",
Author	SHA1	Message	Date
Jim Paris	8125d9c840	Fix nilmdb-fsck issue caused by `022b50950f`	2015-06-24 22:14:27 -04:00
Jim Paris	ba55ad82f0	Use a pure-python version of bisect_left, to fix 32-bit issues The default bisect module includes a fast C implementation, which requires that array indices fit within the system "long" type. For 32-bit systems, that's not acceptable, as the table indices for raw data can exceed 2^32 very quickly. A pure python version works fine.	2015-01-20 18:31:58 -05:00
Jim Paris	45c81d2019	Fix test that would fail if reordered, or in a different timezone	2015-01-18 17:50:54 -05:00
Jim Paris	78cfda32e3	Handle another exception from some versions of dateutil.parser	2015-01-18 17:50:54 -05:00
Jim Paris	3658d3876b	Rename deprecated config option The new version works in Cherrypy 3.2	2015-01-18 17:50:54 -05:00
Jim Paris	022b50950f	Support using a higher initial nrows in bulkdata, for tests This gives an easy way to get large values in the database start_pos and end_pos fields, which is necessary for testing failure modes when those get too large (e.g. on 32-bit systems). Adjust tests to make use of this knob.	2015-01-18 17:49:52 -05:00
Jim Paris	e5efbadc8e	fsck: row indices are too big for slice.indices, so calculate manually Normally, indexes for an array are expected to fit in a platform's native long (32 or 64-bit). In nilmdb, tables aren't real arrays and we need to handle unbounded indices.	2015-01-18 16:36:56 -05:00
Jim Paris	74f633c9da	Distribute was merged back into setuptools, so use setuptools	2015-01-18 16:33:58 -05:00