Compare commits


7 Commits

ba55ad82f0  Use a pure-python version of bisect_left, to fix 32-bit issues  (2015-01-20 18:31:58 -05:00)

    The default bisect module includes a fast C implementation, which
    requires that array indices fit within the system "long" type.  For
    32-bit systems, that's not acceptable, as the table indices for raw
    data can exceed 2^32 very quickly.  A pure-Python version works fine.
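A minimal sketch of the failure mode (the table class and numbers here are hypothetical stand-ins, not from the repo): the C implementation converts its lo/hi arguments to the platform's ssize_t, so on a 32-bit build any row number above 2^31 - 1 raises OverflowError before a single comparison is made, while a plain Python loop accepts arbitrary ints.

    class SparseTable(object):
        # Stand-in for nilmdb's on-disk table: returns a timestamp for
        # any row number without materializing a list.
        def __getitem__(self, row):
            return row * 8000

    table = SparseTable()
    x = (2**40 + 12345) * 8000
    lo, hi = 2**40, 2**40 + 1000000
    # bisect.bisect_left(table, x, lo, hi) raises OverflowError here on
    # a 32-bit build; the pure-Python equivalent below does not.
    while lo < hi:
        mid = (lo + hi) // 2
        if table[mid] < x:
            lo = mid + 1
        else:
            hi = mid
    print(lo)   # 2**40 + 12345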
45c81d2019  Fix test that would fail if reordered, or in a different timezone  (2015-01-18 17:50:54 -05:00)
78cfda32e3  Handle another exception from some versions of dateutil.parser  (2015-01-18 17:50:54 -05:00)
3658d3876b  Rename deprecated config option  (2015-01-18 17:50:54 -05:00)

    The new version works in CherryPy 3.2.
022b50950f  Support using a higher initial nrows in bulkdata, for tests  (2015-01-18 17:49:52 -05:00)

    This gives an easy way to get large values in the database start_pos
    and end_pos fields, which is necessary for testing failure modes when
    those get too large (e.g. on 32-bit systems).  Adjust tests to make
    use of this knob.
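As a hedged usage sketch (the constructor call and the path are assumed from the kwargs handling in the bulkdata diff below, not taken from the repo's docs):

    from nilmdb.server import bulkdata
    # Start row numbering near 2**40 so that every start_pos/end_pos
    # recorded in the database is far too large for a 32-bit long.
    data = bulkdata.BulkData("/tmp/testdb", initial_nrows = 2**40)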
e5efbadc8e  fsck: row indices are too big for slice.indices, so calculate manually  (2015-01-18 16:36:56 -05:00)

    Normally, indices for an array are expected to fit in a platform's
    native long (32 or 64 bits).  In nilmdb, tables aren't real arrays,
    and we need to handle unbounded indices.
74f633c9da  Distribute was merged back into setuptools, so use setuptools  (2015-01-18 16:33:58 -05:00)
8 changed files with 57 additions and 31 deletions

View File

@@ -425,11 +425,15 @@ class Fsck(object):
         for intv in ints:
             last_ts = None
             (stime, etime, spos, epos) = intv
             if spos == epos:
                 continue
-            for start in xrange(*slice(spos, epos, maxrows).indices(epos)):
+            # Break interval into maxrows-sized chunks
+            next_start = spos
+            while next_start < epos:
+                start = next_start
                 stop = min(start + maxrows, epos)
                 count = stop - start
+                next_start = stop
                 # Get raw data, convert to NumPy array
                 try:
                     raw = tab.get_data(start, stop, binary = True)
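A standalone sketch of the new chunking loop with hypothetical values; on a 32-bit build the old one-liner, slice(spos, epos, maxrows).indices(epos), raises OverflowError once epos exceeds sys.maxsize, while the explicit loop below handles arbitrarily large ints:

    spos, epos, maxrows = 2**40, 2**40 + 50000, 23000
    next_start = spos
    while next_start < epos:
        start = next_start
        stop = min(start + maxrows, epos)
        count = stop - start
        next_start = stop
        print(start, stop, count)   # chunks of at most maxrows rows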

View File

@@ -43,6 +43,12 @@ class BulkData(object):
         # 32768 files per dir should work even on FAT32
         self.files_per_dir = 32768

+        if "initial_nrows" in kwargs:
+            self.initial_nrows = kwargs["initial_nrows"]
+        else:
+            # First row is 0
+            self.initial_nrows = 0
+
         # Make root path
         if not os.path.isdir(self.root):
             os.mkdir(self.root)
@@ -254,7 +260,7 @@ class BulkData(object):
         path = self._encode_filename(unicodepath)
         elements = path.lstrip('/').split('/')
         ospath = os.path.join(self.root, *elements)
-        return Table(ospath)
+        return Table(ospath, self.initial_nrows)

 @nilmdb.utils.must_close(wrap_verify = False)
 class Table(object):
@@ -291,9 +297,10 @@ class Table(object):
         pickle.dump(fmt, f, 2)

     # Normal methods
-    def __init__(self, root):
+    def __init__(self, root, initial_nrows):
         """'root' is the full OS path to the directory of this table"""
         self.root = root
+        self.initial_nrows = initial_nrows

         # Load the format
         with open(os.path.join(self.root, "_format"), "rb") as f:
@@ -353,8 +360,14 @@ class Table(object):
             # Convert to row number
             return self._row_from_offset(subdir, filename, offset)

-        # No files, so no data
-        return 0
+        # No files, so no data.  We typically start at row 0 in this
+        # case, although initial_nrows is specified during some tests
+        # to exercise other parts of the code better.  Since we have
+        # no files yet, round initial_nrows up so it points to a row
+        # that would begin a new file.
+        nrows = ((self.initial_nrows + (self.rows_per_file - 1)) //
+                 self.rows_per_file) * self.rows_per_file
+        return nrows

     def _offset_from_row(self, row):
         """Return a (subdir, filename, offset, count) tuple:

View File

@@ -23,7 +23,6 @@ from nilmdb.server.errors import NilmDBError, StreamError, OverlapError
 import sqlite3
 import os
 import errno
-import bisect

 # Note about performance and transactions:
 #
@@ -516,6 +515,17 @@ class NilmDB(object):
         # And that's all
         return

+    def _bisect_left(self, a, x, lo, hi):
+        # Like bisect.bisect_left, but doesn't choke on large indices on
+        # 32-bit systems, like bisect's fast C implementation does.
+        while lo < hi:
+            mid = (lo + hi) / 2
+            if a[mid] < x:
+                lo = mid + 1
+            else:
+                hi = mid
+        return lo
+
     def _find_start(self, table, dbinterval):
         """
         Given a DBInterval, find the row in the database that
@@ -526,10 +536,10 @@ class NilmDB(object):
         # Optimization for the common case where an interval wasn't truncated
         if dbinterval.start == dbinterval.db_start:
             return dbinterval.db_startpos
-        return bisect.bisect_left(table,
-                                  dbinterval.start,
-                                  dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+        return self._bisect_left(table,
+                                 dbinterval.start,
+                                 dbinterval.db_startpos,
+                                 dbinterval.db_endpos)

     def _find_end(self, table, dbinterval):
         """
@@ -545,10 +555,10 @@ class NilmDB(object):
         # want to include the given timestamp in the results.  This is
         # so queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
         # non-overlapping data.
-        return bisect.bisect_left(table,
-                                  dbinterval.end,
-                                  dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+        return self._bisect_left(table,
+                                 dbinterval.end,
+                                 dbinterval.db_startpos,
+                                 dbinterval.db_endpos)

     def stream_extract(self, path, start = None, end = None,
                        count = False, markup = False, binary = False):
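As a sanity check (hypothetical harness, not a test from the repo), the pure-Python search agrees with the C implementation on ordinary in-memory lists:

    import bisect

    def bisect_left_py(a, x, lo, hi):
        # Same algorithm as _bisect_left above, with // so it also runs
        # unchanged under Python 3.
        while lo < hi:
            mid = (lo + hi) // 2
            if a[mid] < x:
                lo = mid + 1
            else:
                hi = mid
        return lo

    a = [10, 20, 20, 30]
    for x in (5, 10, 20, 25, 35):
        assert bisect_left_py(a, x, 0, len(a)) == bisect.bisect_left(a, x)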

View File

@@ -429,7 +429,7 @@ class Server(object):
         cherrypy.config.update({
             'server.socket_host': host,
             'server.socket_port': port,
-            'engine.autoreload_on': False,
+            'engine.autoreload.on': False,
             'server.max_request_body_size': 8*1024*1024,
             })
         if self.embedded:
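For reference, the renamed setting in isolation (a minimal standalone sketch, not the server code above): CherryPy 3.2 configures engine plugins through dotted keys, and the flat key is the deprecated spelling.

    import cherrypy
    # Old, deprecated spelling: 'engine.autoreload_on': False
    cherrypy.config.update({'engine.autoreload.on': False})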

View File

@@ -87,7 +87,7 @@ def parse_time(toparse):
     try:
         return unix_to_timestamp(datetime_tz.datetime_tz.
                                  smartparse(toparse).totimestamp())
-    except (ValueError, OverflowError):
+    except (ValueError, OverflowError, TypeError):
         pass

     # If it's parseable as a float, treat it as a Unix or NILM
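With TypeError now swallowed, an input that makes some dateutil versions blow up inside smartparse falls through to the numeric branches below instead of crashing the caller. A hedged sketch (the behavior is inferred from the comment that follows, not verified against every dateutil release):

    import nilmdb.utils.time
    # A bare NILM timestamp (microseconds) is not a calendar date; it
    # should end up handled by the float-parsing path, not raise.
    print(nilmdb.utils.time.parse_time("1332561600000000"))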

View File

@@ -6,15 +6,6 @@
 # Then just package it up:
 # python setup.py sdist

-# This is supposed to be using Distribute:
-#
-#   distutils provides a "setup" method.
-#   setuptools is a set of monkeypatches on top of that.
-#   distribute is a particular version/implementation of setuptools.
-#
-# So we don't really know if this is using the old setuptools or the
-# Distribute-provided version of setuptools.
-
 import traceback
 import sys
 import os
@@ -109,7 +100,7 @@ setup(name='nilmdb',
                          'coverage',
                          'numpy',
                          ],
-      setup_requires = [ 'distribute',
+      setup_requires = [ 'setuptools',
                          ],
       install_requires = [ 'decorator',
                            'cherrypy >= 3.2',

View File

@@ -834,9 +834,12 @@ class TestCmdline(object):
     def test_13_files(self):
         # Test BulkData's ability to split into multiple files,
         # by forcing the file size to be really small.
+        # Also increase the initial nrows, so that start/end positions
+        # in the database are very large (> 32 bit)
         server_stop()
         server_start(bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })

         # Fill data
         self.ok("create /newton/prep float32_8")
@@ -888,7 +891,8 @@ class TestCmdline(object):
         server_stop()
         server_start(max_removals = 4321,
                      bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })
         self.do_remove_files()
         self.ok("destroy -R /newton/prep") # destroy again
@@ -897,7 +901,8 @@ class TestCmdline(object):
         server_stop()
         server_start(max_int_removals = 1,
                      bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })
         self.do_remove_files()

     def do_remove_files(self):

View File

@@ -1,5 +1,6 @@
 import nilmdb
 from nilmdb.utils.printf import *
+from nilmdb.utils import datetime_tz

 from nose.tools import *
 from nose.tools import assert_raises
@@ -19,6 +20,8 @@ class TestTimestamper(object):
         def join(list):
             return "\n".join(list) + "\n"

+        datetime_tz.localtz_set("America/New_York")
+
         start = nilmdb.utils.time.parse_time("03/24/2012")
         lines_in = [ "hello", "world", "hello world", "# commented out" ]
         lines_out = [ "1332561600000000 hello",
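Pinning the zone is what makes the expected output stable; a small verification sketch of the first expected value (imports as in the diff above):

    from nilmdb.utils import datetime_tz
    import nilmdb.utils.time

    datetime_tz.localtz_set("America/New_York")
    # Midnight 2012-03-24 in America/New_York is EDT (UTC-4), i.e.
    # 04:00 UTC = Unix 1332561600; nilmdb timestamps are microseconds.
    assert nilmdb.utils.time.parse_time("03/24/2012") == 1332561600000000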