Browse Source

Optimization that uses slices on the table rather than checking each

row individually, when extracting data.

Switch to using bisect module when doing the bisection, to lessen the
chance of errors.

Added syslog ability for timer module, for timing stuff deep inside
the server.

Make the chunked/non-chunked test just give a warning, rather than
failing the tests, for debugging purposes.  An alternate approach
would be to disable "die on error" for the tests.


git-svn-id: https://bucket.mit.edu/svn/nilm/nilmdb@10896 ddd99763-3ecb-0310-9145-efcb8ce7c51f
tags/bxinterval-last
Jim Paris 11 years ago
parent
commit
97bec3b1ee
4 changed files with 84 additions and 40 deletions
  1. +54
    -31
      nilmdb/nilmdb.py
  2. +7
    -2
      nilmdb/timer.py
  3. +16
    -5
      tests/test_client.py
  4. +7
    -2
      tests/test_cmdline.py

+ 54
- 31
nilmdb/nilmdb.py View File

@@ -19,6 +19,7 @@ import time
import sys
import os
import errno
import bisect

import pyximport
pyximport.install()
@@ -86,6 +87,13 @@ class StreamError(NilmDBError):
class OverlapError(NilmDBError):
pass

# Helper that lets us pass a Pytables table into bisect
class BisectableTable(object):
def __init__(self, table):
self.table = table
def __getitem__(self, index):
return self.table[index][0]

class NilmDB(object):
verbose = 0

@@ -379,24 +387,36 @@ class NilmDB(object):
def _find_start(self, table, interval):
"""
Given a DBInterval, find the row in the database that
corresponds to the start time. Here, we perform a binary
search between 'db_startpos' and 'db_endpos' and return the
first database position with a timestamp (first element)
greater than or equal to 'start'.
corresponds to the start time. Return the first database
position with a timestamp (first element) greater than or
equal to 'start'.
"""
# Optimization for the common case where an interval wasn't truncated
if interval.start == interval.db_start:
return interval.db_startpos
lo = interval.db_startpos
hi = interval.db_endpos - 1
x = interval.start
while lo < hi:
mid = (lo + hi) // 2
if table[mid][0] < x:
lo = mid + 1
else:
hi = mid
return lo
return bisect.bisect_left(BisectableTable(table),
interval.start,
interval.db_startpos,
interval.db_endpos)

def _find_end(self, table, interval):
"""
Given a DBInterval, find the row in the database that follows
the end time. Return the first database position whose
timestamp (first element) is greater than or equal to 'end',
so that the row at 'end' itself is excluded from the slice.
"""
# Optimization for the common case where an interval wasn't truncated
if interval.end == interval.db_end:
return interval.db_endpos
# Note that we still use bisect_left here, because we don't
# want to include the given timestamp in the results. This is
# so that queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
# non-overlapping data.
return bisect.bisect_left(BisectableTable(table),
interval.end,
interval.db_startpos,
interval.db_endpos)

def stream_extract(self, path, start = None, end = None):
"""
@@ -417,25 +437,28 @@ class NilmDB(object):
intervals = self._get_intervals(stream_id)
requested = Interval(start or 0, end or 1e12)
result = []
n = 0
remaining = self.max_results
restart = 0
for interval in intervals.intersection(requested):
# Find row corresponding to interval start
row = self._find_start(table, interval)

# Gather results until we hit the row limit or the
# endpoint.
while table[row][0] < end:
result.append(table[row])
row += 1
if row >= interval.db_endpos:
break
n += 1
if n >= self.max_results:
restart = table[row][0]
break

# If restart is set, stop now
# Reading single rows from the table is too slow, so
# we use two bisections to find both the starting and
# ending row for this particular interval, then
# read the entire range as one slice.
row_start = self._find_start(table, interval)
row_end = self._find_end(table, interval)

# Shorten it if we'll hit the maximum number of results
row_max = row_start + remaining
if row_max < row_end:
row_end = row_max
restart = table[row_max][0]

# Gather these results up
result.extend(table[row_start:row_end])

# Count them
remaining -= row_end - row_start

if restart:
break



+ 7
- 2
nilmdb/timer.py View File

@@ -9,8 +9,13 @@ import contextlib
import time

@contextlib.contextmanager
def Timer(name = None):
def Timer(name = None, tosyslog = False):
start = time.time()
yield
elapsed = int((time.time() - start) * 1000)
print (name or 'elapsed') + ": " + str(elapsed) + " ms"
msg = (name or 'elapsed') + ": " + str(elapsed) + " ms"
if tosyslog: # pragma: no cover
import syslog
syslog.syslog(msg)
else:
print msg

+ 16
- 5
tests/test_client.py View File

@@ -14,6 +14,7 @@ import threading
import cStringIO
import simplejson as json
import unittest
import warnings

from test_helpers import *

@@ -170,7 +171,14 @@ class TestClient(object):
in_("400 Bad Request", str(e.exception))
in_("OverlapError", str(e.exception))

def test_client_4_generators(self):
def test_client_4_extract(self):
# Misc tests for extract. Most of them are in test_cmdline.
client = nilmdb.Client(url = "http://localhost:12380/")

for x in client.stream_extract("/newton/prep", 123, 123):
raise Exception("shouldn't be any data for this request")

def test_client_5_generators(self):
# A lot of the client functionality is already tested by test_cmdline,
# but this gets a bit more coverage that cmdline misses.
client = nilmdb.Client(url = "http://localhost:12380/")
@@ -218,8 +226,7 @@ class TestClient(object):
in_("404 Not Found", str(e.exception))
in_("No such stream", str(e.exception))

#@unittest.skip("while debugging")
def test_client_5_chunked(self):
def test_client_6_chunked(self):
# Make sure that /stream/intervals and /stream/extract
# properly return streaming, chunked response. Pokes around
# in client.http internals a bit to look at the response
@@ -227,13 +234,17 @@ class TestClient(object):

client = nilmdb.Client(url = "http://localhost:12380/")

# Use a warning rather than returning a test failure, so that we can
# still disable chunked responses for debugging.
x = client.http.get("stream/intervals", { "path": "/newton/prep" },
retjson=False)
eq_(x.count('\n'), 2)
in_("transfer-encoding: chunked", client.http._headers.lower())
if "transfer-encoding: chunked" not in client.http._headers.lower():
warnings.warn("Non-chunked HTTP response for /stream/intervals")

x = client.http.get("stream/extract",
{ "path": "/newton/prep",
"start": "123",
"end": "123" }, retjson=False)
in_("transfer-encoding: chunked", client.http._headers.lower())
if "transfer-encoding: chunked" not in client.http._headers.lower():
warnings.warn("Non-chunked HTTP response for /stream/extract")

+ 7
- 2
tests/test_cmdline.py View File

@@ -378,11 +378,15 @@ class TestCmdline(object):
self.fail("extract /no/such/foo --start 2000-01-01 --end 2020-01-01")
self.contain("Error getting stream info")

# empty range
# empty ranges
self.fail("extract -a /newton/prep " +
"--start '23 Mar 2012 10:00:30' " +
"--end '23 Mar 2012 10:00:30'", exitcode = 2)
self.contain("no data")
self.fail("extract -a /newton/prep " +
"--start '23 Mar 2012 10:00:30.000001' " +
"--end '23 Mar 2012 10:00:30.000001'", exitcode = 2)
self.contain("no data")

# Check various dumps against stored copies of how they should appear
def test(file, start, end, extra=""):
@@ -406,8 +410,9 @@ class TestCmdline(object):
"--end '23 Mar 2112 10:00:30'", exitcode = 2)
self.contain("no data")

# all data put in by tests
self.ok("extract -a /newton/prep --start 2000-01-01 --end 2020-01-01")
print self.captured.count('\n')
eq_(self.captured.count('\n'), 43204)

def test_cmdline_9_truncated(self):
# Test truncated responses by overriding the nilmdb max_results


Loading…
Cancel
Save