Browse Source

Merge branch 'python-intervals'

tags/nilmdb-1.4.1
Jim Paris 9 years ago
parent
commit
4a50dd015e
6 changed files with 153 additions and 82 deletions
  1. +13
    -0
      docs/design.md
  2. +6
    -72
      nilmdb/server/interval.pyx
  3. +8
    -4
      nilmdb/server/nilmdb.py
  4. +1
    -0
      nilmdb/utils/__init__.py
  5. +106
    -0
      nilmdb/utils/interval.py
  6. +19
    -6
      tests/test_interval.py

+ 13
- 0
docs/design.md View File

@@ -186,6 +186,19 @@ IntervalSet speed
- rbtree and interval converted to cython:
8.4 μs, total 12 s, 134 MB RAM

- Would like to move Interval itself back to Python so other
non-cythonized code like client code can use it more easily.
Speed was measured by running only `test_interval`, with
`range(5,22)`, using `/usr/bin/time -v python tests/runtests.py`;
times recorded for 2097152:
- 52ae397 (Interval in cython):
12.6133 μs each, ratio 0.866533, total 47 sec, 399 MB RAM
- 9759dcf (Interval in python):
21.2937 μs each, ratio 1.462870, total 83 sec, 1107 MB RAM
That's a huge difference! Instead, will keep Interval and DBInterval
cythonized inside nilmdb, and just have an additional copy in
nilmdb.utils for clients to use.

Layouts
-------
Current/old design has specific layouts: RawData, PrepData, RawNotchedData.


+ 6
- 72
nilmdb/server/interval.pyx View File

@@ -1,5 +1,9 @@
"""Interval, IntervalSet

The Interval implemented here is just like
nilmdb.utils.interval.Interval, except implemented in Cython for
speed.

Represents an interval of time, and a set of such intervals.

Intervals are half-open, ie. they include data points with timestamps
@@ -23,6 +27,7 @@ from ..utils.time import min_timestamp as nilmdb_min_timestamp
from ..utils.time import max_timestamp as nilmdb_max_timestamp
from ..utils.time import timestamp_to_string
from ..utils.iterator import imerge
from ..utils.interval import IntervalError
import itertools

cimport rbtree
@@ -30,10 +35,6 @@ from libc.stdint cimport uint64_t, int64_t

ctypedef int64_t timestamp_t

class IntervalError(Exception):
"""Error due to interval overlap, etc"""
pass

cdef class Interval:
"""Represents an interval of time."""

@@ -59,17 +60,7 @@ cdef class Interval:

def __cmp__(self, Interval other):
"""Compare two intervals. If non-equal, order by start then end"""
if not isinstance(other, Interval):
raise TypeError("bad type")
if self.start == other.start:
if self.end < other.end:
return -1
if self.end > other.end:
return 1
return 0
if self.start < other.start:
return -1
return 1
return cmp(self.start, other.start) or cmp(self.end, other.end)

cpdef intersects(self, Interval other):
"""Return True if two Interval objects intersect"""
@@ -313,63 +304,6 @@ cdef class IntervalSet:
else:
yield subset

def set_difference(self, IntervalSet other not None,
Interval bounds = None):
"""
Compute the difference (self \\ other) between this
IntervalSet and the given IntervalSet; i.e., the ranges
that are present in 'self' but not 'other'.

If 'bounds' is not None, results are limited to the range
specified by the interval 'bounds'.

Returns a generator that yields each interval in turn.
Output intervals are built as subsets of the intervals in the
first argument (self).
"""
# Iterate through all starts and ends in sorted order. Add a
# tag to the iterator so that we can figure out which one they
# were, after sorting.
def decorate(it, key_start, key_end):
for i in it:
yield i.start, key_start, i
yield i.end, key_end, i
if bounds is None:
bounds = Interval(nilmdb_min_timestamp,
nilmdb_max_timestamp)
self_iter = decorate(self.intersection(bounds), 0, 2)
other_iter = decorate(other.intersection(bounds), 1, 3)

# Now iterate over the timestamps of each start and end.
# At each point, evaluate which type of end it is, to determine
# how to build up the output intervals.
self_interval = None
other_interval = None
out_start = None
for (ts, k, i) in imerge(self_iter, other_iter):
if k == 0:
# start self interval
self_interval = i
if other_interval is None:
out_start = ts
elif k == 1:
# start other interval
other_interval = i
if out_start is not None and out_start != ts:
yield self_interval.subset(out_start, ts)
out_start = None
elif k == 2:
# end self interval
if out_start is not None and out_start != ts:
yield self_interval.subset(out_start, ts)
out_start = None
self_interval = None
elif k == 3:
# end other interval
other_interval = None
if self_interval:
out_start = ts

cpdef intersects(self, Interval other):
"""Return True if this IntervalSet intersects another interval"""
for n in self.tree.intersect(other.start, other.end):


+ 8
- 4
nilmdb/server/nilmdb.py View File

@@ -12,8 +12,10 @@ Manages both the SQL database and the table storage backend.
from __future__ import absolute_import
import nilmdb.utils
from nilmdb.utils.printf import *
from nilmdb.server.interval import (Interval, DBInterval,
IntervalSet, IntervalError)

from nilmdb.utils.interval import IntervalError
from nilmdb.server.interval import Interval, DBInterval, IntervalSet

from nilmdb.server import bulkdata
from nilmdb.server.errors import NilmDBError, StreamError, OverlapError

@@ -328,7 +330,7 @@ class NilmDB(object):
List all intervals in 'path' between 'start' and 'end'. If
'diffpath' is not none, list instead the set-difference
between the intervals in the two streams; i.e. all interval
ranges that are present in 'path' but not 'path2'.
ranges that are present in 'path' but not 'diffpath'.

Returns (intervals, restart) tuple.

@@ -350,7 +352,9 @@ class NilmDB(object):
requested = Interval(start, end)
result = []
if diffpath:
getter = intervals.set_difference(diffintervals, requested)
getter = nilmdb.utils.interval.set_difference(
intervals.intersection(requested),
diffintervals.intersection(requested))
else:
getter = intervals.intersection(requested)
for n, i in enumerate(getter):


+ 1
- 0
nilmdb/utils/__init__.py View File

@@ -11,3 +11,4 @@ import nilmdb.utils.threadsafety
import nilmdb.utils.fallocate
import nilmdb.utils.time
import nilmdb.utils.iterator
import nilmdb.utils.interval

+ 106
- 0
nilmdb/utils/interval.py View File

@@ -0,0 +1,106 @@
"""Interval. Like nilmdb.server.interval, but re-implemented here
in plain Python so clients have easier access to it.

Intervals are half-open, ie. they include data points with timestamps
[start, end)
"""

import nilmdb.utils.time
import nilmdb.utils.iterator

class IntervalError(Exception):
    """Raised when an interval is invalid or misused (e.g. overlap)."""

# Interval
class Interval:
    """A half-open span of time, [start, end)."""

    def __init__(self, start, end):
        """
        Store the two endpoints; 'start' and 'end' are arbitrary
        numbers that represent time.

        Raises IntervalError unless 'start' is strictly before 'end'.
        """
        # Half-open intervals of zero (or negative) width contain
        # nothing, so they are rejected outright.
        if start >= end:
            raise IntervalError("start %s must precede end %s" % (start, end))
        self.start = start
        self.end = end

    def __repr__(self):
        pieces = [self.__class__.__name__, "(",
                  repr(self.start), ", ", repr(self.end), ")"]
        return "".join(pieces)

    def __str__(self):
        to_string = nilmdb.utils.time.timestamp_to_string
        return "[" + to_string(self.start) + " -> " + to_string(self.end) + ")"

    def __cmp__(self, other):
        """Order intervals by start; ties are broken by end."""
        by_start = cmp(self.start, other.start)
        if by_start:
            return by_start
        return cmp(self.end, other.end)

    def intersects(self, other):
        """Return True if this Interval and 'other' share any time.

        Raises TypeError if 'other' is not an Interval.
        """
        if not isinstance(other, Interval):
            raise TypeError("need an Interval")
        # Disjoint exactly when one interval finishes at or before the
        # point where the other begins.
        disjoint = self.end <= other.start or self.start >= other.end
        return not disjoint

    def subset(self, start, end):
        """Return a new Interval covering [start, end) within this one.

        Raises IntervalError if the requested range extends past
        either endpoint of this interval.
        """
        # A subclass that tracks additional data might override this.
        if start < self.start or end > self.end:
            raise IntervalError("not a subset")
        return Interval(start, end)

def set_difference(a, b):
    """
    Compute the difference (a \\ b) between the intervals in 'a' and
    the intervals in 'b'; i.e., the ranges that are present in 'a'
    but not 'b'.

    'a' and 'b' must both be iterables of interval objects exposing
    'start', 'end' and 'subset(start, end)'.  The merge below consumes
    the start/end events in timestamp order, so each iterable is
    expected to yield its intervals already sorted.

    Returns a generator that yields each interval in turn.
    Output intervals are built as subsets of the intervals in the
    first argument (a).
    """
    # Iterate through all starts and ends in sorted order.  Add a
    # tag to the iterator so that we can figure out which one they
    # were, after sorting.  The tag values also fix the processing
    # order of events that share a timestamp:
    #   0 = start of a, 1 = start of b, 2 = end of a, 3 = end of b.
    def decorate(it, key_start, key_end):
        for i in it:
            yield i.start, key_start, i
            yield i.end, key_end, i
    a_iter = decorate(iter(a), 0, 2)
    b_iter = decorate(iter(b), 1, 3)

    # Now iterate over the timestamps of each start and end.
    # At each point, evaluate which type of end it is, to determine
    # how to build up the output intervals.
    a_interval = None   # interval from 'a' currently open, if any
    b_interval = None   # interval from 'b' currently open, if any
    out_start = None    # start of the pending output range, if any
    for (ts, k, i) in nilmdb.utils.iterator.imerge(a_iter, b_iter):
        if k == 0:
            # start a interval
            a_interval = i
            if b_interval is None:
                out_start = ts
        elif k == 1:
            # start b interval
            b_interval = i
            # Skip zero-width output when 'a' and 'b' start together.
            if out_start is not None and out_start != ts:
                yield a_interval.subset(out_start, ts)
            out_start = None
        elif k == 2:
            # end a interval
            if out_start is not None and out_start != ts:
                yield a_interval.subset(out_start, ts)
            out_start = None
            a_interval = None
        elif k == 3:
            # end b interval
            b_interval = None
            # Explicit None test: do not depend on the truthiness of
            # whatever interval type the caller supplied.
            if a_interval is not None:
                out_start = ts

+ 19
- 6
tests/test_interval.py View File

@@ -8,8 +8,11 @@ from nose.tools import *
from nose.tools import assert_raises
import itertools

from nilmdb.server.interval import (Interval, DBInterval,
IntervalSet, IntervalError)
from nilmdb.utils.interval import IntervalError
from nilmdb.server.interval import Interval, DBInterval, IntervalSet

# so we can test them separately
from nilmdb.utils.interval import Interval as UtilsInterval

from testutil.helpers import *
import unittest
@@ -47,6 +50,15 @@ def makeset(string):
return iset

class TestInterval:
def test_client_interval(self):
    """Re-run the Interval tests against the pure-Python Interval.

    The module-level name 'Interval' is temporarily rebound to
    UtilsInterval so that test_interval and test_interval_intersect
    exercise that implementation instead.
    """
    # Run interval tests against the Python version of Interval.
    global Interval
    NilmdbInterval = Interval
    Interval = UtilsInterval
    try:
        self.test_interval()
        self.test_interval_intersect()
    finally:
        # Always restore the original class; otherwise a failure above
        # would leave every later test running against UtilsInterval.
        Interval = NilmdbInterval

def test_interval(self):
# Test Interval class
os.environ['TZ'] = "America/New_York"
@@ -222,7 +234,7 @@ class TestInterval:
eq_(ab,c)

# a \ b == d
eq_(IntervalSet(a.set_difference(b)), d)
eq_(IntervalSet(nilmdb.utils.interval.set_difference(a,b)), d)

# Intersection with intervals
do_test(makeset("[---|---)[)"),
@@ -287,10 +299,11 @@ class TestInterval:
b = makeset("[-) [--) [)")
c = makeset("[----) ")
d = makeset(" [-) ")
eq_(a.set_difference(b, list(c)[0]), d)
eq_(nilmdb.utils.interval.set_difference(
a.intersection(list(c)[0]), b.intersection(list(c)[0])), d)

# Empty second set
eq_(a.set_difference(IntervalSet()), a)
eq_(nilmdb.utils.interval.set_difference(a, IntervalSet()), a)

class TestIntervalDB:
def test_dbinterval(self):
@@ -379,7 +392,7 @@ class TestIntervalSpeed:
print
yappi.start()
speeds = {}
limit = 10 # was 20
limit = 22 # was 20
for j in [ 2**x for x in range(5,limit) ]:
start = time.time()
iset = IntervalSet()


Loading…
Cancel
Save