Compare commits


7 Commits

ba55ad82f0  Use a pure-python version of bisect_left, to fix 32-bit issues  (2015-01-20 18:31:58 -05:00)

    The default bisect module includes a fast C implementation, which
    requires that array indices fit within the system "long" type.  For
    32-bit systems, that's not acceptable, as the table indices for raw
    data can exceed 2^32 very quickly.  A pure-Python version works fine.
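A minimal sketch of the failure mode (the table class and numbers here are hypothetical stand-ins, not from the repo): the C implementation converts its lo/hi arguments to the platform's ssize_t, so on a 32-bit build any row number above 2^31 - 1 raises OverflowError before a single comparison is made, while a plain Python loop accepts arbitrary ints.

    class SparseTable(object):
        # Stand-in for nilmdb's on-disk table: returns a timestamp for
        # any row number without materializing a list.
        def __getitem__(self, row):
            return row * 8000

    table = SparseTable()
    x = (2**40 + 12345) * 8000
    lo, hi = 2**40, 2**40 + 1000000
    # bisect.bisect_left(table, x, lo, hi) raises OverflowError here on
    # a 32-bit build; the pure-Python equivalent below does not.
    while lo < hi:
        mid = (lo + hi) // 2
        if table[mid] < x:
            lo = mid + 1
        else:
            hi = mid
    print(lo)   # 2**40 + 12345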
45c81d2019  Fix test that would fail if reordered, or in a different timezone  (2015-01-18 17:50:54 -05:00)
78cfda32e3  Handle another exception from some versions of dateutil.parser  (2015-01-18 17:50:54 -05:00)
3658d3876b  Rename deprecated config option  (2015-01-18 17:50:54 -05:00)

    The new version works in CherryPy 3.2.
022b50950f  Support using a higher initial nrows in bulkdata, for tests  (2015-01-18 17:49:52 -05:00)

    This gives an easy way to get large values in the database start_pos
    and end_pos fields, which is necessary for testing failure modes when
    those get too large (e.g. on 32-bit systems).  Adjust tests to make
    use of this knob.
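As a hedged usage sketch (the constructor call and the path are assumed from the kwargs handling in the bulkdata diff below, not taken from the repo's docs):

    from nilmdb.server import bulkdata
    # Start row numbering near 2**40 so that every start_pos/end_pos
    # recorded in the database is far too large for a 32-bit long.
    data = bulkdata.BulkData("/tmp/testdb", initial_nrows = 2**40)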
e5efbadc8e  fsck: row indices are too big for slice.indices, so calculate manually  (2015-01-18 16:36:56 -05:00)

    Normally, indices for an array are expected to fit in a platform's
    native long (32 or 64 bits).  In nilmdb, tables aren't real arrays,
    and we need to handle unbounded indices.
74f633c9da  Distribute was merged back into setuptools, so use setuptools  (2015-01-18 16:33:58 -05:00)
8 changed files with 57 additions and 31 deletions

View File

@@ -425,11 +425,15 @@ class Fsck(object):
         for intv in ints:
             last_ts = None
             (stime, etime, spos, epos) = intv
             if spos == epos:
                 continue
-            for start in xrange(*slice(spos, epos, maxrows).indices(epos)):
+            # Break interval into maxrows-sized chunks
+            next_start = spos
+            while next_start < epos:
+                start = next_start
                 stop = min(start + maxrows, epos)
                 count = stop - start
+                next_start = stop
                 # Get raw data, convert to NumPy array
                 try:
                     raw = tab.get_data(start, stop, binary = True)
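A standalone sketch of the new chunking loop with hypothetical values; on a 32-bit build the old one-liner, slice(spos, epos, maxrows).indices(epos), raises OverflowError once epos exceeds sys.maxsize, while the explicit loop below handles arbitrarily large ints:

    spos, epos, maxrows = 2**40, 2**40 + 50000, 23000
    next_start = spos
    while next_start < epos:
        start = next_start
        stop = min(start + maxrows, epos)
        count = stop - start
        next_start = stop
        print(start, stop, count)   # chunks of at most maxrows rows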

View File

@@ -43,6 +43,12 @@ class BulkData(object):
         # 32768 files per dir should work even on FAT32
         self.files_per_dir = 32768

+        if "initial_nrows" in kwargs:
+            self.initial_nrows = kwargs["initial_nrows"]
+        else:
+            # First row is 0
+            self.initial_nrows = 0
+
         # Make root path
         if not os.path.isdir(self.root):
             os.mkdir(self.root)
@@ -254,7 +260,7 @@ class BulkData(object):
         path = self._encode_filename(unicodepath)
         elements = path.lstrip('/').split('/')
         ospath = os.path.join(self.root, *elements)
-        return Table(ospath)
+        return Table(ospath, self.initial_nrows)

 @nilmdb.utils.must_close(wrap_verify = False)
 class Table(object):
@@ -291,9 +297,10 @@ class Table(object):
         pickle.dump(fmt, f, 2)

     # Normal methods
-    def __init__(self, root):
+    def __init__(self, root, initial_nrows):
         """'root' is the full OS path to the directory of this table"""
         self.root = root
+        self.initial_nrows = initial_nrows

         # Load the format
         with open(os.path.join(self.root, "_format"), "rb") as f:
@@ -353,8 +360,14 @@ class Table(object):
             # Convert to row number
             return self._row_from_offset(subdir, filename, offset)

-        # No files, so no data
-        return 0
+        # No files, so no data.  We typically start at row 0 in this
+        # case, although initial_nrows is specified during some tests
+        # to exercise other parts of the code better.  Since we have
+        # no files yet, round initial_nrows up so it points to a row
+        # that would begin a new file.
+        nrows = ((self.initial_nrows + (self.rows_per_file - 1)) //
+                 self.rows_per_file) * self.rows_per_file
+        return nrows

     def _offset_from_row(self, row):
         """Return a (subdir, filename, offset, count) tuple:

View File

@@ -23,7 +23,6 @@ from nilmdb.server.errors import NilmDBError, StreamError, OverlapError
 import sqlite3
 import os
 import errno
-import bisect

 # Note about performance and transactions:
 #
@@ -516,6 +515,17 @@ class NilmDB(object):
         # And that's all
         return

+    def _bisect_left(self, a, x, lo, hi):
+        # Like bisect.bisect_left, but doesn't choke on large indices on
+        # 32-bit systems, like bisect's fast C implementation does.
+        while lo < hi:
+            mid = (lo + hi) / 2
+            if a[mid] < x:
+                lo = mid + 1
+            else:
+                hi = mid
+        return lo
+
     def _find_start(self, table, dbinterval):
         """
         Given a DBInterval, find the row in the database that
@@ -526,10 +536,10 @@ class NilmDB(object):
         # Optimization for the common case where an interval wasn't truncated
         if dbinterval.start == dbinterval.db_start:
             return dbinterval.db_startpos
-        return bisect.bisect_left(table,
-                                  dbinterval.start,
-                                  dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+        return self._bisect_left(table,
+                                 dbinterval.start,
+                                 dbinterval.db_startpos,
+                                 dbinterval.db_endpos)

     def _find_end(self, table, dbinterval):
         """
@@ -545,10 +555,10 @@ class NilmDB(object):
         # want to include the given timestamp in the results.  This is
         # so queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
         # non-overlapping data.
-        return bisect.bisect_left(table,
-                                  dbinterval.end,
-                                  dbinterval.db_startpos,
-                                  dbinterval.db_endpos)
+        return self._bisect_left(table,
+                                 dbinterval.end,
+                                 dbinterval.db_startpos,
+                                 dbinterval.db_endpos)

     def stream_extract(self, path, start = None, end = None,
                        count = False, markup = False, binary = False):
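As a sanity check (hypothetical harness, not a test from the repo), the pure-Python search agrees with the C implementation on ordinary in-memory lists:

    import bisect

    def bisect_left_py(a, x, lo, hi):
        # Same algorithm as _bisect_left above, with // so it also runs
        # unchanged under Python 3.
        while lo < hi:
            mid = (lo + hi) // 2
            if a[mid] < x:
                lo = mid + 1
            else:
                hi = mid
        return lo

    a = [10, 20, 20, 30]
    for x in (5, 10, 20, 25, 35):
        assert bisect_left_py(a, x, 0, len(a)) == bisect.bisect_left(a, x)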

View File

@@ -429,7 +429,7 @@ class Server(object):
         cherrypy.config.update({
             'server.socket_host': host,
             'server.socket_port': port,
-            'engine.autoreload_on': False,
+            'engine.autoreload.on': False,
             'server.max_request_body_size': 8*1024*1024,
             })
         if self.embedded:
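For reference, the renamed setting in isolation (a minimal standalone sketch, not the server code above): CherryPy 3.2 configures engine plugins through dotted keys, and the flat key is the deprecated spelling.

    import cherrypy
    # Old, deprecated spelling: 'engine.autoreload_on': False
    cherrypy.config.update({'engine.autoreload.on': False})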

View File

@@ -87,7 +87,7 @@ def parse_time(toparse):
     try:
         return unix_to_timestamp(datetime_tz.datetime_tz.
                                  smartparse(toparse).totimestamp())
-    except (ValueError, OverflowError):
+    except (ValueError, OverflowError, TypeError):
         pass

     # If it's parseable as a float, treat it as a Unix or NILM
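With TypeError now swallowed, an input that makes some dateutil versions blow up inside smartparse falls through to the numeric branches below instead of crashing the caller. A hedged sketch (the behavior is inferred from the comment that follows, not verified against every dateutil release):

    import nilmdb.utils.time
    # A bare NILM timestamp (microseconds) is not a calendar date; it
    # should end up handled by the float-parsing path, not raise.
    print(nilmdb.utils.time.parse_time("1332561600000000"))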

View File

@@ -6,15 +6,6 @@
 # Then just package it up:
 # python setup.py sdist

-# This is supposed to be using Distribute:
-#
-#   distutils provides a "setup" method.
-#   setuptools is a set of monkeypatches on top of that.
-#   distribute is a particular version/implementation of setuptools.
-#
-# So we don't really know if this is using the old setuptools or the
-# Distribute-provided version of setuptools.
-
 import traceback
 import sys
 import os
@@ -109,7 +100,7 @@ setup(name='nilmdb',
                          'coverage',
                          'numpy',
                          ],
-      setup_requires = [ 'distribute',
+      setup_requires = [ 'setuptools',
                          ],
       install_requires = [ 'decorator',
                            'cherrypy >= 3.2',

View File

@@ -834,9 +834,12 @@ class TestCmdline(object):
     def test_13_files(self):
         # Test BulkData's ability to split into multiple files,
         # by forcing the file size to be really small.
+        # Also increase the initial nrows, so that start/end positions
+        # in the database are very large (> 32 bit)
         server_stop()
         server_start(bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })

         # Fill data
         self.ok("create /newton/prep float32_8")
@@ -888,7 +891,8 @@ class TestCmdline(object):
         server_stop()
         server_start(max_removals = 4321,
                      bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })
         self.do_remove_files()
         self.ok("destroy -R /newton/prep") # destroy again
@@ -897,7 +901,8 @@ class TestCmdline(object):
         server_stop()
         server_start(max_int_removals = 1,
                      bulkdata_args = { "file_size" : 920, # 23 rows per file
-                                       "files_per_dir" : 3 })
+                                       "files_per_dir" : 3,
+                                       "initial_nrows" : 2**40 })
         self.do_remove_files()

     def do_remove_files(self):

View File

@@ -1,5 +1,6 @@
 import nilmdb
 from nilmdb.utils.printf import *
+from nilmdb.utils import datetime_tz

 from nose.tools import *
 from nose.tools import assert_raises
@@ -19,6 +20,8 @@ class TestTimestamper(object):
         def join(list):
             return "\n".join(list) + "\n"

+        datetime_tz.localtz_set("America/New_York")
+
         start = nilmdb.utils.time.parse_time("03/24/2012")
         lines_in = [ "hello", "world", "hello world", "# commented out" ]
         lines_out = [ "1332561600000000 hello",
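Pinning the zone is what makes the expected output stable; a small verification sketch of the first expected value (imports as in the diff above):

    from nilmdb.utils import datetime_tz
    import nilmdb.utils.time

    datetime_tz.localtz_set("America/New_York")
    # Midnight 2012-03-24 in America/New_York is EDT (UTC-4), i.e.
    # 04:00 UTC = Unix 1332561600; nilmdb timestamps are microseconds.
    assert nilmdb.utils.time.parse_time("03/24/2012") == 1332561600000000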