|
- # -*- coding: utf-8 -*-
-
- """NilmDB
-
- Object that represents a NILM database file.
-
- Manages both the SQL database and the PyTables storage backend.
- """
-
- # Need absolute_import so that "import nilmdb" won't pull in nilmdb.py,
- # but will pull the nilmdb module instead.
- from __future__ import absolute_import
- import nilmdb
- from nilmdb.printf import *
-
- import sqlite3
- import tables
- import time
- import sys
- import os
- import errno
- import bisect
-
- import pyximport
- pyximport.install()
- from nilmdb.interval import Interval, DBInterval, IntervalSet, IntervalError
-
- # Note about performance and transactions:
- #
- # Committing a transaction in the default sync mode (PRAGMA synchronous=FULL)
- # takes about 125msec. sqlite3 will commit transactions at 3 times:
- # 1: explicit con.commit()
- # 2: between a series of DML commands and non-DML commands, e.g.
- # after a series of INSERT, SELECT, but before a CREATE TABLE or PRAGMA.
- # 3: at the end of an explicit transaction, e.g. "with self.con as con:"
- #
- # To speed up testing, or if this transaction speed becomes an issue,
- # the sync=False option to NilmDB.__init__ will set PRAGMA synchronous=OFF.
-
-
# Schema migration scripts, keyed by the SQLite "PRAGMA user_version"
# value at which each script should be applied.  _sql_schema_update()
# runs them in order and bumps user_version after each one.
# Don't touch old entries -- just add new ones.
_sql_schema_updates = {
    0: """
    -- All streams
    CREATE TABLE streams(
        id INTEGER PRIMARY KEY, -- stream ID
        path TEXT UNIQUE NOT NULL, -- path, e.g. '/newton/prep'
        layout TEXT NOT NULL -- layout name, e.g. float32_8
    );

    -- Individual timestamped ranges in those streams.
    -- For a given start_time and end_time, this tells us that the
    -- data is stored between start_pos and end_pos.
    -- Times are stored as μs since Unix epoch
    -- Positions are opaque: PyTables rows, file offsets, etc.
    --
    -- Note: end_pos points to the row _after_ end_time, so end_pos-1
    -- is the last valid row.
    CREATE TABLE ranges(
        stream_id INTEGER NOT NULL,
        start_time INTEGER NOT NULL,
        end_time INTEGER NOT NULL,
        start_pos INTEGER NOT NULL,
        end_pos INTEGER NOT NULL
    );
    CREATE INDEX _ranges_index ON ranges (stream_id, start_time, end_time);
    """,

    1: """
    -- Generic dictionary-type metadata that can be associated with a stream
    CREATE TABLE metadata(
        stream_id INTEGER NOT NULL,
        key TEXT NOT NULL,
        value TEXT
    );
    """,
}
-
class NilmDBError(Exception):
    """Base exception for NilmDB errors.

    The stored message is prefixed with the name of the concrete
    subclass, e.g. "StreamError: No stream at path /foo".
    """
    def __init__(self, message = "Unspecified error"):
        prefixed = self.__class__.__name__ + ": " + message
        Exception.__init__(self, prefixed)
-
class StreamError(NilmDBError):
    """Error concerning a particular stream, e.g. no stream exists at
    the given path, or no data was provided for an insert."""
-
class OverlapError(NilmDBError):
    """Raised when newly inserted data overlaps existing data."""
-
# Helper that lets us pass a Pytables table into bisect
class BisectableTable(object):
    """Adapter exposing only the first column (the timestamp) of each
    row, so a table can be searched with the bisect module."""
    def __init__(self, table):
        self.table = table
    def __getitem__(self, index):
        row = self.table[index]
        return row[0]
-
class NilmDB(object):
    """Object that represents a NILM database file.

    Manages both the SQL database (stream list, metadata, and the
    time -> row-position "ranges" index) and the PyTables HDF5 storage
    backend (the actual rows of data).
    """
    # Set nonzero to print schema-update progress. Class attribute so
    # it can be flipped globally for debugging.
    verbose = 0

    def __init__(self, basepath, sync=True, max_results=None):
        """
        basepath: database directory; created if it doesn't exist.
        sync: if False, use PRAGMA synchronous=OFF instead of FULL
              (see the performance note at the top of this module).
        max_results: approximate largest number of elements to return
              in a single stream_intervals/stream_extract reply;
              defaults to 16384.
        """
        # set up path
        self.basepath = os.path.abspath(basepath.rstrip('/'))

        # Create the database path if it doesn't exist
        try:
            os.makedirs(self.basepath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise IOError("can't create tree " + self.basepath)

        # Our HD5 file goes inside it
        h5filename = os.path.abspath(self.basepath + "/data.h5")
        self.h5file = tables.openFile(h5filename, "a", "NILM Database")

        # SQLite database too
        sqlfilename = os.path.abspath(self.basepath + "/data.sql")
        # We use check_same_thread = False, assuming that the rest
        # of the code (e.g. Server) will be smart and not access this
        # database from multiple threads simultaneously.  That requirement
        # may be relaxed later.
        self.con = sqlite3.connect(sqlfilename, check_same_thread = False)
        self._sql_schema_update()

        # See big comment at top about the performance implications of this
        if sync:
            self.con.execute("PRAGMA synchronous=FULL")
        else:
            self.con.execute("PRAGMA synchronous=OFF")

        # Approximate largest number of elements that we want to send
        # in a single reply (for stream_intervals, stream_extract)
        if max_results:
            self.max_results = max_results
        else:
            self.max_results = 16384

        # Marker checked in __del__; deleted by close().
        self.opened = True

        # Cached IntervalSets, keyed by stream_id (see _get_intervals)
        self._cached_iset = {}

    def __del__(self):
        # Complain if close() was never called; "opened" only exists
        # between __init__ and close().
        if "opened" in self.__dict__: # pragma: no cover
            fprintf(sys.stderr,
                    "error: NilmDB.close() wasn't called, path %s",
                    self.basepath)

    def get_basepath(self):
        """Return the absolute database directory path."""
        return self.basepath

    def close(self):
        """Commit and close the SQL database and the HDF5 file."""
        if self.con:
            self.con.commit()
            self.con.close()
        self.h5file.close()
        del self.opened

    def _sql_schema_update(self):
        """Apply any pending scripts from _sql_schema_updates, using
        SQLite's user_version pragma to track schema version."""
        cur = self.con.cursor()
        version = cur.execute("PRAGMA user_version").fetchone()[0]
        oldversion = version

        while version in _sql_schema_updates:
            cur.executescript(_sql_schema_updates[version])
            version = version + 1
            if self.verbose: # pragma: no cover
                printf("Schema updated to %d\n", version)

        if version != oldversion:
            with self.con:
                cur.execute("PRAGMA user_version = {v:d}".format(v=version))

    def _get_intervals(self, stream_id):
        """
        Return a mutable IntervalSet corresponding to the given stream ID.
        """
        # Load from database if not cached
        if stream_id not in self._cached_iset:
            iset = IntervalSet()
            result = self.con.execute("SELECT start_time, end_time, "
                                      "start_pos, end_pos "
                                      "FROM ranges "
                                      "WHERE stream_id=?", (stream_id,))
            try:
                for (start_time, end_time, start_pos, end_pos) in result:
                    iset += DBInterval(start_time, end_time,
                                       start_time, end_time,
                                       start_pos, end_pos)
            except IntervalError: # pragma: no cover
                raise NilmDBError("unexpected overlap in ranges table!")
            self._cached_iset[stream_id] = iset
        # Return cached value
        return self._cached_iset[stream_id]

    # TODO: Split add_interval into two pieces, one to add
    # and one to flush to disk?
    # Need to think about this.  Basic problem is that we can't
    # mess with intervals once they're in the IntervalSet,
    # without mucking with bxinterval internals.

    # Maybe add a separate optimization step?
    # Join intervals that have a fairly small gap between them

    def _add_interval(self, stream_id, interval, start_pos, end_pos):
        """
        Add interval to the internal interval cache, and to the database.
        Note: arguments must be ints (not numpy.int64, etc)
        """
        # Ensure this stream's intervals are cached, and add the new
        # interval to that cache.
        iset = self._get_intervals(stream_id)
        try:
            iset += DBInterval(interval.start, interval.end,
                               interval.start, interval.end,
                               start_pos, end_pos)
        except IntervalError: # pragma: no cover
            raise NilmDBError("new interval overlaps existing data")

        # Insert into the database
        self.con.execute("INSERT INTO ranges "
                         "(stream_id,start_time,end_time,start_pos,end_pos) "
                         "VALUES (?,?,?,?,?)",
                         (stream_id, interval.start, interval.end,
                          int(start_pos), int(end_pos)))
        self.con.commit()

    def stream_list(self, path = None, layout = None):
        """Return list of [path, layout] lists of all streams
        in the database.

        If path is specified, include only streams with a path that
        matches the given string.

        If layout is specified, include only streams with a layout
        that matches the given string.
        """
        # "WHERE 1=1" lets the optional filters all be appended as
        # " AND ..." without special-casing the first one.
        where = "WHERE 1=1"
        params = ()
        if layout:
            where += " AND layout=?"
            params += (layout,)
        if path:
            where += " AND path=?"
            params += (path,)
        result = self.con.execute("SELECT path, layout "
                                  "FROM streams " + where, params).fetchall()

        return sorted(list(x) for x in result)

    def stream_intervals(self, path, start = None, end = None):
        """
        Returns (intervals, restart) tuple.

        intervals is a list of [start,end] timestamps of all intervals
        that exist for path, between start and end.

        restart, if nonzero, means that there were too many results to
        return in a single request.  The data is complete from the
        starting timestamp to the point at which it was truncated,
        and a new request with a start time of 'restart' will fetch
        the next block of data.
        """
        stream_id = self._stream_id(path)
        intervals = self._get_intervals(stream_id)
        # Compare against None explicitly, so that a start or end of
        # exactly 0 (a valid timestamp) isn't treated as "unbounded".
        if start is None:
            start = 0
        if end is None:
            end = 1e12
        requested = Interval(start, end)
        result = []
        for n, i in enumerate(intervals.intersection(requested)):
            if n >= self.max_results:
                restart = i.start
                break
            result.append([i.start, i.end])
        else:
            restart = 0
        return (result, restart)

    def stream_create(self, path, layout_name):
        """Create a new table in the database.

        path: path to the data (e.g. '/newton/prep').
        Paths must contain at least two elements, e.g.:
          /newton/prep
          /newton/raw
          /newton/upstairs/prep
          /newton/upstairs/raw

        layout_name: string for nilmdb.layout.get_named(), e.g. 'float32_8'
        """
        if path[0] != '/':
            raise ValueError("paths must start with /")
        [ group, node ] = path.rsplit("/", 1)
        if group == '':
            raise ValueError("invalid path")

        # Get description
        try:
            desc = nilmdb.layout.get_named(layout_name).description()
        except KeyError:
            raise ValueError("no such layout")

        # Estimated table size (for PyTables optimization purposes): assume
        # 3 months worth of data at 8 KHz.  It's OK if this is wrong.
        exp_rows = 8000 * 60*60*24*30*3

        # Create the table
        table = self.h5file.createTable(group, node,
                                        description = desc,
                                        expectedrows = exp_rows,
                                        createparents = True)

        # Insert into SQL database once the PyTables is happy
        with self.con as con:
            con.execute("INSERT INTO streams (path, layout) VALUES (?,?)",
                        (path, layout_name))

    def _stream_id(self, path):
        """Return unique stream ID, raising StreamError if the path
        doesn't exist."""
        result = self.con.execute("SELECT id FROM streams WHERE path=?",
                                  (path,)).fetchone()
        if result is None:
            raise StreamError("No stream at path " + path)
        return result[0]

    def stream_set_metadata(self, path, data):
        """Set stream metadata from a dictionary, e.g.
           { description = 'Downstairs lighting',
             v_scaling = 123.45 }
        This replaces all existing metadata.  Keys with an empty
        string value are dropped rather than stored.
        """
        stream_id = self._stream_id(path)
        with self.con as con:
            con.execute("DELETE FROM metadata "
                        "WHERE stream_id=?", (stream_id,))
            for key in data:
                if data[key] != '':
                    con.execute("INSERT INTO metadata VALUES (?, ?, ?)",
                                (stream_id, key, data[key]))

    def stream_get_metadata(self, path):
        """Return stream metadata as a dictionary."""
        stream_id = self._stream_id(path)
        result = self.con.execute("SELECT metadata.key, metadata.value "
                                  "FROM metadata "
                                  "WHERE metadata.stream_id=?", (stream_id,))
        data = {}
        for (key, value) in result:
            data[key] = value
        return data

    def stream_update_metadata(self, path, newdata):
        """Update stream metadata from a dictionary, merging newdata
        over any existing keys."""
        data = self.stream_get_metadata(path)
        data.update(newdata)
        self.stream_set_metadata(path, data)

    def stream_insert(self, path, parser, old_timestamp = None):
        """Insert new data into the database.
        path: Path at which to add the data
        parser: nilmdb.layout.Parser instance full of data to insert
        old_timestamp: max_timestamp of the previous stream_insert in
        a contiguous series of inserts, if any.

        Raises StreamError if the parser holds no data, and
        OverlapError if the new data overlaps existing data.
        """
        # Compare the timestamps against None explicitly: a timestamp
        # of exactly 0 (the Unix epoch) is valid data but falsy, and
        # would otherwise be rejected here.
        if (parser.min_timestamp is None or parser.max_timestamp is None or
            not len(parser.data)):
            raise StreamError("no data provided")

        # If we were provided with an old timestamp, the expectation
        # is that the client has a contiguous block of time it is sending,
        # but it's doing it over multiple calls to stream_insert.
        # old_timestamp is the max_timestamp of the previous insert.
        # To make things continuous, use that as our starting timestamp
        # instead of what the parser found.
        if old_timestamp is not None:
            min_timestamp = old_timestamp
        else:
            min_timestamp = parser.min_timestamp

        # First check for basic overlap using timestamp info given.
        stream_id = self._stream_id(path)
        iset = self._get_intervals(stream_id)
        interval = Interval(min_timestamp, parser.max_timestamp)
        if iset.intersects(interval):
            raise OverlapError("new data overlaps existing data: "
                               + str(iset & interval))

        # Insert the data into pytables
        table = self.h5file.getNode(path)
        row_start = table.nrows
        table.append(parser.data)
        row_end = table.nrows
        table.flush()

        # Insert the record into the sql database.
        # Casts are to convert from numpy.int64.
        self._add_interval(stream_id, interval, int(row_start), int(row_end))

        # And that's all
        return "ok"

    def _find_start(self, table, interval):
        """
        Given a DBInterval, find the row in the database that
        corresponds to the start time.  Return the first database
        position with a timestamp (first element) greater than or
        equal to 'start'.
        """
        # Optimization for the common case where an interval wasn't truncated
        if interval.start == interval.db_start:
            return interval.db_startpos
        return bisect.bisect_left(BisectableTable(table),
                                  interval.start,
                                  interval.db_startpos,
                                  interval.db_endpos)

    def _find_end(self, table, interval):
        """
        Given a DBInterval, find the row in the database that follows
        the end time.  Return the first database position after the
        row with timestamp (first element) greater than or equal
        to 'end'.
        """
        # Optimization for the common case where an interval wasn't truncated
        if interval.end == interval.db_end:
            return interval.db_endpos
        # Note that we still use bisect_left here, because we don't
        # want to include the given timestamp in the results.  This is
        # so a queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
        # non-overlapping data.
        return bisect.bisect_left(BisectableTable(table),
                                  interval.end,
                                  interval.db_startpos,
                                  interval.db_endpos)

    def stream_extract(self, path, start = None, end = None, count = False):
        """
        Returns (data, restart) tuple.

        data is a list of raw data from the database, suitable for
        passing to e.g. nilmdb.layout.Formatter to translate into
        textual form.

        restart, if nonzero, means that there were too many results to
        return in a single request.  The data is complete from the
        starting timestamp to the point at which it was truncated,
        and a new request with a start time of 'restart' will fetch
        the next block of data.

        count, if true, means to not return raw data, but just the count
        of rows that would have been returned.  This is much faster
        than actually fetching the data.  It is not limited by
        max_results.  Note that in this case the return value is just
        the count, not a (data, restart) tuple.
        """
        table = self.h5file.getNode(path)
        stream_id = self._stream_id(path)
        intervals = self._get_intervals(stream_id)
        # Compare against None explicitly, so that a start or end of
        # exactly 0 (a valid timestamp) isn't treated as "unbounded".
        if start is None:
            start = 0
        if end is None:
            end = 1e12
        requested = Interval(start, end)
        result = []
        matched = 0
        remaining = self.max_results
        restart = 0
        for interval in intervals.intersection(requested):
            # Reading single rows from the table is too slow, so
            # we use two bisections to find both the starting and
            # ending row for this particular interval, then
            # read the entire range as one slice.
            row_start = self._find_start(table, interval)
            row_end = self._find_end(table, interval)

            if count:
                matched += row_end - row_start
                continue

            # Shorten it if we'll hit the maximum number of results
            row_max = row_start + remaining
            if row_max < row_end:
                row_end = row_max
                restart = table[row_max][0]

            # Gather these results up
            result.extend(table[row_start:row_end])

            # Count them
            remaining -= row_end - row_start

            if restart:
                break

        if count:
            return matched
        return (result, restart)
|