|
- # -*- coding: utf-8 -*-
-
- """NilmDB
-
- Object that represents a NILM database file.
-
- Manages both the SQL database and the PyTables storage backend.
- """
-
- # Need absolute_import so that "import nilmdb" won't pull in nilmdb.py,
- # but will pull the nilmdb module instead.
- from __future__ import absolute_import
- import nilmdb
- from nilmdb.printf import *
-
- import sqlite3
- import tables
- import time
- import sys
- import os
- import errno
- import bisect
-
- import pyximport
- pyximport.install()
- from nilmdb.interval import Interval, DBInterval, IntervalSet, IntervalError
-
- # Note about performance and transactions:
- #
- # Committing a transaction in the default sync mode (PRAGMA synchronous=FULL)
- # takes about 125msec. sqlite3 will commit transactions at 3 times:
- # 1: explicit con.commit()
- # 2: between a series of DML commands and non-DML commands, e.g.
- # after a series of INSERT, SELECT, but before a CREATE TABLE or PRAGMA.
- # 3: at the end of an explicit transaction, e.g. "with self.con as con:"
- #
- # To speed up testing, or if this transaction speed becomes an issue,
- # the sync=False option to NilmDB.__init__ will set PRAGMA synchronous=OFF.
-
-
# Schema migration scripts, keyed by the SQLite "PRAGMA user_version"
# value at which each script should be applied.  _sql_schema_update()
# runs them in order and bumps user_version after each one.
# Don't touch old entries -- just add new ones.
_sql_schema_updates = {
    0: """
    -- All streams
    CREATE TABLE streams(
        id INTEGER PRIMARY KEY, -- stream ID
        path TEXT UNIQUE NOT NULL, -- path, e.g. '/newton/prep'
        layout TEXT NOT NULL -- layout name, e.g. float32_8
    );

    -- Individual timestamped ranges in those streams.
    -- For a given start_time and end_time, this tells us that the
    -- data is stored between start_pos and end_pos.
    -- Times are stored as μs since Unix epoch
    -- Positions are opaque: PyTables rows, file offsets, etc.
    --
    -- Note: end_pos points to the row _after_ end_time, so end_pos-1
    -- is the last valid row.
    CREATE TABLE ranges(
        stream_id INTEGER NOT NULL,
        start_time INTEGER NOT NULL,
        end_time INTEGER NOT NULL,
        start_pos INTEGER NOT NULL,
        end_pos INTEGER NOT NULL
    );
    CREATE INDEX _ranges_index ON ranges (stream_id, start_time, end_time);
    """,

    1: """
    -- Generic dictionary-type metadata that can be associated with a stream
    CREATE TABLE metadata(
        stream_id INTEGER NOT NULL,
        key TEXT NOT NULL,
        value TEXT
    );
    """,
}
-
class NilmDBError(Exception):
    """Base exception for NilmDB errors.

    The stored message is prefixed with the name of the concrete
    subclass, e.g. "StreamError: No stream at path /foo".
    """
    def __init__(self, message = "Unspecified error"):
        prefixed = self.__class__.__name__ + ": " + message
        Exception.__init__(self, prefixed)
-
class StreamError(NilmDBError):
    """Error concerning a particular stream, e.g. no stream exists at
    the given path, or no data was provided for an insert."""
-
class OverlapError(NilmDBError):
    """Raised when newly inserted data overlaps existing data."""
-
# Helper that lets us pass a Pytables table into bisect
class BisectableTable(object):
    """Adapter exposing only the first column (the timestamp) of each
    row, so a table can be searched with the bisect module."""
    def __init__(self, table):
        self.table = table
    def __getitem__(self, index):
        row = self.table[index]
        return row[0]
-
class NilmDB(object):
    """Object that represents a NILM database file.

    Manages both the SQL database (stream list, metadata, and the
    time -> row-position "ranges" index) and the PyTables HDF5 storage
    backend (the actual rows of data).
    """
    # Set nonzero to print schema-update progress. Class attribute so
    # it can be flipped globally for debugging.
    verbose = 0

    def __init__(self, basepath, sync=True, max_results=None):
        """
        basepath: database directory; created if it doesn't exist.
        sync: if False, use PRAGMA synchronous=OFF instead of FULL
              (see the performance note at the top of this module).
        max_results: approximate largest number of elements to return
              in a single stream_intervals/stream_extract reply;
              defaults to 16384.
        """
        # set up path
        self.basepath = os.path.abspath(basepath.rstrip('/'))

        # Create the database path if it doesn't exist
        try:
            os.makedirs(self.basepath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise IOError("can't create tree " + self.basepath)

        # Our HD5 file goes inside it
        h5filename = os.path.abspath(self.basepath + "/data.h5")
        self.h5file = tables.openFile(h5filename, "a", "NILM Database")

        # SQLite database too
        sqlfilename = os.path.abspath(self.basepath + "/data.sql")
        # We use check_same_thread = False, assuming that the rest
        # of the code (e.g. Server) will be smart and not access this
        # database from multiple threads simultaneously.  That requirement
        # may be relaxed later.
        self.con = sqlite3.connect(sqlfilename, check_same_thread = False)
        self._sql_schema_update()

        # See big comment at top about the performance implications of this
        if sync:
            self.con.execute("PRAGMA synchronous=FULL")
        else:
            self.con.execute("PRAGMA synchronous=OFF")

        # Approximate largest number of elements that we want to send
        # in a single reply (for stream_intervals, stream_extract)
        if max_results:
            self.max_results = max_results
        else:
            self.max_results = 16384

        # Marker checked in __del__; deleted by close().
        self.opened = True

        # Cached IntervalSets, keyed by stream_id (see _get_intervals)
        self._cached_iset = {}

    def __del__(self):
        # Complain if close() was never called; "opened" only exists
        # between __init__ and close().
        if "opened" in self.__dict__: # pragma: no cover
            fprintf(sys.stderr,
                    "error: NilmDB.close() wasn't called, path %s",
                    self.basepath)

    def get_basepath(self):
        """Return the absolute database directory path."""
        return self.basepath

    def close(self):
        """Commit and close the SQL database and the HDF5 file."""
        if self.con:
            self.con.commit()
            self.con.close()
        self.h5file.close()
        del self.opened

    def _sql_schema_update(self):
        """Apply any pending scripts from _sql_schema_updates, using
        SQLite's user_version pragma to track schema version."""
        cur = self.con.cursor()
        version = cur.execute("PRAGMA user_version").fetchone()[0]
        oldversion = version

        while version in _sql_schema_updates:
            cur.executescript(_sql_schema_updates[version])
            version = version + 1
            if self.verbose: # pragma: no cover
                printf("Schema updated to %d\n", version)

        if version != oldversion:
            with self.con:
                cur.execute("PRAGMA user_version = {v:d}".format(v=version))

    def _get_intervals(self, stream_id):
        """
        Return a mutable IntervalSet corresponding to the given stream ID.
        """
        # Load from database if not cached
        if stream_id not in self._cached_iset:
            iset = IntervalSet()
            result = self.con.execute("SELECT start_time, end_time, "
                                      "start_pos, end_pos "
                                      "FROM ranges "
                                      "WHERE stream_id=?", (stream_id,))
            try:
                for (start_time, end_time, start_pos, end_pos) in result:
                    iset += DBInterval(start_time, end_time,
                                       start_time, end_time,
                                       start_pos, end_pos)
            except IntervalError: # pragma: no cover
                raise NilmDBError("unexpected overlap in ranges table!")
            self._cached_iset[stream_id] = iset
        # Return cached value
        return self._cached_iset[stream_id]

    # TODO: Split add_interval into two pieces, one to add
    # and one to flush to disk?
    # Need to think about this.  Basic problem is that we can't
    # mess with intervals once they're in the IntervalSet,
    # without mucking with bxinterval internals.

    # Maybe add a separate optimization step?
    # Join intervals that have a fairly small gap between them

    def _add_interval(self, stream_id, interval, start_pos, end_pos):
        """
        Add interval to the internal interval cache, and to the database.
        Note: arguments must be ints (not numpy.int64, etc)
        """
        # Ensure this stream's intervals are cached, and add the new
        # interval to that cache.
        iset = self._get_intervals(stream_id)
        try:
            iset += DBInterval(interval.start, interval.end,
                               interval.start, interval.end,
                               start_pos, end_pos)
        except IntervalError: # pragma: no cover
            raise NilmDBError("new interval overlaps existing data")

        # Insert into the database
        self.con.execute("INSERT INTO ranges "
                         "(stream_id,start_time,end_time,start_pos,end_pos) "
                         "VALUES (?,?,?,?,?)",
                         (stream_id, interval.start, interval.end,
                          int(start_pos), int(end_pos)))
        self.con.commit()

    def stream_list(self, path = None, layout = None):
        """Return list of [path, layout] lists of all streams
        in the database.

        If path is specified, include only streams with a path that
        matches the given string.

        If layout is specified, include only streams with a layout
        that matches the given string.
        """
        # "WHERE 1=1" lets the optional filters all be appended as
        # " AND ..." without special-casing the first one.
        where = "WHERE 1=1"
        params = ()
        if layout:
            where += " AND layout=?"
            params += (layout,)
        if path:
            where += " AND path=?"
            params += (path,)
        result = self.con.execute("SELECT path, layout "
                                  "FROM streams " + where, params).fetchall()

        return sorted(list(x) for x in result)

    def stream_intervals(self, path, start = None, end = None):
        """
        Returns (intervals, restart) tuple.

        intervals is a list of [start,end] timestamps of all intervals
        that exist for path, between start and end.

        restart, if nonzero, means that there were too many results to
        return in a single request.  The data is complete from the
        starting timestamp to the point at which it was truncated,
        and a new request with a start time of 'restart' will fetch
        the next block of data.
        """
        stream_id = self._stream_id(path)
        intervals = self._get_intervals(stream_id)
        # Compare against None explicitly, so that a start or end of
        # exactly 0 (a valid timestamp) isn't treated as "unbounded".
        if start is None:
            start = 0
        if end is None:
            end = 1e12
        requested = Interval(start, end)
        result = []
        for n, i in enumerate(intervals.intersection(requested)):
            if n >= self.max_results:
                restart = i.start
                break
            result.append([i.start, i.end])
        else:
            restart = 0
        return (result, restart)

    def stream_create(self, path, layout_name):
        """Create a new table in the database.

        path: path to the data (e.g. '/newton/prep').
        Paths must contain at least two elements, e.g.:
          /newton/prep
          /newton/raw
          /newton/upstairs/prep
          /newton/upstairs/raw

        layout_name: string for nilmdb.layout.get_named(), e.g. 'float32_8'
        """
        if path[0] != '/':
            raise ValueError("paths must start with /")
        [ group, node ] = path.rsplit("/", 1)
        if group == '':
            raise ValueError("invalid path")

        # Get description
        try:
            desc = nilmdb.layout.get_named(layout_name).description()
        except KeyError:
            raise ValueError("no such layout")

        # Estimated table size (for PyTables optimization purposes): assume
        # 3 months worth of data at 8 KHz.  It's OK if this is wrong.
        exp_rows = 8000 * 60*60*24*30*3

        # Create the table
        table = self.h5file.createTable(group, node,
                                        description = desc,
                                        expectedrows = exp_rows,
                                        createparents = True)

        # Insert into SQL database once the PyTables is happy
        with self.con as con:
            con.execute("INSERT INTO streams (path, layout) VALUES (?,?)",
                        (path, layout_name))

    def _stream_id(self, path):
        """Return unique stream ID, raising StreamError if the path
        doesn't exist."""
        result = self.con.execute("SELECT id FROM streams WHERE path=?",
                                  (path,)).fetchone()
        if result is None:
            raise StreamError("No stream at path " + path)
        return result[0]

    def stream_set_metadata(self, path, data):
        """Set stream metadata from a dictionary, e.g.
           { description = 'Downstairs lighting',
             v_scaling = 123.45 }
        This replaces all existing metadata.  Keys with an empty
        string value are dropped rather than stored.
        """
        stream_id = self._stream_id(path)
        with self.con as con:
            con.execute("DELETE FROM metadata "
                        "WHERE stream_id=?", (stream_id,))
            for key in data:
                if data[key] != '':
                    con.execute("INSERT INTO metadata VALUES (?, ?, ?)",
                                (stream_id, key, data[key]))

    def stream_get_metadata(self, path):
        """Return stream metadata as a dictionary."""
        stream_id = self._stream_id(path)
        result = self.con.execute("SELECT metadata.key, metadata.value "
                                  "FROM metadata "
                                  "WHERE metadata.stream_id=?", (stream_id,))
        data = {}
        for (key, value) in result:
            data[key] = value
        return data

    def stream_update_metadata(self, path, newdata):
        """Update stream metadata from a dictionary, merging newdata
        over any existing keys."""
        data = self.stream_get_metadata(path)
        data.update(newdata)
        self.stream_set_metadata(path, data)

    def stream_insert(self, path, parser, old_timestamp = None):
        """Insert new data into the database.
        path: Path at which to add the data
        parser: nilmdb.layout.Parser instance full of data to insert
        old_timestamp: max_timestamp of the previous stream_insert in
        a contiguous series of inserts, if any.

        Raises StreamError if the parser holds no data, and
        OverlapError if the new data overlaps existing data.
        """
        # Compare the timestamps against None explicitly: a timestamp
        # of exactly 0 (the Unix epoch) is valid data but falsy, and
        # would otherwise be rejected here.
        if (parser.min_timestamp is None or parser.max_timestamp is None or
            not len(parser.data)):
            raise StreamError("no data provided")

        # If we were provided with an old timestamp, the expectation
        # is that the client has a contiguous block of time it is sending,
        # but it's doing it over multiple calls to stream_insert.
        # old_timestamp is the max_timestamp of the previous insert.
        # To make things continuous, use that as our starting timestamp
        # instead of what the parser found.
        if old_timestamp is not None:
            min_timestamp = old_timestamp
        else:
            min_timestamp = parser.min_timestamp

        # First check for basic overlap using timestamp info given.
        stream_id = self._stream_id(path)
        iset = self._get_intervals(stream_id)
        interval = Interval(min_timestamp, parser.max_timestamp)
        if iset.intersects(interval):
            raise OverlapError("new data overlaps existing data: "
                               + str(iset & interval))

        # Insert the data into pytables
        table = self.h5file.getNode(path)
        row_start = table.nrows
        table.append(parser.data)
        row_end = table.nrows
        table.flush()

        # Insert the record into the sql database.
        # Casts are to convert from numpy.int64.
        self._add_interval(stream_id, interval, int(row_start), int(row_end))

        # And that's all
        return "ok"

    def _find_start(self, table, interval):
        """
        Given a DBInterval, find the row in the database that
        corresponds to the start time.  Return the first database
        position with a timestamp (first element) greater than or
        equal to 'start'.
        """
        # Optimization for the common case where an interval wasn't truncated
        if interval.start == interval.db_start:
            return interval.db_startpos
        return bisect.bisect_left(BisectableTable(table),
                                  interval.start,
                                  interval.db_startpos,
                                  interval.db_endpos)

    def _find_end(self, table, interval):
        """
        Given a DBInterval, find the row in the database that follows
        the end time.  Return the first database position after the
        row with timestamp (first element) greater than or equal
        to 'end'.
        """
        # Optimization for the common case where an interval wasn't truncated
        if interval.end == interval.db_end:
            return interval.db_endpos
        # Note that we still use bisect_left here, because we don't
        # want to include the given timestamp in the results.  This is
        # so a queries like 1:00 -> 2:00 and 2:00 -> 3:00 return
        # non-overlapping data.
        return bisect.bisect_left(BisectableTable(table),
                                  interval.end,
                                  interval.db_startpos,
                                  interval.db_endpos)

    def stream_extract(self, path, start = None, end = None, count = False):
        """
        Returns (data, restart) tuple.

        data is a list of raw data from the database, suitable for
        passing to e.g. nilmdb.layout.Formatter to translate into
        textual form.

        restart, if nonzero, means that there were too many results to
        return in a single request.  The data is complete from the
        starting timestamp to the point at which it was truncated,
        and a new request with a start time of 'restart' will fetch
        the next block of data.

        count, if true, means to not return raw data, but just the count
        of rows that would have been returned.  This is much faster
        than actually fetching the data.  It is not limited by
        max_results.  Note that in this case the return value is just
        the count, not a (data, restart) tuple.
        """
        table = self.h5file.getNode(path)
        stream_id = self._stream_id(path)
        intervals = self._get_intervals(stream_id)
        # Compare against None explicitly, so that a start or end of
        # exactly 0 (a valid timestamp) isn't treated as "unbounded".
        if start is None:
            start = 0
        if end is None:
            end = 1e12
        requested = Interval(start, end)
        result = []
        matched = 0
        remaining = self.max_results
        restart = 0
        for interval in intervals.intersection(requested):
            # Reading single rows from the table is too slow, so
            # we use two bisections to find both the starting and
            # ending row for this particular interval, then
            # read the entire range as one slice.
            row_start = self._find_start(table, interval)
            row_end = self._find_end(table, interval)

            if count:
                matched += row_end - row_start
                continue

            # Shorten it if we'll hit the maximum number of results
            row_max = row_start + remaining
            if row_max < row_end:
                row_end = row_max
                restart = table[row_max][0]

            # Gather these results up
            result.extend(table[row_start:row_end])

            # Count them
            remaining -= row_end - row_start

            if restart:
                break

        if count:
            return matched
        return (result, restart)
|