Compare commits
4 Commits
nilmtools-... to nilmtools-...

Commits (SHA1):
  9c5f07106d
  62e11a11c0
  2bdcee2c36
  6dce8c5296
Makefile (4 changed lines)

@@ -11,10 +11,14 @@ endif
 test: test_trainola

 test_trainola:
+	-nilmtool -u http://bucket/nilmdb remove -s min -e max \
+		/sharon/prep-a-matches
+	nilmtools/trainola.py "$$(cat extras/trainola-test-param-2.js)"
 	-nilmtool -u http://bucket/nilmdb remove -s min -e max \
 		/sharon/prep-a-matches
 	nilmtools/trainola.py "$$(cat extras/trainola-test-param.js)"

 test_cleanup:
 	nilmtools/cleanup.py -e extras/cleanup.cfg
 	nilmtools/cleanup.py extras/cleanup.cfg
README.txt

@@ -5,7 +5,7 @@ by Jim Paris <jim@jtan.com>
 Prerequisites:

   # Runtime and build environments
-  sudo apt-get install python2.7 python2.7-dev python-setuptools python-pip
+  sudo apt-get install python2.7 python2.7-dev python-setuptools
   sudo apt-get install python-numpy python-scipy

 nilmdb (1.8.1+)
extras/trainola-test-param-2.js (new file, 29 lines)

@@ -0,0 +1,29 @@
+{ "columns" : [ { "index" : 0, "name" : "P1" },
+                { "index" : 1, "name" : "Q1" },
+                { "index" : 2, "name" : "P3" } ],
+  "stream" : "/sharon/prep-a",
+  "url" : "http://bucket.mit.edu/nilmdb",
+  "dest_stream" : "/sharon/prep-a-matches",
+  "start" : 1365153062643133.5,
+  "end" : 1365168814443575.5,
+  "exemplars" : [ { "columns" : [ { "index" : 0,
+                                    "name" : "P1"
+                                  } ],
+                    "dest_column" : 0,
+                    "end" : 1365073657682000,
+                    "name" : "Turn ON",
+                    "start" : 1365073654321000,
+                    "stream" : "/sharon/prep-a",
+                    "url" : "http://bucket.mit.edu/nilmdb"
+                  },
+                  { "columns" : [ { "index" : 2, "name" : "P3" },
+                                  { "index" : 0, "name" : "P1" } ],
+                    "dest_column" : 1,
+                    "end" : 1365176528818000,
+                    "name" : "Type 2 turn ON",
+                    "start" : 1365176520030000,
+                    "stream" : "/sharon/prep-a",
+                    "url" : "http://bucket.mit.edu/nilmdb"
+                  }
+                ]
+}
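The Makefile target above passes the entire contents of this file as a single command-line argument via "$$(cat ...)". As a rough, hypothetical driver (not part of the change), the equivalent call from Python looks like the sketch below, assuming the file is parsed with simplejson (as the "import simplejson as json" in main() further down suggests) and the resulting dictionary is handed to trainola():

    # Hypothetical driver: load the parameter file and run the matcher the
    # same way the Makefile invocation does.
    import simplejson as json
    import nilmtools.trainola

    with open("extras/trainola-test-param-2.js") as f:
        conf = json.load(f)              # the JSON shown above

    nilmtools.trainola.trainola(conf)    # matches both exemplars, writes to dest_stream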
nilmtools/trainola.py

@@ -6,6 +6,7 @@ import nilmtools.filter
 from nilmdb.utils.time import (timestamp_to_human,
                                timestamp_to_seconds,
                                seconds_to_timestamp)
+from nilmdb.utils import datetime_tz
 from nilmdb.utils.interval import Interval

 import numpy as np
@@ -15,6 +16,7 @@ from numpy.core.umath_tests import inner1d
 import nilmrun
 from collections import OrderedDict
 import sys
+import time
 import functools
 import collections

@@ -26,12 +28,12 @@ def build_column_mapping(colinfo, streaminfo):
     pull out a dictionary mapping for the column names/numbers."""
     columns = OrderedDict()
     for c in colinfo:
-        if (c['name'] in columns.keys() or
-                c['index'] in columns.values()):
+        col_num = c['index'] + 1    # skip timestamp
+        if (c['name'] in columns.keys() or col_num in columns.values()):
             raise DataError("duplicated columns")
         if (c['index'] < 0 or c['index'] >= streaminfo.layout_count):
             raise DataError("bad column number")
-        columns[c['name']] = c['index']
+        columns[c['name']] = col_num
     if not len(columns):
         raise DataError("no columns")
     return columns
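The effect of this hunk is that the stored column numbers now index directly into raw extracted rows, which keep the timestamp in column 0. A toy illustration of the new mapping (values made up, not from the source):

    from collections import OrderedDict

    colinfo = [ { "index" : 0, "name" : "P1" },
                { "index" : 2, "name" : "P3" } ]

    columns = OrderedDict()
    for c in colinfo:
        col_num = c['index'] + 1     # skip the timestamp in column 0
        columns[c['name']] = col_num

    print(columns)    # OrderedDict([('P1', 1), ('P3', 3)])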
@@ -52,6 +54,9 @@ class Exemplar(object):
         # Get stream info
         self.client = nilmdb.client.numpyclient.NumpyClient(self.url)
         self.info = nilmtools.filter.get_stream_info(self.client, self.stream)
+        if not self.info:
+            raise DataError(sprintf("exemplar stream '%s' does not exist " +
+                                    "on server '%s'", self.stream, self.url))

         # Build up name => index mapping for the columns
         self.columns = build_column_mapping(exinfo['columns'], self.info)
@@ -74,10 +79,17 @@ class Exemplar(object):
                                              maxrows = self.count)
         self.data = list(datagen)[0]

-        # Discard timestamp
-        self.data = self.data[:,1:]
+        # Extract just the columns that were specified in self.columns,
+        # skipping the timestamp.
+        extract_columns = [ value for (key, value) in self.columns.items() ]
+        self.data = self.data[:,extract_columns]

-        # Subtract the mean from each column
+        # Fix the column indices in e.columns, since we removed/reordered
+        # columns in self.data
+        for n, k in enumerate(self.columns):
+            self.columns[k] = n
+
+        # Subtract the means from each column
         self.data = self.data - self.data.mean(axis=0)

         # Get scale factors for each column by computing dot product
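Instead of dropping only the timestamp, the exemplar now keeps just the requested columns and renumbers them 0..N-1 so later lookups match the reduced array. A small numpy sketch of that extraction and reindexing, on made-up data:

    import numpy as np
    from collections import OrderedDict

    data = np.arange(12.0).reshape(3, 4)             # col 0 = timestamp, cols 1-3 = values
    columns = OrderedDict([('P1', 1), ('P3', 3)])    # as built by build_column_mapping

    extract_columns = [ value for (key, value) in columns.items() ]
    data = data[:, extract_columns]                  # keep only P1 and P3, drop timestamp

    for n, k in enumerate(columns):                  # renumber to match the reduced array
        columns[k] = n
    print(columns)    # OrderedDict([('P1', 0), ('P3', 1)])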
@@ -117,6 +129,10 @@ def peak_detect(data, delta):
             lookformax = True
     return (mins, maxs)

+def timestamp_to_short_human(timestamp):
+    dt = datetime_tz.datetime_tz.fromtimestamp(timestamp_to_seconds(timestamp))
+    return dt.strftime("%H:%M:%S")
+
 def trainola_matcher(data, interval, args, insert_func, final_chunk):
     """Perform cross-correlation match"""
     ( src_columns, dest_count, exemplars ) = args
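The new helper is used by the progress line added further down to tag each processed chunk with a wall-clock time of day. A rough standalone approximation, assuming nilmdb timestamps are microseconds since the epoch (the real helper goes through nilmdb's datetime_tz and timestamp_to_seconds):

    import datetime

    def timestamp_to_short_human(timestamp):
        # assumption: timestamp is in microseconds since the epoch
        dt = datetime.datetime.fromtimestamp(timestamp / 1e6)
        return dt.strftime("%H:%M:%S")

    print(timestamp_to_short_human(1365153062643133.5))   # e.g. "06:31:02", local time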
@@ -138,7 +154,7 @@ def trainola_matcher(data, interval, args, insert_func, final_chunk):

         # Compute cross-correlation for each column
         for col_name in e.columns:
-            a = data[:, src_columns[col_name] + 1]
+            a = data[:, src_columns[col_name]]
             b = e.data[:, e.columns[col_name]]
             corr = scipy.signal.fftconvolve(a, np.flipud(b), 'valid')[0:valid]

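With the column renumbering above, src_columns already includes the +1 timestamp offset, so keeping the extra "+ 1" here would have double-shifted the source column. For reference, a toy sketch (made-up signals, not from the source) of the fftconvolve-based correlation this loop performs against a mean-subtracted exemplar:

    import numpy as np
    import scipy.signal

    a = np.array([0., 0., 1., 5., 9., 10., 10., 10.])   # one source data column
    b = np.array([1., 5., 9.])                           # exemplar column
    b = b - b.mean()                                     # exemplars are mean-subtracted

    valid = len(a) - len(b) + 1
    corr = scipy.signal.fftconvolve(a, np.flipud(b), 'valid')[0:valid]
    print(np.argmax(corr))    # 2: the lag where the exemplar lines up best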
@@ -183,7 +199,10 @@ def trainola_matcher(data, interval, args, insert_func, final_chunk):
     insert_func(out)

     # Return how many rows we processed
-    return max(valid, 0)
+    valid = max(valid, 0)
+    printf(" [%s] matched %d exemplars in %d rows\n",
+           timestamp_to_short_human(data[0][0]), np.sum(out[:,1:]), valid)
+    return valid

 def trainola(conf):
     print "Trainola", nilmtools.__version__
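The new progress line counts matches by summing everything except the timestamp column of the output block; this assumes (consistent with the matcher's existing output layout) that column 0 holds timestamps and each match sets a 1 in its destination column. A tiny made-up example of that count:

    import numpy as np

    # Assumed layout: column 0 = timestamps, remaining columns get a 1 per match.
    out = np.array([[ 1365153062000000., 0., 0. ],
                    [ 1365153062008333., 1., 0. ],
                    [ 1365153062016666., 0., 1. ]])
    print(int(np.sum(out[:, 1:])))    # 2 exemplar matches in this block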
@@ -247,14 +266,20 @@ def trainola(conf):
         src.path, layout = src.layout, maxrows = rows)
     inserter = functools.partial(dest_client.stream_insert_numpy_context,
                                  dest.path)
+    start = time.time()
+    processed_time = 0
+    printf("Processing intervals:\n")
     for interval in intervals:
-        printf("Processing interval:\n")
         printf("%s\n", interval.human_string())
         nilmtools.filter.process_numpy_interval(
             interval, extractor, inserter, rows * 3,
             trainola_matcher, (src_columns, dest.layout_count, exemplars))
+        processed_time += (timestamp_to_seconds(interval.end) -
+                           timestamp_to_seconds(interval.start))
+    elapsed = max(time.time() - start, 1e-3)

-    return "done"
+    printf("Done. Processed %.2f seconds per second.\n",
+           processed_time / elapsed)

 def main(argv = None):
     import simplejson as json
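The replacement for the bare return "done" reports throughput as seconds of data processed per wall-clock second, with the elapsed time floored at one millisecond to avoid division by zero. A minimal sketch of the same bookkeeping with made-up intervals (the real code sums interval bounds converted by timestamp_to_seconds):

    import time

    start = time.time()
    processed_time = 0.0
    for (ival_start, ival_end) in [ (0.0, 120.0), (500.0, 560.0) ]:   # interval bounds, seconds
        # ... process_numpy_interval(...) would run here ...
        processed_time += ival_end - ival_start

    elapsed = max(time.time() - start, 1e-3)
    print("Done. Processed %.2f seconds per second." % (processed_time / elapsed))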