Compare commits
35 Commits
nilmtools-...nilmtools-
SHA1:
25c35a56f6
d610deaef0
d7d5ccc9a7
f28753ff5c
c9c2e0d5a8
5a2a32bec5
706c3933f9
cfd1719152
c62fb45980
57d856f2fa
5d83d93019
5f847a0513
29cd7eb6c7
62c8af41ea
4f6bc48619
cf9eb0ed48
32066fc260
739da3f973
83ad18ebf6
c76d527f95
b8a73278e7
ce0691d6c4
4da658e960
8ab31eafc2
979ab13bff
f4fda837ae
5547d266d0
372e977e4a
640a680704
2e74e6cd63
de2a794e00
065a40f265
65fa43aff1
57c23c3792
d4c8e4acb4
Makefile (36 changed lines)
@@ -8,22 +8,30 @@ else
 	@echo "Try 'make install'"
 endif
 
-test: test_cleanup
+test: test_trainola
+
+test_trainola:
+	-nilmtool -u http://bucket/nilmdb remove -s min -e max \
+		/sharon/prep-a-matches
+	nilmtools/trainola.py "$$(cat extras/trainola-test-param.js)"
 
 test_cleanup:
-	src/cleanup.py -e extras/cleanup.cfg
-	src/cleanup.py extras/cleanup.cfg
+	nilmtools/cleanup.py -e extras/cleanup.cfg
+	nilmtools/cleanup.py extras/cleanup.cfg
 
 test_insert:
 	@make install >/dev/null
-	src/insert.py --file --dry-run /test/foo </dev/null
+	nilmtools/insert.py --file --dry-run /test/foo </dev/null
 
 test_copy:
 	@make install >/dev/null
-	src/copy_wildcard.py -U "http://nilmdb.com/bucket/" -D /lees*
+	nilmtools/copy_wildcard.py -U "http://nilmdb.com/bucket/" -D /lees*
 
-test_prep:
+/tmp/raw.dat:
+	octave --eval 'fs = 8000;' \
+	       --eval 't = (0:fs*10)*2*pi*60/fs;' \
+	       --eval 'raw = transpose([sin(t); 0.3*sin(3*t)+sin(t)]);' \
+	       --eval 'save("-ascii","/tmp/raw.dat","raw");'
+
+test_prep: /tmp/raw.dat
 	@make install >/dev/null
 	-nilmtool destroy -R /test/raw
 	-nilmtool destroy -R /test/sinefit
 	-nilmtool destroy -R /test/prep
@@ -31,8 +39,8 @@ test_prep:
 	nilmtool create /test/sinefit float32_3
 	nilmtool create /test/prep float32_8
 	nilmtool insert -s '@0' -t -r 8000 /test/raw /tmp/raw.dat
-	src/sinefit.py -c 1 /test/raw /test/sinefit
-	src/prep.py -c 2 /test/raw /test/sinefit /test/prep
+	nilmtools/sinefit.py -a 0.5 -c 1 /test/raw /test/sinefit
+	nilmtools/prep.py -c 2 /test/raw /test/sinefit /test/prep
 	nilmtool extract -s min -e max /test/prep | head -20
 
 test_decimate:
@@ -40,8 +48,8 @@ test_decimate:
 	-@nilmtool destroy /lees-compressor/no-leak/raw/16 || true
 	-@nilmtool create /lees-compressor/no-leak/raw/4 float32_18 || true
 	-@nilmtool create /lees-compressor/no-leak/raw/16 float32_18 || true
-	time python src/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/no-leak/raw/1 /lees-compressor/no-leak/raw/4
-	python src/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/no-leak/raw/4 /lees-compressor/no-leak/raw/16
+	time python nilmtools/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/no-leak/raw/1 /lees-compressor/no-leak/raw/4
+	python nilmtools/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/no-leak/raw/4 /lees-compressor/no-leak/raw/16
 
 version:
 	python setup.py version
@@ -63,4 +71,4 @@ clean::
 gitclean::
 	git clean -dXf
 
-.PHONY: all version dist sdist install clean gitclean
+.PHONY: all version dist sdist install clean gitclean test
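The new /tmp/raw.dat rule generates ten seconds of a 60 Hz test signal sampled at 8 kHz: column 1 is a pure sine, column 2 adds a 30% third harmonic. A rough Python equivalent of the Octave one-liner, for reference (illustrative only, not part of the commit):

    import numpy as np

    fs = 8000
    t = np.arange(fs * 10 + 1) * 2 * np.pi * 60 / fs    # phase ramp for 60 Hz at fs
    raw = np.column_stack([np.sin(t),                    # pure fundamental
                           0.3 * np.sin(3 * t) + np.sin(t)])  # plus third harmonic
    np.savetxt("/tmp/raw.dat", raw)                      # ASCII, like save("-ascii", ...)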
README
@@ -5,10 +5,10 @@ by Jim Paris <jim@jtan.com>
 Prerequisites:
 
 	# Runtime and build environments
-	sudo apt-get install python2.7 python2.7-dev python-setuptools
-	sudo apt-get install python-numpy python-scipy python-matplotlib
+	sudo apt-get install python2.7 python2.7-dev python-setuptools python-pip
+	sudo apt-get install python-numpy python-scipy
 
-	nilmdb (1.5.0+)
+	nilmdb (1.8.1+)
 
 Install:
extras/trainola-test-param.js (new file, 31 lines)
@@ -0,0 +1,31 @@
{ "url": "http://bucket.mit.edu/nilmdb",
  "dest_stream": "/sharon/prep-a-matches",
  "stream": "/sharon/prep-a",
  "start": 1366111383280463,
  "end": 1366126163457797,
  "columns": [ { "name": "P1", "index": 0 },
               { "name": "Q1", "index": 1 },
               { "name": "P3", "index": 2 } ],
  "exemplars": [
    { "name": "Boiler Pump ON",
      "url": "http://bucket.mit.edu/nilmdb",
      "stream": "/sharon/prep-a",
      "start": 1366260494269078,
      "end": 1366260608185031,
      "dest_column": 0,
      "columns": [ { "name": "P1", "index": 0 },
                   { "name": "Q1", "index": 1 }
                 ]
    },
    { "name": "Boiler Pump OFF",
      "url": "http://bucket.mit.edu/nilmdb",
      "stream": "/sharon/prep-a",
      "start": 1366260864215764,
      "end": 1366260870882998,
      "dest_column": 1,
      "columns": [ { "name": "P1", "index": 0 },
                   { "name": "Q1", "index": 1 }
                 ]
    }
  ]
}
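This parameter file is what the new test_trainola Makefile target feeds to trainola.py. Since trainola's main() (added below) accepts either a JSON string or an already-parsed dictionary, a sketch of driving it from Python might look like this (assuming the file above is on disk and nilmtools is installed):

    import simplejson as json
    import nilmtools.trainola

    with open("extras/trainola-test-param.js") as f:
        conf = json.load(f)               # parsed dict, as NilmRun would pass it
    nilmtools.trainola.trainola(conf)     # or: nilmtools.trainola.main([conf])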
nilmtools/_version.py
@@ -181,7 +181,7 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False)
 
 tag_prefix = "nilmtools-"
 parentdir_prefix = "nilmtools-"
-versionfile_source = "src/_version.py"
+versionfile_source = "nilmtools/_version.py"
 
 def get_versions(default={"version": "unknown", "full": ""}, verbose=False):
     variables = { "refnames": git_refnames, "full": git_full }
nilmtools/cleanup.py
@@ -238,12 +238,15 @@ def main(argv = None):
                        timestamp_to_seconds(total)))
             continue
         printf("  removing data before %s\n", timestamp_to_human(remove_before))
-        if args.yes:
-            client.stream_remove(path, None, remove_before)
-        for ap in streams[path].also_clean_paths:
-            printf("  also removing from %s\n", ap)
+        # Clean in reverse order.  Since we only use the primary stream and not
+        # the decimated streams to figure out which data to remove, removing
+        # the primary stream last means that we might recover more nicely if
+        # we are interrupted and restarted.
+        clean_paths = list(reversed(streams[path].also_clean_paths)) + [ path ]
+        for p in clean_paths:
+            printf("  removing from %s\n", p)
             if args.yes:
-                client.stream_remove(ap, None, remove_before)
+                client.stream_remove(p, None, remove_before)
 
     # All done
     if not args.yes:
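To make the reverse ordering concrete, a small sketch with hypothetical stream paths:

    # Hypothetical example: suppose the config yields
    #   path = "/sharon/raw"
    #   also_clean_paths = ["/sharon/raw~decim-4", "/sharon/raw~decim-16"]
    # then the new code removes data in this order:
    #   "/sharon/raw~decim-16", "/sharon/raw~decim-4", "/sharon/raw"
    # If the run is interrupted, the primary stream (which drives the
    # "what to remove" decision) is still intact, so a restart re-derives
    # the same removal range and finishes the decimated streams first.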
nilmtools/decimate_auto.py
@@ -4,15 +4,19 @@ import nilmtools.filter
 import nilmtools.decimate
 import nilmdb.client
 import argparse
+import fnmatch
 
 def main(argv = None):
     parser = argparse.ArgumentParser(
         formatter_class = argparse.RawDescriptionHelpFormatter,
-        version = "1.0",
+        version = nilmtools.__version__,
         description = """\
     Automatically create multiple decimations from a single source
     stream, continuing until the last decimated level contains fewer
     than 500 points total.
+
+    Wildcards and multiple paths are accepted.  Decimated paths are
+    ignored when matching wildcards.
     """)
     parser.add_argument("-u", "--url", action="store",
                         default="http://localhost/nilmdb/",
@@ -23,20 +27,36 @@ def main(argv = None):
                         default = False,
                         help="Force metadata changes if the dest "
                         "doesn't match")
-    parser.add_argument("path", action="store",
+    parser.add_argument("path", action="store", nargs='+',
                         help='Path of base stream')
     args = parser.parse_args(argv)
 
-    # Pull out info about the base stream
     client = nilmdb.client.Client(args.url)
 
-    info = nilmtools.filter.get_stream_info(client, args.path)
-    if not info:
-        raise Exception("path " + args.path + " not found")
+    # Find list of paths to process
+    streams = [ unicode(s[0]) for s in client.stream_list() ]
+    streams = [ s for s in streams if "~decim-" not in s ]
+    paths = []
+    for path in args.path:
+        new = fnmatch.filter(streams, unicode(path))
+        if not new:
+            print "error: no stream matched path:", path
+            raise SystemExit(1)
+        paths.extend(new)
 
-    meta = client.stream_get_metadata(args.path)
+    for path in paths:
+        do_decimation(client, args, path)
+
+def do_decimation(client, args, path):
+    print "Decimating", path
+    info = nilmtools.filter.get_stream_info(client, path)
+    if not info:
+        raise Exception("path " + path + " not found")
+
+    meta = client.stream_get_metadata(path)
     if "decimate_source" in meta:
-        print "Stream", args.path, "was decimated from", meta["decimate_source"]
+        print "Stream", path, "was decimated from", meta["decimate_source"]
         print "You need to pass the base stream instead"
         raise SystemExit(1)
@@ -53,7 +73,7 @@ def main(argv = None):
         if info.rows <= 500:
             break
         factor *= args.factor
-        new_path = "%s~decim-%d" % (args.path, factor)
+        new_path = "%s~decim-%d" % (path, factor)
 
         # Create the stream if needed
         new_info = nilmtools.filter.get_stream_info(client, new_path)
@@ -72,5 +92,7 @@ def main(argv = None):
         # Update info using the newly decimated stream
         info = nilmtools.filter.get_stream_info(client, new_path)
 
+    return
+
 if __name__ == "__main__":
     main()
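The wildcard handling above filters out decimated streams before matching, so a pattern like /test/* cannot accidentally select /test/raw~decim-4. A small sketch of that logic in isolation (stream names are examples):

    import fnmatch

    streams = [u"/test/raw", u"/test/raw~decim-4", u"/sharon/prep-a"]
    streams = [s for s in streams if u"~decim-" not in s]   # drop decimated paths
    print fnmatch.filter(streams, u"/test/*")               # -> [u'/test/raw']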
nilmtools/filter.py
@@ -19,6 +19,10 @@ import re
 import argparse
 import numpy as np
 import cStringIO
+import functools
+
+class ArgumentError(Exception):
+    pass
 
 class MissingDestination(Exception):
     def __init__(self, args, src, dest):
@@ -65,9 +69,73 @@ def get_stream_info(client, path):
         return None
     return StreamInfo(client.geturl(), streams[0])
 
+# Filter processing for a single interval of data.
+def process_numpy_interval(interval, extractor, inserter, warn_rows,
+                           function, args = None):
+    """For the given 'interval' of data, extract data, process it
+    through 'function', and insert the result.
+
+    'extractor' should be a function like NumpyClient.stream_extract_numpy
+    but with the the interval 'start' and 'end' as the only parameters,
+    e.g.:
+      extractor = functools.partial(NumpyClient.stream_extract_numpy,
+                                    src_path, layout = l, maxrows = m)
+
+    'inserter' should be a function like NumpyClient.stream_insert_context
+    but with the interval 'start' and 'end' as the only parameters, e.g.:
+      inserter = functools.partial(NumpyClient.stream_insert_context,
+                                   dest_path)
+
+    If 'warn_rows' is not None, print a warning to stdout when the
+    number of unprocessed rows exceeds this amount.
+
+    See process_numpy for details on 'function' and 'args'.
+    """
+    if args is None:
+        args = []
+
+    with inserter(interval.start, interval.end) as insert_ctx:
+        insert_func = insert_ctx.insert
+        old_array = np.array([])
+        for new_array in extractor(interval.start, interval.end):
+            # If we still had old data left, combine it
+            if old_array.shape[0] != 0:
+                array = np.vstack((old_array, new_array))
+            else:
+                array = new_array
+
+            # Pass the data to the user provided function
+            processed = function(array, interval, args, insert_func, False)
+
+            # Send any pending data that the user function inserted
+            insert_ctx.send()
+
+            # Save the unprocessed parts
+            if processed >= 0:
+                old_array = array[processed:]
+            else:
+                raise Exception(
+                    sprintf("%s return value %s must be >= 0",
+                            str(function), str(processed)))
+
+            # Warn if there's too much data remaining
+            if warn_rows is not None and old_array.shape[0] > warn_rows:
+                printf("warning: %d unprocessed rows in buffer\n",
+                       old_array.shape[0])
+
+        # Last call for this contiguous interval
+        if old_array.shape[0] != 0:
+            processed = function(old_array, interval, args,
+                                 insert_func, True)
+            if processed != old_array.shape[0]:
+                # Truncate the interval we're inserting at the first
+                # unprocessed data point.  This ensures that
+                # we'll not miss any data when we run again later.
+                insert_ctx.update_end(old_array[processed][0])
+
 class Filter(object):
 
-    def __init__(self):
+    def __init__(self, parser_description = None):
        self._parser = None
        self._client_src = None
        self._client_dest = None
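A minimal sketch of calling process_numpy_interval directly, following its docstring (the URL, stream paths, and layout are examples; the inserter uses stream_insert_numpy_context, as the in-tree callers do):

    import functools
    import nilmdb.client.numpyclient
    import nilmdb.utils.interval
    import nilmtools.filter

    client = nilmdb.client.numpyclient.NumpyClient("http://localhost/nilmdb/")
    extractor = functools.partial(client.stream_extract_numpy, "/test/raw",
                                  layout = "float32_8", maxrows = 100000)
    inserter = functools.partial(client.stream_insert_numpy_context, "/test/copy")

    def passthrough(data, interval, args, insert_func, final):
        insert_func(data)        # insert everything we were given
        return data.shape[0]     # report all rows as processed

    # Process every interval present in /test/raw but missing from /test/copy:
    for (s, e) in client.stream_intervals("/test/raw", diffpath = "/test/copy"):
        nilmtools.filter.process_numpy_interval(
            nilmdb.utils.interval.Interval(s, e), extractor, inserter,
            300000, passthrough)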
@@ -78,6 +146,9 @@ class Filter(object):
         self.end = None
         self.interhost = False
         self.force_metadata = False
+        if parser_description is not None:
+            self.setup_parser(parser_description)
+            self.parse_args()
 
     @property
     def client_src(self):
@@ -131,63 +202,52 @@ class Filter(object):
         self._parser = parser
         return parser
 
-    def interval_string(self, interval):
-        return sprintf("[ %s -> %s ]",
-                       timestamp_to_human(interval.start),
-                       timestamp_to_human(interval.end))
-
-    def parse_args(self, argv = None):
-        args = self._parser.parse_args(argv)
-
-        if args.dest_url is None:
-            args.dest_url = args.url
-        if args.url != args.dest_url:
+    def set_args(self, url, dest_url, srcpath, destpath, start, end,
+                 parsed_args = None, quiet = True):
+        """Set arguments directly from parameters"""
+        if dest_url is None:
+            dest_url = url
+        if url != dest_url:
             self.interhost = True
 
-        self._client_src = Client(args.url)
-        self._client_dest = Client(args.dest_url)
+        self._client_src = Client(url)
+        self._client_dest = Client(dest_url)
 
-        if (not self.interhost) and (args.srcpath == args.destpath):
-            self._parser.error("source and destination path must be different")
+        if (not self.interhost) and (srcpath == destpath):
+            raise ArgumentError("source and destination path must be different")
 
-        # Open and print info about the streams
-        self.src = get_stream_info(self._client_src, args.srcpath)
+        # Open the streams
+        self.src = get_stream_info(self._client_src, srcpath)
         if not self.src:
-            self._parser.error("source path " + args.srcpath + " not found")
+            raise ArgumentError("source path " + srcpath + " not found")
 
-        self.dest = get_stream_info(self._client_dest, args.destpath)
+        self.dest = get_stream_info(self._client_dest, destpath)
         if not self.dest:
-            raise MissingDestination(args, self.src,
-                                     StreamInfo(args.dest_url, [args.destpath]))
+            raise MissingDestination(parsed_args, self.src,
+                                     StreamInfo(dest_url, [destpath]))
 
-        print "Source:", self.src.string(self.interhost)
-        print "  Dest:", self.dest.string(self.interhost)
+        self.start = start
+        self.end = end
 
-        if args.dry_run:
-            for interval in self.intervals():
-                print self.interval_string(interval)
-            raise SystemExit(0)
+        # Print info
+        if not quiet:
+            print "Source:", self.src.string(self.interhost)
+            print "  Dest:", self.dest.string(self.interhost)
+
+    def parse_args(self, argv = None):
+        """Parse arguments from a command line"""
+        args = self._parser.parse_args(argv)
+
+        self.set_args(args.url, args.dest_url, args.srcpath, args.destpath,
+                      args.start, args.end, quiet = False, parsed_args = args)
+
+        self.force_metadata = args.force_metadata
+
+        if args.dry_run:
+            for interval in self.intervals():
+                print interval.human_string()
+            raise SystemExit(0)
         return args
 
-    def _optimize_int(self, it):
-        """Join and yield adjacent intervals from the iterator 'it'"""
-        saved_int = None
-        for interval in it:
-            if saved_int is not None:
-                if saved_int.end == interval.start:
-                    interval.start = saved_int.start
-                else:
-                    yield saved_int
-            saved_int = interval
-        if saved_int is not None:
-            yield saved_int
-
     def intervals(self):
         """Generate all the intervals that this filter should process"""
         self._using_client = True
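The set_args() refactoring lets a script configure a Filter without building a command line, raising ArgumentError instead of calling parser.error(). A sketch, with an example URL and paths:

    import nilmtools.filter

    f = nilmtools.filter.Filter()
    f.set_args("http://localhost/nilmdb/", None,     # dest_url None -> same server
               "/test/raw", "/test/out", None, None) # start/end None -> full range
    for interval in f.intervals():
        print "would process", interval.human_string()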
@@ -214,12 +274,13 @@ class Filter(object):
                           self.src.path, diffpath = self.dest.path,
                           start = self.start, end = self.end) )
         # Optimize intervals: join intervals that are adjacent
-        for interval in self._optimize_int(intervals):
+        for interval in nilmdb.utils.interval.optimize(intervals):
             yield interval
         self._using_client = False
 
     # Misc helpers
-    def arg_time(self, toparse):
+    @staticmethod
+    def arg_time(toparse):
         """Parse a time string argument"""
         try:
             return nilmdb.utils.time.parse_time(toparse)
@@ -233,8 +294,14 @@ class Filter(object):
         metadata = self._client_dest.stream_get_metadata(self.dest.path)
         if not self.force_metadata:
             for key in data:
-                wanted = str(data[key])
+                wanted = data[key]
+                if not isinstance(wanted, basestring):
+                    wanted = str(wanted)
                 val = metadata.get(key, wanted)
+                # Force UTF-8 encoding for comparison and display
+                wanted = wanted.encode('utf-8')
+                val = val.encode('utf-8')
+                key = key.encode('utf-8')
                 if val != wanted and self.dest.rows > 0:
                     m = "Metadata in destination stream:\n"
                     m += "  %s = %s\n" % (key, val)
@@ -250,13 +317,16 @@ class Filter(object):
 
     # The main filter processing method.
     def process_numpy(self, function, args = None, rows = 100000):
-        """For all intervals that exist in self.src but don't exist in
-        self.dest, call 'function' with a Numpy array corresponding to
-        the data.  The data is converted to a Numpy array in chunks of
-        'rows' rows at a time.
+        """Calls process_numpy_interval for each interval that currently
+        exists in self.src, but doesn't exist in self.dest.  It will
+        process the data in chunks as follows:
+
+        For each chunk of data, call 'function' with a Numpy array
+        corresponding to the data.  The data is converted to a Numpy
+        array in chunks of 'rows' rows at a time.
 
         'function' should be defined as:
-        def function(data, interval, args, insert_func, final)
+        # def function(data, interval, args, insert_func, final)
 
         'data': array of data to process -- may be empty
 
@@ -275,51 +345,23 @@ class Filter(object):
         Return value of 'function' is the number of data rows processed.
         Unprocessed data will be provided again in a subsequent call
         (unless 'final' is True).
+
+        If unprocessed data remains after 'final' is True, the interval
+        being inserted will be ended at the timestamp of the first
+        unprocessed data point.
         """
         if args is None:
             args = []
         extractor = NumpyClient(self.src.url).stream_extract_numpy
         inserter = NumpyClient(self.dest.url).stream_insert_numpy_context
 
-        for interval in self.intervals():
-            print "Processing", self.interval_string(interval)
-            with inserter(self.dest.path,
-                          interval.start, interval.end) as insert_ctx:
-                insert_function = insert_ctx.insert
-                old_array = np.array([])
-                for new_array in extractor(self.src.path,
-                                           interval.start, interval.end,
-                                           layout = self.src.layout,
-                                           maxrows = rows):
-                    # If we still had old data left, combine it
-                    if old_array.shape[0] != 0:
-                        array = np.vstack((old_array, new_array))
-                    else:
-                        array = new_array
-
-                    # Pass it to the process function
-                    processed = function(array, interval, args,
-                                         insert_function, False)
-
-                    # Send any pending data
-                    insert_ctx.send()
-
-                    # Save the unprocessed parts
-                    if processed >= 0:
-                        old_array = array[processed:]
-                    else:
-                        raise Exception(
-                            sprintf("%s return value %s must be >= 0",
-                                    str(function), str(processed)))
-
-                    # Warn if there's too much data remaining
-                    if old_array.shape[0] > 3 * rows:
-                        printf("warning: %d unprocessed rows in buffer\n",
-                               old_array.shape[0])
-
-                # Last call for this contiguous interval
-                if old_array.shape[0] != 0:
-                    function(old_array, interval, args, insert_function, True)
+        extractor_func = functools.partial(extractor, self.src.path,
+                                           layout = self.src.layout,
+                                           maxrows = rows)
+        inserter_func = functools.partial(inserter, self.dest.path)
+
+        for interval in self.intervals():
+            print "Processing", interval.human_string()
+            process_numpy_interval(interval, extractor_func, inserter_func,
+                                   rows * 3, function, args)
 
 def main(argv = None):
     # This is just a dummy function; actual filters can use the other
@@ -328,7 +370,7 @@ def main(argv = None):
     parser = f.setup_parser()
     args = f.parse_args(argv)
     for i in f.intervals():
-        print "Generic filter: need to handle", f.interval_string(i)
+        print "Generic filter: need to handle", i.human_string()
 
 if __name__ == "__main__":
     main()
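Putting the 'function' contract together, a toy filter built on process_numpy might look like the sketch below (nilmtools/median.py, the next file, is the real in-tree example of the same pattern; the metadata key and scale factor here are made up):

    import nilmtools.filter

    def scale_filter(data, interval, args, insert_func, final):
        (factor,) = args
        data[:, 1:] *= factor      # scale all data columns, leave timestamps
        insert_func(data)
        return data.shape[0]       # every row consumed; nothing carried over

    def main(argv = None):
        f = nilmtools.filter.Filter()
        f.setup_parser("Scale filter (illustrative)")
        args = f.parse_args(argv)
        f.check_dest_metadata({ "scale_source": f.src.path })
        f.process_numpy(scale_filter, args = (2.0,))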
nilmtools/median.py (new executable file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/python
import nilmtools.filter, scipy.signal

def main(argv = None):
    f = nilmtools.filter.Filter()
    parser = f.setup_parser("Median Filter")
    group = parser.add_argument_group("Median filter options")
    group.add_argument("-z", "--size", action="store", type=int, default=25,
                       help = "median filter size (default %(default)s)")
    group.add_argument("-d", "--difference", action="store_true",
                       help = "store difference rather than filtered values")

    try:
        args = f.parse_args(argv)
    except nilmtools.filter.MissingDestination as e:
        print "Source is %s (%s)" % (e.src.path, e.src.layout)
        print "Destination %s doesn't exist" % (e.dest.path)
        print "You could make it with a command like:"
        print "  nilmtool -u %s create %s %s" % (e.dest.url,
                                                 e.dest.path, e.src.layout)
        raise SystemExit(1)

    meta = f.client_src.stream_get_metadata(f.src.path)
    f.check_dest_metadata({ "median_filter_source": f.src.path,
                            "median_filter_size": args.size,
                            "median_filter_difference": repr(args.difference) })

    f.process_numpy(median_filter, args = (args.size, args.difference))

def median_filter(data, interval, args, insert, final):
    (size, diff) = args
    (rows, cols) = data.shape
    for i in range(cols - 1):
        filtered = scipy.signal.medfilt(data[:, i+1], size)
        if diff:
            data[:, i+1] -= filtered
        else:
            data[:, i+1] = filtered
    insert(data)
    return rows

if __name__ == "__main__":
    main()
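With the nilm-median entry point added in setup.py (below), the filter runs like any other nilmtools command. A hypothetical invocation, mirroring the Makefile's command style (the stream paths are examples, and the destination must already exist with the same layout as the source):

    nilmtool -u http://localhost/nilmdb/ create /sharon/prep-a-median float32_8
    nilm-median -u http://localhost/nilmdb/ -z 25 /sharon/prep-a /sharon/prep-a-median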
nilmtools/prep.py
@@ -3,6 +3,8 @@
 # Spectral envelope preprocessor.
 # Requires two streams as input: the original raw data, and sinefit data.
 
+from nilmdb.utils.printf import *
+from nilmdb.utils.time import timestamp_to_human
 import nilmtools.filter
 import nilmdb.client
 from numpy import *
@@ -77,7 +79,8 @@ def main(argv = None):
     # Check and set metadata in prep stream
     f.check_dest_metadata({ "prep_raw_source": f.src.path,
                             "prep_sinefit_source": sinefit.path,
-                            "prep_column": args.column })
+                            "prep_column": args.column,
+                            "prep_rotation": repr(rotation) })
 
     # Run the processing function on all data
     f.process_numpy(process, args = (client_sinefit, sinefit.path, args.column,
@@ -105,7 +108,6 @@ def process(data, interval, args, insert_function, final):
     # Pull out sinefit data for the entire time range of this block
     for sinefit_line in client.stream_extract(sinefit_path,
                                               data[0, 0], data[rows-1, 0]):
-
         def prep_period(t_min, t_max, rot):
             """
             Compute prep coefficients from time t_min to t_max, which
@@ -162,7 +164,15 @@ def process(data, interval, args, insert_function, final):
             break
         processed = idx_max
 
-    print "Processed", processed, "of", rows, "rows"
+    # If we processed no data but there's lots in here, pretend we
+    # processed half of it.
+    if processed == 0 and rows > 10000:
+        processed = rows / 2
+        printf("%s: warning: no periods found; skipping %d rows\n",
+               timestamp_to_human(data[0][0]), processed)
+    else:
+        printf("%s: processed %d of %d rows\n",
+               timestamp_to_human(data[0][0]), processed, rows)
     return processed
 
 if __name__ == "__main__":
nilmtools/sinefit.py (new executable file, 262 lines)
@@ -0,0 +1,262 @@
#!/usr/bin/python

# Sine wave fitting.
from nilmdb.utils.printf import *
import nilmtools.filter
import nilmdb.client
from nilmdb.utils.time import (timestamp_to_human,
                               timestamp_to_seconds,
                               seconds_to_timestamp)

from numpy import *
from scipy import *
#import pylab as p
import operator
import sys

def main(argv = None):
    f = nilmtools.filter.Filter()
    parser = f.setup_parser("Sine wave fitting")
    group = parser.add_argument_group("Sine fit options")
    group.add_argument('-c', '--column', action='store', type=int,
                       help='Column number (first data column is 1)')
    group.add_argument('-f', '--frequency', action='store', type=float,
                       default=60.0,
                       help='Approximate frequency (default: %(default)s)')
    group.add_argument('-m', '--min-freq', action='store', type=float,
                       help='Minimum valid frequency '
                       '(default: approximate frequency / 2))')
    group.add_argument('-M', '--max-freq', action='store', type=float,
                       help='Maximum valid frequency '
                       '(default: approximate frequency * 2))')
    group.add_argument('-a', '--min-amp', action='store', type=float,
                       default=20.0,
                       help='Minimum signal amplitude (default: %(default)s)')

    # Parse arguments
    try:
        args = f.parse_args(argv)
    except nilmtools.filter.MissingDestination as e:
        rec = "float32_3"
        print "Source is %s (%s)" % (e.src.path, e.src.layout)
        print "Destination %s doesn't exist" % (e.dest.path)
        print "You could make it with a command like:"
        print "  nilmtool -u %s create %s %s" % (e.dest.url, e.dest.path, rec)
        raise SystemExit(1)

    if args.column is None or args.column < 1:
        parser.error("need a column number >= 1")
    if args.frequency < 0.1:
        parser.error("frequency must be >= 0.1")
    if args.min_freq is None:
        args.min_freq = args.frequency / 2
    if args.max_freq is None:
        args.max_freq = args.frequency * 2
    if (args.min_freq > args.max_freq or
        args.min_freq > args.frequency or
        args.max_freq < args.frequency):
        parser.error("invalid min or max frequency")
    if args.min_amp < 0:
        parser.error("min amplitude must be >= 0")

    f.check_dest_metadata({ "sinefit_source": f.src.path,
                            "sinefit_column": args.column })
    f.process_numpy(process, args = (args.column, args.frequency, args.min_amp,
                                     args.min_freq, args.max_freq))

class SuppressibleWarning(object):
    def __init__(self, maxcount = 10, maxsuppress = 100):
        self.maxcount = maxcount
        self.maxsuppress = maxsuppress
        self.count = 0
        self.last_msg = ""

    def _write(self, sec, msg):
        if sec:
            now = timestamp_to_human(seconds_to_timestamp(sec)) + ": "
        else:
            now = ""
        sys.stderr.write(now + msg)

    def warn(self, msg, seconds = None):
        self.count += 1
        if self.count <= self.maxcount:
            self._write(seconds, msg)
        if (self.count - self.maxcount) >= self.maxsuppress:
            self.reset(seconds)

    def reset(self, seconds = None):
        if self.count > self.maxcount:
            self._write(seconds, sprintf("(%d warnings suppressed)\n",
                                         self.count - self.maxcount))
        self.count = 0

def process(data, interval, args, insert_function, final):
    (column, f_expected, a_min, f_min, f_max) = args
    rows = data.shape[0]

    # Estimate sampling frequency from timestamps
    fs = (rows-1) / (timestamp_to_seconds(data[-1][0]) -
                     timestamp_to_seconds(data[0][0]))

    # Pull out about 3.5 periods of data at once;
    # we'll expect to match 3 zero crossings in each window
    N = max(int(3.5 * fs / f_expected), 10)

    # If we don't have enough data, don't bother processing it
    if rows < N:
        return 0

    warn = SuppressibleWarning(3, 1000)

    # Process overlapping windows
    start = 0
    num_zc = 0
    last_inserted_timestamp = None
    while start < (rows - N):
        this = data[start:start+N, column]
        t_min = timestamp_to_seconds(data[start, 0])
        t_max = timestamp_to_seconds(data[start+N-1, 0])

        # Do 4-parameter sine wave fit
        (A, f0, phi, C) = sfit4(this, fs)

        # Check bounds.  If frequency is too crazy, ignore this window
        if f0 < f_min or f0 > f_max:
            warn.warn(sprintf("frequency %s outside valid range %s - %s\n",
                              str(f0), str(f_min), str(f_max)), t_min)
            start += N
            continue

        # If amplitude is too low, results are probably just noise
        if A < a_min:
            warn.warn(sprintf("amplitude %s below minimum threshold %s\n",
                              str(A), str(a_min)), t_min)
            start += N
            continue

        #p.plot(arange(N), this)
        #p.plot(arange(N), A * sin(f0/fs * 2 * pi * arange(N) + phi) + C, 'g')

        # Period starts when the argument of sine is 0 degrees,
        # so we're looking for sample number:
        #   n = (0 - phi) / (f0/fs * 2 * pi)
        zc_n = (0 - phi) / (f0 / fs * 2 * pi)
        period_n = fs/f0

        # Add periods to make N positive
        while zc_n < 0:
            zc_n += period_n

        last_zc = None
        # Mark the zero crossings until we're a half period away
        # from the end of the window
        while zc_n < (N - period_n/2):
            #p.plot(zc_n, C, 'ro')
            t = t_min + zc_n / fs
            if (last_inserted_timestamp is None or
                t > last_inserted_timestamp):
                insert_function([[seconds_to_timestamp(t), f0, A, C]])
                last_inserted_timestamp = t
                warn.reset(t)
            else:
                warn.warn("timestamp overlap\n", t)
            num_zc += 1
            last_zc = zc_n
            zc_n += period_n

        # Advance the window one quarter period past the last marked
        # zero crossing, or advance the window by half its size if we
        # didn't mark any.
        if last_zc is not None:
            advance = min(last_zc + period_n/4, N)
        else:
            advance = N/2
        #p.plot(advance, C, 'go')
        #p.show()

        start = int(round(start + advance))

    # Return the number of rows we've processed
    warn.reset(last_inserted_timestamp)
    if last_inserted_timestamp:
        now = timestamp_to_human(seconds_to_timestamp(
            last_inserted_timestamp)) + ": "
    else:
        now = ""
    printf("%sMarked %d zero-crossings in %d rows\n", now, num_zc, start)
    return start

def sfit4(data, fs):
    """(A, f0, phi, C) = sfit4(data, fs)

    Compute 4-parameter (unknown-frequency) least-squares fit to
    sine-wave data, according to IEEE Std 1241-2010 Annex B

    Input:
      data   vector of input samples
      fs     sampling rate (Hz)

    Output:
      Parameters [A, f0, phi, C] to fit the equation
        x[n] = A * sin(f0/fs * 2 * pi * n + phi) + C
      where n is sample number.  Or, as a function of time:
        x(t) = A * sin(f0 * 2 * pi * t + phi) + C

    by Jim Paris
    (Verified to match sfit4.m)
    """
    N = len(data)
    t = linspace(0, (N-1) / float(fs), N)

    ## Estimate frequency using FFT (step b)
    Fc = fft(data)
    F = abs(Fc)
    F[0] = 0 # eliminate DC

    # Find pair of spectral lines with largest amplitude:
    # resulting values are in F(i) and F(i+1)
    i = argmax(F[0:int(N/2)] + F[1:int(N/2+1)])

    # Interpolate FFT to get a better result (from Markus [B37])
    U1 = real(Fc[i])
    U2 = real(Fc[i+1])
    V1 = imag(Fc[i])
    V2 = imag(Fc[i+1])
    n = 2 * pi / N
    ni1 = n * i
    ni2 = n * (i+1)
    K = ((V2-V1)*sin(ni1) + (U2-U1)*cos(ni1)) / (U2-U1)
    Z1 = V1 * (K - cos(ni1)) / sin(ni1) + U1
    Z2 = V2 * (K - cos(ni2)) / sin(ni2) + U2
    i = arccos((Z2*cos(ni2) - Z1*cos(ni1)) / (Z2-Z1)) / n

    # Convert to Hz
    f0 = i * float(fs) / N

    # Fit it.  We'll catch exceptions here and just returns zeros
    # if something fails with the least squares fit, etc.
    try:
        # first guess for A0, B0 using 3-parameter fit (step c)
        s = zeros(3)
        w = 2*pi*f0

        # Now iterate 7 times (step b, plus 6 iterations of step i)
        for idx in range(7):
            D = c_[cos(w*t), sin(w*t), ones(N),
                   -s[0] * t * sin(w*t) + s[1] * t * cos(w*t) ] # eqn B.16
            s = linalg.lstsq(D, data)[0] # eqn B.18
            w = w + s[3] # update frequency estimate

        ## Extract results
        A = sqrt(s[0]*s[0] + s[1]*s[1]) # eqn B.21
        f0 = w / (2*pi)
        phi = arctan2(s[0], s[1]) # eqn B.22 (flipped for sin instead of cos)
        C = s[2]
        return (A, f0, phi, C)
    except Exception as e:
        # something broke down, just return zeros
        return (0, 0, 0, 0)

if __name__ == "__main__":
    main()
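A quick numerical check of sfit4() on synthetic data (a sketch; the amplitude, phase, and offset are chosen arbitrarily):

    from numpy import arange, sin, pi
    import nilmtools.sinefit

    fs = 8000.0
    n = arange(8192)
    # x[n] = A * sin(f0/fs * 2*pi*n + phi) + C with A=120, f0=60, phi=0.5, C=2
    data = 120.0 * sin(60.0 / fs * 2 * pi * n + 0.5) + 2.0
    (A, f0, phi, C) = nilmtools.sinefit.sfit4(data, fs)
    print A, f0, phi, C    # should recover approximately (120, 60, 0.5, 2)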
nilmtools/trainola.py (new executable file, 279 lines)
@@ -0,0 +1,279 @@
#!/usr/bin/python

from nilmdb.utils.printf import *
import nilmdb.client
import nilmtools.filter
from nilmdb.utils.time import (timestamp_to_human,
                               timestamp_to_seconds,
                               seconds_to_timestamp)
from nilmdb.utils.interval import Interval

import numpy as np
import scipy
import scipy.signal
from numpy.core.umath_tests import inner1d
import nilmrun
from collections import OrderedDict
import sys
import functools
import collections

class DataError(ValueError):
    pass

def build_column_mapping(colinfo, streaminfo):
    """Given the 'columns' list from the JSON data, verify and
    pull out a dictionary mapping for the column names/numbers."""
    columns = OrderedDict()
    for c in colinfo:
        if (c['name'] in columns.keys() or
            c['index'] in columns.values()):
            raise DataError("duplicated columns")
        if (c['index'] < 0 or c['index'] >= streaminfo.layout_count):
            raise DataError("bad column number")
        columns[c['name']] = c['index']
    if not len(columns):
        raise DataError("no columns")
    return columns

class Exemplar(object):
    def __init__(self, exinfo, min_rows = 10, max_rows = 100000):
        """Given a dictionary entry from the 'exemplars' input JSON,
        verify the stream, columns, etc.  Then, fetch all the data
        into self.data."""

        self.name = exinfo['name']
        self.url = exinfo['url']
        self.stream = exinfo['stream']
        self.start = exinfo['start']
        self.end = exinfo['end']
        self.dest_column = exinfo['dest_column']

        # Get stream info
        self.client = nilmdb.client.numpyclient.NumpyClient(self.url)
        self.info = nilmtools.filter.get_stream_info(self.client, self.stream)

        # Build up name => index mapping for the columns
        self.columns = build_column_mapping(exinfo['columns'], self.info)

        # Count points
        self.count = self.client.stream_count(self.stream, self.start, self.end)

        # Verify count
        if self.count == 0:
            raise DataError("No data in this exemplar!")
        if self.count < min_rows:
            raise DataError("Too few data points: " + str(self.count))
        if self.count > max_rows:
            raise DataError("Too many data points: " + str(self.count))

        # Extract the data
        datagen = self.client.stream_extract_numpy(self.stream,
                                                   self.start, self.end,
                                                   self.info.layout,
                                                   maxrows = self.count)
        self.data = list(datagen)[0]

        # Discard timestamp
        self.data = self.data[:,1:]

        # Subtract the mean from each column
        self.data = self.data - self.data.mean(axis=0)

        # Get scale factors for each column by computing dot product
        # of each column with itself.
        self.scale = inner1d(self.data.T, self.data.T)

        # Ensure a minimum (nonzero) scale and convert to list
        self.scale = np.maximum(self.scale, [1e-9]).tolist()

    def __str__(self):
        return sprintf("\"%s\" %s [%s] %s rows",
                       self.name, self.stream, ",".join(self.columns.keys()),
                       self.count)

def peak_detect(data, delta):
    """Simple min/max peak detection algorithm, taken from my code
    in the disagg.m from the 10-8-5 paper"""
    mins = [];
    maxs = [];
    cur_min = (None, np.inf)
    cur_max = (None, -np.inf)
    lookformax = False
    for (n, p) in enumerate(data):
        if p > cur_max[1]:
            cur_max = (n, p)
        if p < cur_min[1]:
            cur_min = (n, p)
        if lookformax:
            if p < (cur_max[1] - delta):
                maxs.append(cur_max)
                cur_min = (n, p)
                lookformax = False
        else:
            if p > (cur_min[1] + delta):
                mins.append(cur_min)
                cur_max = (n, p)
                lookformax = True
    return (mins, maxs)

def trainola_matcher(data, interval, args, insert_func, final_chunk):
    """Perform cross-correlation match"""
    ( src_columns, dest_count, exemplars ) = args
    nrows = data.shape[0]

    # We want at least 10% more points than the widest exemplar.
    widest = max([ x.count for x in exemplars ])
    if (widest * 1.1) > nrows:
        return 0

    # This is how many points we'll consider valid in the
    # cross-correlation.
    valid = nrows + 1 - widest
    matches = collections.defaultdict(list)

    # Try matching against each of the exemplars
    for e in exemplars:
        corrs = []

        # Compute cross-correlation for each column
        for col_name in e.columns:
            a = data[:, src_columns[col_name] + 1]
            b = e.data[:, e.columns[col_name]]
            corr = scipy.signal.fftconvolve(a, np.flipud(b), 'valid')[0:valid]

            # Scale by the norm of the exemplar
            corr = corr / e.scale[e.columns[col_name]]
            corrs.append(corr)

        # Find the peaks using the column with the largest amplitude
        biggest = e.scale.index(max(e.scale))
        peaks_minmax = peak_detect(corrs[biggest], 0.1)
        peaks = [ p[0] for p in peaks_minmax[1] ]

        # Now look at every peak
        for row in peaks:
            # Correlation for each column must be close enough to 1.
            for (corr, scale) in zip(corrs, e.scale):
                # The accepted distance from 1 is based on the relative
                # amplitude of the column.  Use a linear mapping:
                # scale 1.0 -> distance 0.1
                # scale 0.0 -> distance 1.0
                distance = 1 - 0.9 * (scale / e.scale[biggest])
                if abs(corr[row] - 1) > distance:
                    # No match
                    break
            else:
                # Successful match
                matches[row].append(e)

    # Insert matches into destination stream.
    matched_rows = sorted(matches.keys())
    out = np.zeros((len(matched_rows), dest_count + 1))

    for n, row in enumerate(matched_rows):
        # Fill timestamp
        out[n][0] = data[row, 0]

        # Mark matched exemplars
        for exemplar in matches[row]:
            out[n, exemplar.dest_column + 1] = 1.0

    # Insert it
    insert_func(out)

    # Return how many rows we processed
    return max(valid, 0)

def trainola(conf):
    print "Trainola", nilmtools.__version__

    # Load main stream data
    url = conf['url']
    src_path = conf['stream']
    dest_path = conf['dest_stream']
    start = conf['start']
    end = conf['end']

    # Get info for the src and dest streams
    src_client = nilmdb.client.numpyclient.NumpyClient(url)
    src = nilmtools.filter.get_stream_info(src_client, src_path)
    if not src:
        raise DataError("source path '" + src_path + "' does not exist")
    src_columns = build_column_mapping(conf['columns'], src)

    dest_client = nilmdb.client.numpyclient.NumpyClient(url)
    dest = nilmtools.filter.get_stream_info(dest_client, dest_path)
    if not dest:
        raise DataError("destination path '" + dest_path + "' does not exist")

    printf("Source:\n")
    printf("  %s [%s]\n", src.path, ",".join(src_columns.keys()))
    printf("Destination:\n")
    printf("  %s (%s columns)\n", dest.path, dest.layout_count)

    # Pull in the exemplar data
    exemplars = []
    for n, exinfo in enumerate(conf['exemplars']):
        printf("Loading exemplar %d:\n", n)
        e = Exemplar(exinfo)
        col = e.dest_column
        if col < 0 or col >= dest.layout_count:
            raise DataError(sprintf("bad destination column number %d\n" +
                                    "dest stream only has 0 through %d",
                                    col, dest.layout_count - 1))
        printf("  %s, output column %d\n", str(e), col)
        exemplars.append(e)
    if len(exemplars) == 0:
        raise DataError("missing exemplars")

    # Verify that the exemplar columns are all represented in the main data
    for n, ex in enumerate(exemplars):
        for col in ex.columns:
            if col not in src_columns:
                raise DataError(sprintf("Exemplar %d column %s is not "
                                        "available in source data", n, col))

    # Figure out which intervals we should process
    intervals = ( Interval(s, e) for (s, e) in
                  src_client.stream_intervals(src_path,
                                              diffpath = dest_path,
                                              start = start, end = end) )
    intervals = nilmdb.utils.interval.optimize(intervals)

    # Do the processing
    rows = 100000
    extractor = functools.partial(src_client.stream_extract_numpy,
                                  src.path, layout = src.layout, maxrows = rows)
    inserter = functools.partial(dest_client.stream_insert_numpy_context,
                                 dest.path)
    for interval in intervals:
        printf("Processing interval:\n")
        printf("  %s\n", interval.human_string())
        nilmtools.filter.process_numpy_interval(
            interval, extractor, inserter, rows * 3,
            trainola_matcher, (src_columns, dest.layout_count, exemplars))

    return "done"

def main(argv = None):
    import simplejson as json
    import sys

    if argv is None:
        argv = sys.argv[1:]
    if len(argv) != 1:
        raise DataError("need one argument, either a dictionary or JSON string")

    try:
        # Passed in a JSON string (e.g. on the command line)
        conf = json.loads(argv[0])
    except TypeError as e:
        # Passed in the config dictionary (e.g. from NilmRun)
        conf = argv[0]

    return trainola(conf)

if __name__ == "__main__":
    main()
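The peak detector above is the piece that turns the normalized cross-correlation into candidate match rows. Its behavior on a toy series, hand-checked with delta = 0.5 (a sketch, assuming nilmtools is installed):

    import nilmtools.trainola

    data = [0, 1, 0.2, 0.8, 0.1, 0.9, 0]
    (mins, maxs) = nilmtools.trainola.peak_detect(data, 0.5)
    print maxs    # -> [(1, 1), (3, 0.8), (5, 0.9)]  (index, value) of local maxima
    print mins    # -> [(0, 0), (2, 0.2), (4, 0.1)]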
setup.py (9 changed lines)
@@ -30,7 +30,7 @@ except ImportError:
 # Versioneer manages version numbers from git tags.
 # https://github.com/warner/python-versioneer
 import versioneer
-versioneer.versionfile_source = 'src/_version.py'
+versioneer.versionfile_source = 'nilmtools/_version.py'
 versioneer.versionfile_build = 'nilmtools/_version.py'
 versioneer.tag_prefix = 'nilmtools-'
 versioneer.parentdir_prefix = 'nilmtools-'
@@ -61,14 +61,13 @@ setup(name='nilmtools',
       long_description = "NILM Database Tools",
       license = "Proprietary",
       author_email = 'jim@jtan.com',
-      install_requires = [ 'nilmdb >= 1.5.0',
+      install_requires = [ 'nilmdb >= 1.8.1',
                            'numpy',
                            'scipy',
-                           'matplotlib',
+                           #'matplotlib',
                            ],
       packages = [ 'nilmtools',
                    ],
-      package_dir = { 'nilmtools': 'src' },
       entry_points = {
           'console_scripts': [
               'nilm-decimate = nilmtools.decimate:main',
@@ -79,6 +78,8 @@ setup(name='nilmtools',
               'nilm-copy-wildcard = nilmtools.copy_wildcard:main',
               'nilm-sinefit = nilmtools.sinefit:main',
               'nilm-cleanup = nilmtools.cleanup:main',
+              'nilm-median = nilmtools.median:main',
+              'nilm-trainola = nilmtools.trainola:main',
           ],
       },
       zip_safe = False,
src/sinefit.py (deleted file, 187 lines)
@@ -1,187 +0,0 @@
#!/usr/bin/python

# Sine wave fitting.  This runs about 5x faster than realtime on raw data.

import nilmtools.filter
import nilmdb.client
from numpy import *
from scipy import *
#import pylab as p
import operator

def main(argv = None):
    f = nilmtools.filter.Filter()
    parser = f.setup_parser("Sine wave fitting")
    group = parser.add_argument_group("Sine fit options")
    group.add_argument('-c', '--column', action='store', type=int,
                       help='Column number (first data column is 1)')
    group.add_argument('-f', '--frequency', action='store', type=float,
                       default=60.0,
                       help='Approximate frequency (default: %(default)s)')

    # Parse arguments
    try:
        args = f.parse_args(argv)
    except nilmtools.filter.MissingDestination as e:
        rec = "float32_3"
        print "Source is %s (%s)" % (e.src.path, e.src.layout)
        print "Destination %s doesn't exist" % (e.dest.path)
        print "You could make it with a command like:"
        print "  nilmtool -u %s create %s %s" % (e.dest.url, e.dest.path, rec)
        raise SystemExit(1)

    if args.column is None or args.column < 1:
        parser.error("need a column number >= 1")
    if args.frequency < 0.1:
        parser.error("frequency must be >= 0.1")

    f.check_dest_metadata({ "sinefit_source": f.src.path,
                            "sinefit_column": args.column })
    f.process_numpy(process, args = (args.column, args.frequency))

def process(data, interval, args, insert_function, final):
    (column, f_expected) = args
    rows = data.shape[0]

    # Estimate sampling frequency from timestamps
    fs = 1e6 * (rows-1) / (data[-1][0] - data[0][0])

    # Pull out about 3.5 periods of data at once;
    # we'll expect to match 3 zero crossings in each window
    N = max(int(3.5 * fs / f_expected), 10)

    # If we don't have enough data, don't bother processing it
    if rows < N:
        return 0

    # Process overlapping windows
    start = 0
    num_zc = 0
    while start < (rows - N):
        this = data[start:start+N, column]
        t_min = data[start, 0]/1e6
        t_max = data[start+N-1, 0]/1e6

        # Do 4-parameter sine wave fit
        (A, f0, phi, C) = sfit4(this, fs)

        # Check bounds.  If frequency is too crazy, ignore this window
        if f0 < (f_expected/2) or f0 > (f_expected*2):
            print "frequency", f0, "too far from expected value", f_expected
            start += N
            continue

        #p.plot(arange(N), this)
        #p.plot(arange(N), A * cos(f0/fs * 2 * pi * arange(N) + phi) + C, 'g')

        # Period starts when the argument of cosine is 3*pi/2 degrees,
        # so we're looking for sample number:
        #   n = (3 * pi / 2 - phi) / (f0/fs * 2 * pi)
        zc_n = (3 * pi / 2 - phi) / (f0 / fs * 2 * pi)
        period_n = fs/f0

        # Add periods to make N positive
        while zc_n < 0:
            zc_n += period_n

        last_zc = None
        # Mark the zero crossings until we're a half period away
        # from the end of the window
        while zc_n < (N - period_n/2):
            #p.plot(zc_n, C, 'ro')
            t = t_min + zc_n / fs
            insert_function([[t * 1e6, f0, A, C]])
            num_zc += 1
            last_zc = zc_n
            zc_n += period_n

        # Advance the window one quarter period past the last marked
        # zero crossing, or advance the window by half its size if we
        # didn't mark any.
        if last_zc is not None:
            advance = min(last_zc + period_n/4, N)
        else:
            advance = N/2
        #p.plot(advance, C, 'go')
        #p.show()

        start = int(round(start + advance))

    # Return the number of rows we've processed
    print "Marked", num_zc, "zero-crossings in", start, "rows"
    return start

def sfit4(data, fs):
    """(A, f0, phi, C) = sfit4(data, fs)

    Compute 4-parameter (unknown-frequency) least-squares fit to
    sine-wave data, according to IEEE Std 1241-2010 Annex B

    Input:
      data   vector of input samples
      fs     sampling rate (Hz)

    Output:
      Parameters [A, f0, phi, C] to fit the equation
        x[n] = A * cos(f0/fs * 2 * pi * n + phi) + C
      where n is sample number.  Or, as a function of time:
        x(t) = A * cos(f0 * 2 * pi * t + phi) + C

    by Jim Paris
    (Verified to match sfit4.m)
    """
    N = len(data)
    t = linspace(0, (N-1) / fs, N)

    ## Estimate frequency using FFT (step b)
    Fc = fft(data)
    F = abs(Fc)
    F[0] = 0 # eliminate DC

    # Find pair of spectral lines with largest amplitude:
    # resulting values are in F(i) and F(i+1)
    i = argmax(F[0:int(N/2)] + F[1:int(N/2+1)])

    # Interpolate FFT to get a better result (from Markus [B37])
    U1 = real(Fc[i])
    U2 = real(Fc[i+1])
    V1 = imag(Fc[i])
    V2 = imag(Fc[i+1])
    n = 2 * pi / N
    ni1 = n * i
    ni2 = n * (i+1)
    K = ((V2-V1)*sin(ni1) + (U2-U1)*cos(ni1)) / (U2-U1)
    Z1 = V1 * (K - cos(ni1)) / sin(ni1) + U1
    Z2 = V2 * (K - cos(ni2)) / sin(ni2) + U2
    i = arccos((Z2*cos(ni2) - Z1*cos(ni1)) / (Z2-Z1)) / n

    # Convert to Hz
    f0 = i * fs / N

    ## Fit it
    # first guess for A0, B0 using 3-parameter fit (step c)
    w = 2*pi*f0
    D = c_[cos(w*t), sin(w*t), ones(N)]
    s = linalg.lstsq(D, data)[0]

    # Now iterate 6 times (step i)
    for idx in range(6):
        D = c_[cos(w*t), sin(w*t), ones(N),
               -s[0] * t * sin(w*t) + s[1] * t * cos(w*t) ] # eqn B.16
        s = linalg.lstsq(D, data)[0] # eqn B.18
        w = w + s[3] # update frequency estimate

    ## Extract results
    A = sqrt(s[0]*s[0] + s[1]*s[1]) # eqn B.21
    f0 = w / (2*pi)
    try:
        phi = -arctan2(s[1], s[0]) # eqn B.22
    except TypeError:
        # something broke down, just return zeros
        return (0, 0, 0, 0)
    C = s[2]

    return (A, f0, phi, C)

if __name__ == "__main__":
    main()