Browse Source

Decimate seems to work pretty well right now

tags/nilmtools-0.2^0
Jim Paris 8 years ago
parent
commit
54f8c34f8e
3 changed files with 141 additions and 49 deletions
  1. +4
    -1
      Makefile
  2. +40
    -32
      nilmtools/decimate.py
  3. +97
    -16
      nilmtools/filter.py

+ 4
- 1
Makefile View File

@@ -1,5 +1,8 @@
test:
python nilmtools/decimate.py /lees-compressor/noleak/raw /lees-compressor/noleak/raw~4
nilmtool remove /lees-compressor/noleak/raw~4 -s 2000 -e 2020
nilmtool remove /lees-compressor/noleak/raw~16 -s 2000 -e 2020
python nilmtools/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/noleak/raw /lees-compressor/noleak/raw~4
python nilmtools/decimate.py -s '2013-02-04 18:10:00' -e '2013-02-04 18:11:00' /lees-compressor/noleak/raw~4 /lees-compressor/noleak/raw~16

all:
@echo "Try 'make install'"


+ 40
- 32
nilmtools/decimate.py View File

@@ -2,9 +2,7 @@

import nilmtools.filter
import nilmdb.client

def DecimateException(Exception):
pass
import numpy as np

def main():
f = nilmtools.filter.Filter()
@@ -16,10 +14,10 @@ def main():
try:
args = f.parse_args()
except nilmtools.filter.MissingDestination as e:
# If no destination, suggest how to create it.
# If no destination, suggest how to create it by figuring out
# a recommended layout.
print "Source is %s (%s)" % (e.src, e.layout)
print "Destination %s doesn't exist" % (e.dest)
# Figure out a recommended layout
if "decimate_source" in f.client.stream_get_metadata(e.src):
rec = e.layout
elif 'int32' in e.layout_type or 'float64' in e.layout_type:
@@ -30,33 +28,43 @@ def main():
print " nilmtool create", e.dest, rec
raise SystemExit(1)

# See if the metadata jives, and complain if it doesn't
dest_metadata = f.client.stream_get_metadata(args.destpath)
try:
rows = f.destinfo[4] # don't complain unless there's data
tmp = dest_metadata.get("decimate_source", args.srcpath)
if tmp != args.srcpath and rows > 0:
raise DecimateException("storing decimated data from %s" % tmp)
tmp = int(dest_metadata.get("decimate_factor", args.factor))
if tmp != args.factor and rows > 0:
raise DecimateException("storing data decimated at "
"a different factor (%d)", tmp)
except DecimateException as e:
print "The destination seems to already be " + str(e)
print "Refusing to change it. You can change the stream's"
print "decimate_* metadata, or remove all data in the stream,"
print "to prevent this error."

# Fill in the metadata in case it's missing
f.client.stream_update_metadata(args.destpath,
{ "decimate_source": args.srcpath,
"decimate_factor": args.factor })

# Process it
f.process(maxlen = 600, function = decimate, args = (factor,))

def decimate(data, start, end, factor):
pass
f.check_dest_metadata({ "decimate_source": args.srcpath,
"decimate_factor": args.factor })

# If source is decimated, we have to decimate a bit differently
if "decimate_source" in f.client.stream_get_metadata(args.srcpath):
f.process(function = decimate_again, rows = args.factor)
else:
f.process(function = decimate_first, rows = args.factor)

def decimate_first(data):
    """Decimate original data -- result has 3 times as many columns"""
    # Output layout: [ mean(timestamp), mean(col 1..n), min(col 1..n),
    # max(col 1..n) ], reduced over all input rows.
    arr = np.array(data)
    values = arr[:, 1:]          # every column except the timestamp
    out = np.concatenate([
        [np.mean(arr[:, 0], 0)],
        np.mean(values, 0),
        np.min(values, 0),
        np.max(values, 0),
    ])
    return [out]

def decimate_again(data):
    """Decimate already-decimated data -- result has the same number
    of columns"""
    # Columns are grouped as [ ts, mean*n, min*n, max*n ]; each group is
    # reduced with its matching operation so the layout is preserved.
    arr = np.array(data)
    n = (arr.shape[1] - 1) // 3
    out = np.zeros(1 + 3 * n)
    out[0] = np.mean(arr[:, 0], 0)
    for (sl, reduce_fn) in ((slice(1, n + 1), np.mean),
                            (slice(n + 1, 2 * n + 1), np.min),
                            (slice(2 * n + 1, 3 * n + 1), np.max)):
        out[sl] = reduce_fn(arr[:, sl], 0)
    return [out]

# Run the decimation filter when this module is executed as a script.
if __name__ == "__main__":
    main()

+ 97
- 16
nilmtools/filter.py View File

@@ -5,6 +5,8 @@ from nilmdb.utils.printf import *
from nilmdb.utils.time import parse_time, format_time

import nilmtools

import itertools
import time
import sys
import re
@@ -48,6 +50,14 @@ class Filter(object):
default = False,
help="Just print intervals that would be "
"processed")
group.add_argument("-s", "--start",
metavar="TIME", type=self.arg_time,
help="Starting timestamp for intervals "
"(free-form, inclusive)")
group.add_argument("-e", "--end",
metavar="TIME", type=self.arg_time,
help="Ending timestamp for intervals "
"(free-form, noninclusive)")
group.add_argument("srcpath", action="store",
help="Path of source stream, e.g. /foo/bar")
group.add_argument("destpath", action="store",
@@ -89,11 +99,20 @@ class Filter(object):
"""Generate all the intervals that this filter should process"""
self._using_client = True
for i in self._client.stream_intervals(
self._args.srcpath, diffpath = self._args.destpath):
self._args.srcpath, diffpath = self._args.destpath,
start = self._args.start, end = self._args.end):
yield i
self._using_client = False

# Misc helpers
def arg_time(self, toparse):
    """Parse a free-form time string argument into a timestamp.

    Used as an argparse 'type=' hook for -s/-e options."""
    try:
        parsed = nilmdb.utils.time.parse_time(toparse).totimestamp()
    except ValueError as e:
        # argparse displays ArgumentTypeError text directly to the user.
        raise argparse.ArgumentTypeError(sprintf("%s \"%s\"",
                                                 str(e), toparse))
    return parsed

def stream_info_string(self, info):
"""Print stream info as a string"""
return sprintf("%s (%s), %.2fM rows, %.2f hours",
@@ -104,27 +123,89 @@ class Filter(object):
return sprintf("[ %s -> %s ]", format_time(interval[0]),
format_time(interval[1]))

def check_dest_metadata(self, data):
    """See if the metadata jives, and complain if it doesn't.  If
    there's no conflict, update the metadata to match 'data'."""
    metadata = self._client.stream_get_metadata(self._args.destpath)
    # destinfo[4] is used as "number of rows in the destination", so a
    # metadata mismatch is only an error once real data exists --
    # TODO confirm index 4 is the row count of the stream-info tuple.
    rows = self.destinfo[4]
    for key in data:
        wanted = str(data[key])
        # A missing key defaults to the wanted value, so absent
        # metadata never counts as a conflict.
        val = metadata.get(key, wanted)
        if val != wanted and rows > 0:
            m = "Metadata in destination stream:\n"
            m += " %s = %s\n" % (key, val)
            m += "doesn't match desired data:\n"
            m += " %s = %s\n" % (key, wanted)
            m += "Refusing to change it. You can change the stream's "
            m += "metadata manually, or\n"
            m += "remove existing data from the stream, to prevent "
            m += "this error.\n"
            raise Exception(m)
    # All good -- write the metadata in case it's not already there
    self._client.stream_update_metadata(self._args.destpath, data)

# Main processing helper
def process(self, function, maxlen, args):
"""Process data in chunks.
def process(self, function, rows, partial = True, args = None):
"""Process data in chunks of 'rows' data at a time.

function: function to process the data
maxlen: maximum length of data to pass to function, in seconds
args: tuple containing extra arguments to pass to function
rows: maximum number of rows to pass to 'function' at once
args: tuple containing extra arguments to pass to 'function'
partial: if true, less than 'rows' may be passed to 'function'.
if false, partial data at the end of an interval will
be dropped.

'function' should be defined like:
function(data, start, end, *args)
It will be passed a block of data from the source stream,
the start and end times of that block, and any arguments
that were passed to process in 'args'. The total
length of the interval will be at most 'maxlen' seconds.

'function' should transform the data as desired, and return
a new list of data, which will be inserted into the
destination stream."""
function(data, *args)
It will be passed an array containing up to 'rows' rows of
data from the source stream, and any arguments passed in
'args'. It should transform the data as desired, and return a
new array of data, which will be inserted into the destination
stream.
"""
if args is None:
args = []
extractor = nilmdb.client.Client(self._args.url).stream_extract
inserter = nilmdb.client.Client(self._args.url).stream_insert_context
src = self._args.srcpath
dest = self._args.destpath
islice = itertools.islice

# Figure out how to format output data
dest_layout = self.destinfo[1].split('_')[1]
def int_formatter(row):
return ("%.6f " % row[0]) + " ".join(str(int(x)) for x in row[1:])
def float_formatter(row):
return ("%.6f " % row[0]) + " ".join(repr(x) for x in row[1:])
if "int" in dest_layout:
formatter = int_formatter
else:
formatter = float_formatter

for (start, end) in self.intervals():
if (end - start)
return
print "Processing", self.interval_string((start, end))
with inserter(dest, start, end) as insert_ctx:
src_array = []
for line in extractor(src, start, end):
# Read in data
src_array.append([ float(x) for x in line.split() ])

if len(src_array) == rows:
# Pass through filter function
dest_array = function(src_array, *args)

# Write result to destination
out = [ formatter(row) for row in dest_array ]
insert_ctx.insert("\n".join(out) + "\n")

# Clear source array
src_array = []

# Take care of partial chunk
if len(src_array) and partial:
dest_array = function(src_array, *args)
out = [ formatter(row) for row in dest_array ]
insert_ctx.insert("\n".join(out) + "\n")

def main():
# This is just a dummy function; actual filters can use the other


Loading…
Cancel
Save