|
- #!/usr/bin/python
-
- from __future__ import absolute_import
-
- import nilmdb.client
- from nilmdb.client import Client
- from nilmdb.utils.printf import *
- from nilmdb.utils.time import (parse_time, timestamp_to_human,
- timestamp_to_seconds)
- from nilmdb.utils.interval import Interval
-
- import nilmtools
-
- import itertools
- import time
- import sys
- import re
- import argparse
- import numpy as np
- import cStringIO
-
class MissingDestination(Exception):
    """Raised when the requested destination stream does not exist.

    Carries the source StreamInfo ('src') and the partially-filled
    destination StreamInfo ('dest') so callers can report or create
    the missing stream."""
    def __init__(self, src, dest):
        message = "destination path " + dest.path + " not found"
        Exception.__init__(self, message)
        self.src = src
        self.dest = dest
-
class StreamInfo(object):
    """Hold info about a single NilmDB stream.

    Parses the extended stream_list() tuple into named attributes.
    If 'info' is too short (or not indexable at all), the remaining
    attributes are simply left unset -- this is used deliberately by
    parse_args() to build a stub for MissingDestination."""
    def __init__(self, url, info, interhost):
        self.url = url
        self.info = info
        # True if source and destination are on different servers
        self.interhost = interhost
        try:
            self.path = info[0]
            self.layout = info[1]
            # Layout looks like e.g. "float32_8": type and column count
            self.layout_type = self.layout.split('_')[0]
            self.layout_count = int(self.layout.split('_')[1])
            # +1 accounts for the timestamp column
            self.total_count = self.layout_count + 1
            self.timestamp_min = info[2]
            self.timestamp_max = info[3]
            self.rows = info[4]
            self.seconds = timestamp_to_seconds(info[5])
        except (IndexError, TypeError):
            # Bug fix: "except IndexError, TypeError:" caught only
            # IndexError and bound it to the name TypeError; both
            # exception types are intended to be swallowed here.
            pass

    def __str__(self):
        """Print stream info as a string"""
        res = ""
        if self.interhost:
            res = sprintf("[%s] ", self.url)
        res += sprintf("%s (%s), %.2fM rows, %.2f hours",
                       self.path, self.layout, self.rows / 1e6,
                       self.seconds / 3600.0)
        return res
-
- class Filter(object):
-
- def __init__(self):
- self._parser = None
- self._client_src = None
- self._client_dest = None
- self._using_client = False
- self.src = None
- self.dest = None
- self.start = None
- self.end = None
- self.interhost = False
-
- @property
- def client_src(self):
- if self._using_client:
- raise Exception("Filter client is in use; make another")
- return self._client_src
-
- @property
- def client_dest(self):
- if self._using_client:
- raise Exception("Filter client is in use; make another")
- return self._client_dest
-
- def setup_parser(self, description = "Filter data"):
- parser = argparse.ArgumentParser(
- formatter_class = argparse.RawDescriptionHelpFormatter,
- version = nilmtools.__version__,
- description = description)
- group = parser.add_argument_group("General filter arguments")
- group.add_argument("-u", "--url", action="store",
- default="http://localhost:12380/",
- help="Server URL (default: %(default)s)")
- group.add_argument("-U", "--dest-url", action="store",
- help="Destination server URL "
- "(default: same as source)")
- group.add_argument("-D", "--dry-run", action="store_true",
- default = False,
- help="Just print intervals that would be "
- "processed")
- group.add_argument("-s", "--start",
- metavar="TIME", type=self.arg_time,
- help="Starting timestamp for intervals "
- "(free-form, inclusive)")
- group.add_argument("-e", "--end",
- metavar="TIME", type=self.arg_time,
- help="Ending timestamp for intervals "
- "(free-form, noninclusive)")
- group.add_argument("srcpath", action="store",
- help="Path of source stream, e.g. /foo/bar")
- group.add_argument("destpath", action="store",
- help="Path of destination stream, e.g. /foo/bar")
- self._parser = parser
- return parser
-
- def interval_string(self, interval):
- return sprintf("[ %s -> %s ]",
- timestamp_to_human(interval.start),
- timestamp_to_human(interval.end))
-
- def parse_args(self):
- args = self._parser.parse_args()
-
- if args.dest_url is None:
- args.dest_url = args.url
- if args.url != args.dest_url:
- self.interhost = True
-
- self._client_src = Client(args.url)
- self._client_dest = Client(args.dest_url)
-
- if (not self.interhost) and (args.srcpath == args.destpath):
- raise Exception("source and destination path must be different")
-
- # Open and print info about the streams
- src = self._client_src.stream_list(args.srcpath, extended = True)
- if len(src) != 1:
- raise Exception("source path " + args.srcpath + " not found")
- self.src = StreamInfo(args.url, src[0], self.interhost)
-
- dest = self._client_dest.stream_list(args.destpath, extended = True)
- if len(dest) != 1:
- raise MissingDestination(self.src,
- StreamInfo(args.dest_url, [args.destpath],
- self.interhost))
- self.dest = StreamInfo(args.dest_url, dest[0], self.interhost)
-
- print "Source:", self.src
- print " Dest:", self.dest
-
- if args.dry_run:
- for interval in self.intervals():
- print self.interval_string(interval)
- raise SystemExit(0)
-
- self.start = args.start
- self.end = args.end
-
- return args
-
- def _optimize_int(self, it):
- """Join and yield adjacent intervals from the iterator 'it'"""
- saved_int = None
- for interval in it:
- if saved_int is not None:
- if saved_int.end == interval.start:
- interval.start = saved_int.start
- else:
- yield saved_int
- saved_int = interval
- if saved_int is not None:
- yield saved_int
-
- def intervals(self):
- """Generate all the intervals that this filter should process"""
- self._using_client = True
-
- if self.interhost:
- # Do the difference ourselves
- s_intervals = ( Interval(start, end)
- for (start, end) in
- self._client_src.stream_intervals(
- self.src.path,
- start = self.start, end = self.end) )
- d_intervals = ( Interval(start, end)
- for (start, end) in
- self._client_dest.stream_intervals(
- self.dest.path,
- start = self.start, end = self.end) )
- intervals = nilmdb.utils.interval.set_difference(s_intervals,
- d_intervals)
- else:
- # Let the server do the difference for us
- intervals = ( Interval(start, end)
- for (start, end) in
- self._client_src.stream_intervals(
- self.src.path, diffpath = self.dest.path,
- start = self.start, end = self.end) )
- # Optimize intervals: join intervals that are adjacent
- for interval in self._optimize_int(intervals):
- yield interval
- self._using_client = False
-
- # Misc helpers
- def arg_time(self, toparse):
- """Parse a time string argument"""
- try:
- return nilmdb.utils.time.parse_time(toparse)
- except ValueError as e:
- raise argparse.ArgumentTypeError(sprintf("%s \"%s\"",
- str(e), toparse))
-
- def check_dest_metadata(self, data):
- """See if the metadata jives, and complain if it doesn't. If
- there's no conflict, update the metadata to match 'data'."""
- metadata = self._client_dest.stream_get_metadata(self.dest.path)
- for key in data:
- wanted = str(data[key])
- val = metadata.get(key, wanted)
- if val != wanted and self.dest.rows > 0:
- m = "Metadata in destination stream:\n"
- m += " %s = %s\n" % (key, val)
- m += "doesn't match desired data:\n"
- m += " %s = %s\n" % (key, wanted)
- m += "Refusing to change it. You can change the stream's "
- m += "metadata manually, or\n"
- m += "remove existing data from the stream, to prevent "
- m += "this error.\n"
- raise Exception(m)
- # All good -- write the metadata in case it's not already there
- self._client_dest.stream_update_metadata(self.dest.path, data)
-
- # Main processing helper
- def process_python(self, function, rows, args = None, partial = False):
- """Process data in chunks of 'rows' data at a time.
-
- This provides data as nested Python lists and expects the same
- back.
-
- function: function to process the data
- rows: maximum number of rows to pass to 'function' at once
- args: tuple containing extra arguments to pass to 'function'
- partial: if true, less than 'rows' may be passed to 'function'.
- if false, partial data at the end of an interval will
- be dropped.
-
- 'function' should be defined like:
- function(data, *args)
- It will be passed a list containing up to 'rows' rows of
- data from the source stream, and any arguments passed in
- 'args'. It should transform the data as desired, and return a
- new list of rdata, which will be inserted into the destination
- stream.
- """
- if args is None:
- args = []
- extractor = Client(self.src.url).stream_extract
- inserter = Client(self.dest.url).stream_insert_context
-
- # Parse input data. We use homogenous types for now, which
- # means the timestamp type will be either float or int.
- if "int" in self.src.layout_type:
- parser = lambda line: [ int(x) for x in line.split() ]
- else:
- parser = lambda line: [ float(x) for x in line.split() ]
-
- # Format output data.
- formatter = lambda row: " ".join([repr(x) for x in row]) + "\n"
-
- for interval in self.intervals():
- print "Processing", self.interval_string(interval)
- with inserter(self.dest.path,
- interval.start, interval.end) as insert_ctx:
- src_array = []
- for line in extractor(self.src.path,
- interval.start, interval.end):
- # Read in data
- src_array.append([ float(x) for x in line.split() ])
-
- if len(src_array) == rows:
- # Pass through filter function
- dest_array = function(src_array, *args)
-
- # Write result to destination
- out = [ formatter(row) for row in dest_array ]
- insert_ctx.insert("".join(out))
-
- # Clear source array
- src_array = []
-
- # Take care of partial chunk
- if len(src_array) and partial:
- dest_array = function(src_array, *args)
- out = [ formatter(row) for row in dest_array ]
- insert_ctx.insert("".join(out))
-
- # Like process_python, but provides Numpy arrays and allows for
- # partial processing.
- def process_numpy(self, function, args = None, rows = 100000):
- """For all intervals that exist in self.src but don't exist in
- self.dest, call 'function' with a Numpy array corresponding to
- the data. The data is converted to a Numpy array in chunks of
- 'rows' rows at a time.
-
- 'function' should be defined as:
- def function(data, interval, args, insert_func, final)
-
- 'data': array of data to process -- may be empty
-
- 'interval': overall interval we're processing (but not necessarily
- the interval of this particular chunk of data)
-
- 'args': opaque arguments passed to process_numpy
-
- 'insert_func': function to call in order to insert array of data.
- Should be passed a 2-dimensional array of data to insert.
- Data timestamps must be within the provided interval.
-
- 'final': True if this is the last bit of data for this
- contiguous interval, False otherwise.
-
- Return value of 'function' is the number of data rows processed.
- Unprocessed data will be provided again in a subsequent call
- (unless 'final' is True).
- """
- if args is None:
- args = []
- extractor = Client(self.src.url).stream_extract
- inserter = Client(self.dest.url).stream_insert_context
-
- # Format output data.
- formatter = lambda row: " ".join([repr(x) for x in row]) + "\n"
-
- def batch(iterable, size):
- c = itertools.count()
- for k, g in itertools.groupby(iterable, lambda x: c.next() // size):
- yield g
-
- for interval in self.intervals():
- print "Processing", self.interval_string(interval)
- with inserter(self.dest.path,
- interval.start, interval.end) as insert_ctx:
- def insert_function(array):
- s = cStringIO.StringIO()
- if len(np.shape(array)) != 2:
- raise Exception("array must be 2-dimensional")
- np.savetxt(s, array)
- insert_ctx.insert(s.getvalue())
-
- extract = extractor(self.src.path, interval.start, interval.end)
- old_array = np.array([])
- for batched in batch(extract, rows):
- # Read in this batch of data
- new_array = np.loadtxt(batched)
-
- # If we still had old data left, combine it
- if old_array.shape[0] != 0:
- array = np.vstack((old_array, new_array))
- else:
- array = new_array
-
- # Pass it to the process function
- processed = function(array, interval, args,
- insert_function, False)
-
- # Save the unprocessed parts
- if processed > 0:
- old_array = array[processed:]
- else:
- old_array = array
-
- # Last call for this contiguous interval
- if old_array.shape[0] != 0:
- function(old_array, interval, args, insert_function, True)
-
- def main():
- # This is just a dummy function; actual filters can use the other
- # functions to prepare stuff, and then do something with the data.
- f = Filter()
- parser = f.setup_parser()
- args = f.parse_args()
- for i in f.intervals():
- print "Generic filter: need to handle", f.interval_string(i)
-
- if __name__ == "__main__":
- main()
|