|
- #!/usr/bin/python
-
- from __future__ import absolute_import
-
- import nilmdb.client
- from nilmdb.client import Client
- from nilmdb.client.numpyclient import NumpyClient
- from nilmdb.utils.printf import *
- from nilmdb.utils.time import (parse_time, timestamp_to_human,
- timestamp_to_seconds)
- from nilmdb.utils.interval import Interval
-
- import nilmtools
-
- import itertools
- import time
- import sys
- import re
- import argparse
- import numpy as np
- import cStringIO
-
class MissingDestination(Exception):
    """Raised when the requested destination stream does not exist.

    The parsed command-line arguments and the source/destination
    StreamInfo objects are attached so that the calling script can,
    for example, offer to create the missing stream.
    """
    def __init__(self, args, src, dest):
        self.parsed_args = args
        self.src = src
        self.dest = dest
        message = "destination path " + dest.path + " not found"
        Exception.__init__(self, message)
-
- class StreamInfo(object):
- def __init__(self, url, info):
- self.url = url
- self.info = info
- try:
- self.path = info[0]
- self.layout = info[1]
- self.layout_type = self.layout.split('_')[0]
- self.layout_count = int(self.layout.split('_')[1])
- self.total_count = self.layout_count + 1
- self.timestamp_min = info[2]
- self.timestamp_max = info[3]
- self.rows = info[4]
- self.seconds = nilmdb.utils.time.timestamp_to_seconds(info[5])
- except IndexError, TypeError:
- pass
-
- def string(self, interhost):
- """Return stream info as a string. If interhost is true,
- include the host URL."""
- if interhost:
- return sprintf("[%s] ", self.url) + str(self)
- return str(self)
-
- def __str__(self):
- """Return stream info as a string."""
- return sprintf("%s (%s), %.2fM rows, %.2f hours",
- self.path, self.layout, self.rows / 1e6,
- self.seconds / 3600.0)
-
def get_stream_info(client, path):
    """Return a StreamInfo object about the given path, or None if it
    doesn't exist"""
    matches = client.stream_list(path, extended = True)
    if len(matches) == 1:
        return StreamInfo(client.geturl(), matches[0])
    return None
-
class Filter(object):
    """Framework for NilmDB filter scripts.

    A Filter connects a source stream to a destination stream (possibly
    on different servers), computes which time intervals exist in the
    source but are not yet present in the destination, and hands that
    data to a user-supplied processing function (see process_numpy).
    """

    def __init__(self, parser_description = None):
        # State below is populated by setup_parser() / parse_args().
        self._parser = None
        self._client_src = None
        self._client_dest = None
        # True while intervals() is iterating; guards the shared clients
        # against reentrant use (see client_src / client_dest).
        self._using_client = False
        self.src = None          # StreamInfo for the source stream
        self.dest = None         # StreamInfo for the destination stream
        self.start = None        # optional starting timestamp limit
        self.end = None          # optional ending timestamp limit
        self.interhost = False   # True if src and dest are on different servers
        self.force_metadata = False
        # Convenience shortcut: passing a description builds the parser
        # and parses sys.argv immediately.
        if parser_description is not None:
            self.setup_parser(parser_description)
            self.parse_args()

    @property
    def client_src(self):
        # Refuse to hand out the shared source client while intervals()
        # is mid-iteration; callers should construct their own Client.
        if self._using_client:
            raise Exception("Filter client is in use; make another")
        return self._client_src

    @property
    def client_dest(self):
        # Same guard as client_src, for the destination client.
        if self._using_client:
            raise Exception("Filter client is in use; make another")
        return self._client_dest

    def setup_parser(self, description = "Filter data", skip_paths = False):
        """Build and return the argparse parser with the standard filter
        options (-u/-U server URLs, -D dry run, --force-metadata, -s/-e
        time limits).  With skip_paths=True the positional srcpath and
        destpath arguments are omitted so callers can add their own
        variants; those argument names must still exist by the time
        parse_args() runs."""
        parser = argparse.ArgumentParser(
            formatter_class = argparse.RawDescriptionHelpFormatter,
            version = nilmtools.__version__,
            description = description)
        group = parser.add_argument_group("General filter arguments")
        group.add_argument("-u", "--url", action="store",
                           default="http://localhost/nilmdb/",
                           help="Server URL (default: %(default)s)")
        group.add_argument("-U", "--dest-url", action="store",
                           help="Destination server URL "
                           "(default: same as source)")
        group.add_argument("-D", "--dry-run", action="store_true",
                           default = False,
                           help="Just print intervals that would be "
                           "processed")
        group.add_argument("--force-metadata", action="store_true",
                           default = False,
                           help="Force metadata changes if the dest "
                           "doesn't match")
        group.add_argument("-s", "--start",
                           metavar="TIME", type=self.arg_time,
                           help="Starting timestamp for intervals "
                           "(free-form, inclusive)")
        group.add_argument("-e", "--end",
                           metavar="TIME", type=self.arg_time,
                           help="Ending timestamp for intervals "
                           "(free-form, noninclusive)")
        if not skip_paths:
            # Individual filter scripts might want to add these arguments
            # themselves, to include multiple sources in a different order
            # (for example). "srcpath" and "destpath" arguments must exist,
            # though.
            group.add_argument("srcpath", action="store",
                               help="Path of source stream, e.g. /foo/bar")
            group.add_argument("destpath", action="store",
                               help="Path of destination stream, e.g. /foo/bar")
        self._parser = parser
        return parser

    def interval_string(self, interval):
        """Format an interval's endpoints as a human-readable string."""
        return sprintf("[ %s -> %s ]",
                       timestamp_to_human(interval.start),
                       timestamp_to_human(interval.end))

    def parse_args(self, argv = None):
        """Parse arguments, open clients for both servers, look up and
        print the source and destination streams, and store the options
        on self.  Raises MissingDestination if the destination stream
        does not exist; exits (SystemExit) after printing intervals if
        --dry-run was given.  Returns the parsed argument namespace."""
        args = self._parser.parse_args(argv)

        # A missing destination URL means "same server as the source".
        if args.dest_url is None:
            args.dest_url = args.url
        if args.url != args.dest_url:
            self.interhost = True

        self._client_src = Client(args.url)
        self._client_dest = Client(args.dest_url)

        # Same-server, same-path would make the filter read its own output.
        if (not self.interhost) and (args.srcpath == args.destpath):
            self._parser.error("source and destination path must be different")

        # Open and print info about the streams
        self.src = get_stream_info(self._client_src, args.srcpath)
        if not self.src:
            self._parser.error("source path " + args.srcpath + " not found")

        self.dest = get_stream_info(self._client_dest, args.destpath)
        if not self.dest:
            # Caller gets a partially-filled StreamInfo (just the path)
            # so it can offer to create the destination stream.
            raise MissingDestination(args, self.src,
                                     StreamInfo(args.dest_url, [args.destpath]))

        print "Source:", self.src.string(self.interhost)
        print "  Dest:", self.dest.string(self.interhost)

        if args.dry_run:
            for interval in self.intervals():
                print self.interval_string(interval)
            raise SystemExit(0)

        self.force_metadata = args.force_metadata

        self.start = args.start
        self.end = args.end

        return args

    def _optimize_int(self, it):
        """Join and yield adjacent intervals from the iterator 'it'"""
        saved_int = None
        for interval in it:
            if saved_int is not None:
                if saved_int.end == interval.start:
                    # Adjacent: extend the current interval backwards
                    # instead of yielding two pieces.
                    interval.start = saved_int.start
                else:
                    yield saved_int
            saved_int = interval
        if saved_int is not None:
            yield saved_int

    def intervals(self):
        """Generate all the intervals that this filter should process:
        intervals present in the source stream but absent from the
        destination, clipped to [self.start, self.end).  The shared
        clients are marked busy for the duration of iteration."""
        self._using_client = True

        if self.interhost:
            # Do the difference ourselves
            s_intervals = ( Interval(start, end)
                            for (start, end) in
                            self._client_src.stream_intervals(
                                self.src.path,
                                start = self.start, end = self.end) )
            d_intervals = ( Interval(start, end)
                            for (start, end) in
                            self._client_dest.stream_intervals(
                                self.dest.path,
                                start = self.start, end = self.end) )
            intervals = nilmdb.utils.interval.set_difference(s_intervals,
                                                             d_intervals)
        else:
            # Let the server do the difference for us
            intervals = ( Interval(start, end)
                          for (start, end) in
                          self._client_src.stream_intervals(
                              self.src.path, diffpath = self.dest.path,
                              start = self.start, end = self.end) )
        # Optimize intervals: join intervals that are adjacent
        for interval in self._optimize_int(intervals):
            yield interval
        self._using_client = False

    # Misc helpers
    def arg_time(self, toparse):
        """Parse a time string argument"""
        try:
            return nilmdb.utils.time.parse_time(toparse)
        except ValueError as e:
            # Re-raise as the argparse-specific type so the parser can
            # report it as a normal usage error.
            raise argparse.ArgumentTypeError(sprintf("%s \"%s\"",
                                                     str(e), toparse))

    def check_dest_metadata(self, data):
        """See if the metadata jives, and complain if it doesn't.  If
        there's no conflict, update the metadata to match 'data'."""
        metadata = self._client_dest.stream_get_metadata(self.dest.path)
        if not self.force_metadata:
            for key in data:
                wanted = str(data[key])
                # Missing keys default to 'wanted', i.e. no conflict.
                val = metadata.get(key, wanted)
                # Only an existing, non-empty stream with different
                # metadata is treated as a conflict.
                if val != wanted and self.dest.rows > 0:
                    m = "Metadata in destination stream:\n"
                    m += "  %s = %s\n" % (key, val)
                    m += "doesn't match desired data:\n"
                    m += "  %s = %s\n" % (key, wanted)
                    m += "Refusing to change it.  To prevent this error, "
                    m += "change or delete the metadata with nilmtool,\n"
                    m += "remove existing data from the stream, or "
                    m += "retry with --force-metadata."
                    raise Exception(m)
        # All good -- write the metadata in case it's not already there
        self._client_dest.stream_update_metadata(self.dest.path, data)

    # The main filter processing method.
    def process_numpy(self, function, args = None, rows = 100000):
        """For all intervals that exist in self.src but don't exist in
        self.dest, call 'function' with a Numpy array corresponding to
        the data.  The data is converted to a Numpy array in chunks of
        'rows' rows at a time.

        'function' should be defined as:
        def function(data, interval, args, insert_func, final)

        'data': array of data to process -- may be empty

        'interval': overall interval we're processing (but not necessarily
        the interval of this particular chunk of data)

        'args': opaque arguments passed to process_numpy

        'insert_func': function to call in order to insert array of data.
        Should be passed a 2-dimensional array of data to insert.
        Data timestamps must be within the provided interval.

        'final': True if this is the last bit of data for this
        contiguous interval, False otherwise.

        Return value of 'function' is the number of data rows processed.
        Unprocessed data will be provided again in a subsequent call
        (unless 'final' is True).

        If unprocessed data remains after 'final' is True, the interval
        being inserted will be ended at the timestamp of the first
        unprocessed data point.
        """
        if args is None:
            args = []
        # Fresh clients: the shared ones are busy inside self.intervals().
        extractor = NumpyClient(self.src.url).stream_extract_numpy
        inserter = NumpyClient(self.dest.url).stream_insert_numpy_context

        for interval in self.intervals():
            print "Processing", self.interval_string(interval)
            with inserter(self.dest.path,
                          interval.start, interval.end) as insert_ctx:
                insert_function = insert_ctx.insert
                old_array = np.array([])
                for new_array in extractor(self.src.path,
                                           interval.start, interval.end,
                                           layout = self.src.layout,
                                           maxrows = rows):
                    # If we still had old data left, combine it
                    if old_array.shape[0] != 0:
                        array = np.vstack((old_array, new_array))
                    else:
                        array = new_array

                    # Pass it to the process function
                    processed = function(array, interval, args,
                                         insert_function, False)

                    # Send any pending data
                    insert_ctx.send()

                    # Save the unprocessed parts
                    if processed >= 0:
                        old_array = array[processed:]
                    else:
                        raise Exception(
                            sprintf("%s return value %s must be >= 0",
                                    str(function), str(processed)))

                    # Warn if there's too much data remaining
                    if old_array.shape[0] > 3 * rows:
                        printf("warning: %d unprocessed rows in buffer\n",
                               old_array.shape[0])

                # Last call for this contiguous interval
                if old_array.shape[0] != 0:
                    processed = function(old_array, interval, args,
                                         insert_function, True)
                    if processed != old_array.shape[0]:
                        # Truncate the interval we're inserting at the first
                        # unprocessed data point.  This ensures that
                        # we'll not miss any data when we run again later.
                        # (column 0 is presumably the timestamp -- consistent
                        # with StreamInfo.total_count = layout_count + 1)
                        insert_ctx.update_end(old_array[processed][0])
-
- def main(argv = None):
- # This is just a dummy function; actual filters can use the other
- # functions to prepare stuff, and then do something with the data.
- f = Filter()
- parser = f.setup_parser()
- args = f.parse_args(argv)
- for i in f.intervals():
- print "Generic filter: need to handle", f.interval_string(i)
-
# Allow this module to be run directly as a (no-op) generic filter.
if __name__ == "__main__":
    main()
|