|
- # -*- coding: utf-8 -*-
-
- """Provide a NumpyClient class that is based on normal Client, but has
- additional methods for extracting and inserting data via Numpy arrays."""
-
- import nilmdb.utils
- import nilmdb.client.client
- import nilmdb.client.httpclient
- from nilmdb.client.errors import ClientError
-
- import contextlib
- from nilmdb.utils.time import timestamp_to_string, string_to_timestamp
-
- import numpy
- import io
-
- def layout_to_dtype(layout):
- ltype = layout.split('_')[0]
- lcount = int(layout.split('_')[1])
- if ltype.startswith('int'):
- atype = '<i' + str(int(ltype[3:]) // 8)
- elif ltype.startswith('uint'):
- atype = '<u' + str(int(ltype[4:]) // 8)
- elif ltype.startswith('float'):
- atype = '<f' + str(int(ltype[5:]) // 8)
- else:
- raise ValueError("bad layout")
- if lcount == 1:
- dtype = [('timestamp', '<i8'), ('data', atype)]
- else:
- dtype = [('timestamp', '<i8'), ('data', atype, lcount)]
- return numpy.dtype(dtype)
-
- class NumpyClient(nilmdb.client.client.Client):
- """Subclass of nilmdb.client.Client that adds additional methods for
- extracting and inserting data via Numpy arrays."""
-
- def _get_dtype(self, path, layout):
- if layout is None:
- streams = self.stream_list(path)
- if len(streams) != 1:
- raise ClientError("can't get layout for path: " + path)
- layout = streams[0][1]
- return layout_to_dtype(layout)
-
- def stream_extract_numpy(self, path, start = None, end = None,
- layout = None, maxrows = 100000,
- structured = False):
- """
- Extract data from a stream. Returns a generator that yields
- Numpy arrays of up to 'maxrows' of data each.
-
- If 'layout' is None, it is read using stream_info.
-
- If 'structured' is False, all data is converted to float64
- and returned in a flat 2D array. Otherwise, data is returned
- as a structured dtype in a 1D array.
- """
- dtype = self._get_dtype(path, layout)
-
- def to_numpy(data):
- a = numpy.fromstring(data, dtype)
- if structured:
- return a
- return numpy.c_[a['timestamp'], a['data']]
-
- chunks = []
- total_len = 0
- maxsize = dtype.itemsize * maxrows
- for data in self.stream_extract(path, start, end, binary = True):
- # Add this block of binary data
- chunks.append(data)
- total_len += len(data)
-
- # See if we have enough to make the requested Numpy array
- while total_len >= maxsize:
- assembled = b"".join(chunks)
- total_len -= maxsize
- chunks = [ assembled[maxsize:] ]
- block = assembled[:maxsize]
- yield to_numpy(block)
-
- if total_len:
- yield to_numpy(b"".join(chunks))
-
- @contextlib.contextmanager
- def stream_insert_numpy_context(self, path, start = None, end = None,
- layout = None):
- """Return a context manager that allows data to be efficiently
- inserted into a stream in a piecewise manner. Data is
- provided as Numpy arrays, and is aggregated and sent to the
- server in larger or smaller chunks as necessary. Data format
- must match the database layout for the given path.
-
- For more details, see help for
- nilmdb.client.numpyclient.StreamInserterNumpy
-
- If 'layout' is not None, use it as the layout rather than
- querying the database.
- """
- dtype = self._get_dtype(path, layout)
- ctx = StreamInserterNumpy(self, path, start, end, dtype)
- yield ctx
- ctx.finalize()
- ctx.destroy()
-
- def stream_insert_numpy(self, path, data, start = None, end = None,
- layout = None):
- """Insert data into a stream. data should be a Numpy array
- which will be passed through stream_insert_numpy_context to
- break it into chunks etc. See the help for that function
- for details."""
- with self.stream_insert_numpy_context(path, start, end, layout) as ctx:
- if isinstance(data, numpy.ndarray):
- ctx.insert(data)
- else:
- for chunk in data:
- ctx.insert(chunk)
- return ctx.last_response
-
- class StreamInserterNumpy(nilmdb.client.client.StreamInserter):
- """Object returned by stream_insert_numpy_context() that manages
- the insertion of rows of data into a particular path.
-
- See help for nilmdb.client.client.StreamInserter for details.
- The only difference is that, instead of ASCII formatted data,
- this context manager can take Numpy arrays, which are either
- structured (1D with complex dtype) or flat (2D with simple dtype).
- """
-
- # Soft limit of how many bytes to send per HTTP request.
- _max_data = 2 * 1024 * 1024
-
- def __init__(self, client, path, start, end, dtype):
- """
- 'client' is the client object. 'path' is the database path
- to insert to. 'start' and 'end' are used for the first
- contiguous interval and may be None. 'dtype' is the Numpy
- dtype for this stream.
- """
- super(StreamInserterNumpy, self).__init__(client, path, start, end)
- self._dtype = dtype
-
- # Max rows to send at once
- self._max_rows = self._max_data // self._dtype.itemsize
-
- # List of the current arrays we're building up to send
- self._block_arrays = []
- self._block_rows = 0
-
- def insert(self, array):
- """Insert Numpy data, which must match the layout type."""
- if type(array) != numpy.ndarray:
- array = numpy.array(array)
- if array.ndim == 1:
- # Already a structured array; just verify the type
- if array.dtype != self._dtype:
- raise ValueError("wrong dtype for 1D (structured) array")
- elif array.ndim == 2:
- # Convert to structured array
- sarray = numpy.zeros(array.shape[0], dtype=self._dtype)
- try:
- sarray['timestamp'] = array[:,0]
- # Need the squeeze in case sarray['data'] is 1 dimensional
- sarray['data'] = numpy.squeeze(array[:,1:])
- except (IndexError, ValueError):
- raise ValueError("wrong number of fields for this data type")
- array = sarray
- else:
- raise ValueError("wrong number of dimensions in array")
-
- length = len(array)
- maxrows = self._max_rows
-
- if length == 0:
- return
- if length > maxrows:
- # This is more than twice what we wanted to send, so split
- # it up. This is a bit inefficient, but the user really
- # shouldn't be providing this much data at once.
- for cut in range(0, length, maxrows):
- self.insert(array[cut:(cut + maxrows)])
- return
-
- # Add this array to our list
- self._block_arrays.append(array)
- self._block_rows += length
-
- # Send if it's too long
- if self._block_rows >= maxrows:
- self._send_block(final = False)
-
- def _send_block(self, final = False):
- """Send the data current stored up. One row might be left
- over if we need its timestamp saved."""
-
- # Build the full array to send
- if self._block_rows == 0:
- array = numpy.zeros(0, dtype = self._dtype)
- else:
- array = numpy.hstack(self._block_arrays)
-
- # Get starting timestamp
- start_ts = self._interval_start
- if start_ts is None:
- # Pull start from the first row
- try:
- start_ts = array['timestamp'][0]
- except IndexError:
- pass # no timestamp is OK, if we have no data
-
- # Get ending timestamp
- if final:
- # For a final block, the timestamp is either the
- # user-provided end, or the timestamp of the last line
- # plus epsilon.
- end_ts = self._interval_end
- if end_ts is None:
- try:
- end_ts = array['timestamp'][-1]
- end_ts += nilmdb.utils.time.epsilon
- except IndexError:
- pass # no timestamp is OK, if we have no data
- self._block_arrays = []
- self._block_rows = 0
-
- # Next block is completely fresh
- self._interval_start = None
- self._interval_end = None
- else:
- # An intermediate block. We need to save the last row
- # for the next block, and use its timestamp as the ending
- # timestamp for this one.
- if len(array) < 2:
- # Not enough data to send an intermediate block
- return
- end_ts = array['timestamp'][-1]
- if self._interval_end is not None and end_ts > self._interval_end:
- # User gave us bad endpoints; send it anyway, and let
- # the server complain so that the error is the same
- # as if we hadn't done this chunking.
- end_ts = self._interval_end
- self._block_arrays = [ array[-1:] ]
- self._block_rows = 1
- array = array[:-1]
-
- # Next block continues where this one ended
- self._interval_start = end_ts
-
- # If we have no endpoints, or equal endpoints, it's OK as long
- # as there's no data to send
- if (start_ts is None or end_ts is None) or (start_ts == end_ts):
- if len(array) == 0:
- return
- raise ClientError("have data to send, but invalid start/end times")
-
- # Send it
- data = array.tostring()
- self.last_response = self._client.stream_insert_block(
- self._path, data, start_ts, end_ts, binary = True)
-
- return
|