Merge branch 'binary' into HEAD

Numpy is required for tests now, due to nilmdb.client.numpyclient
Still allow installation without it, though.
2013-04-07 18:08:10 -04:00 · 2013-04-07 18:05:43 -04:00 · 2013-04-07 17:43:52 -04:00 · 2013-04-07 16:06:52 -04:00 · 2013-04-06 19:12:17 -04:00
13 changed files with 301 additions and 15 deletions
--- a/nilmdb/client/client.py
+++ b/nilmdb/client/client.py
@@ -178,7 +178,7 @@ class Client(object):
        return self.http.get_gen("stream/intervals", params)

    def stream_extract(self, path, start = None, end = None,
-                       count = False, markup = False):
+                       count = False, markup = False, binary = False):
        """
        Extract data from a stream.  Returns a generator that yields
        lines of ASCII-formatted data that matches the database
@@ -189,6 +189,11 @@ class Client(object):

        Specify markup = True to include comments in the returned data
        that indicate interval starts and ends.
+
+        Specify binary = True to return chunks of raw binary data,
+        rather than lines of ASCII-formatted data.  Raw binary data
+        is always little-endian and matches the database types
+        (including a uint64 timestamp).
        """
        params = {
            "path": path,
@@ -201,7 +206,9 @@ class Client(object):
            params["count"] = 1
        if markup:
            params["markup"] = 1
-        return self.http.get_gen("stream/extract", params)
+        if binary:
+            params["binary"] = 1
+        return self.http.get_gen("stream/extract", params, binary = binary)

    def stream_count(self, path, start = None, end = None):
        """
--- a/nilmdb/client/httpclient.py
+++ b/nilmdb/client/httpclient.py
@@ -110,7 +110,8 @@ class HTTPClient(object):
        return self._req("PUT", url, params, data)

    # Generator versions that return data one line at a time.
-    def _req_gen(self, method, url, query = None, body = None, headers = None):
+    def _req_gen(self, method, url, query = None, body = None,
+                 headers = None, binary = False):
        """
        Make a request and return a generator that gives back strings
        or JSON decoded lines of the body data, or raise an error if
@@ -118,16 +119,19 @@ class HTTPClient(object):
        """
        (response, isjson) = self._do_req(method, url, query, body,
                                          stream = True, headers = headers)
-        if isjson:
+        if binary:
+            for chunk in response.iter_content(chunk_size = 65536):
+                yield chunk
+        elif isjson:
            for line in response.iter_lines():
                yield json.loads(line)
        else:
            for line in response.iter_lines():
                yield line

-    def get_gen(self, url, params = None):
+    def get_gen(self, url, params = None, binary = False):
        """Simple GET (parameters in URL) returning a generator"""
-        return self._req_gen("GET", url, params)
+        return self._req_gen("GET", url, params, binary = binary)

    # Not much use for a POST or PUT generator, since they don't
    # return much data.
--- a/nilmdb/client/numpyclient.py
+++ b/nilmdb/client/numpyclient.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+"""Provide a NumpyClient class that is based on normal Client, but has
+additional methods for extracting and inserting data via Numpy arrays."""
+
+import nilmdb.utils
+import nilmdb.client.client
+import nilmdb.client.httpclient
+from nilmdb.client.errors import ClientError
+
+import contextlib
+from nilmdb.utils.time import timestamp_to_string, string_to_timestamp
+
+import numpy
+import cStringIO
+
+def layout_to_dtype(layout):
+    ltype = layout.split('_')[0]
+    lcount = int(layout.split('_')[1])
+    if ltype.startswith('int'):
+        atype = '<i' + str(int(ltype[3:]) / 8)
+    elif ltype.startswith('uint'):
+        atype = '<u' + str(int(ltype[4:]) / 8)
+    elif ltype.startswith('float'):
+        atype = '<f' + str(int(ltype[5:]) / 8)
+    else:
+        raise ValueError("bad layout")
+    return numpy.dtype([('timestamp', '<i8'), ('data', atype, lcount)])
+
+class NumpyClient(nilmdb.client.client.Client):
+    """Subclass of nilmdb.client.Client that adds additional methods for
+    extracting and inserting data via Numpy arrays."""
+
+    def stream_extract_numpy(self, path, start = None, end = None,
+                             layout = None, maxrows = 100000,
+                             structured = False):
+        """
+        Extract data from a stream.  Returns a generator that yields
+        Numpy arrays of up to 'maxrows' of data each.
+
+        If 'layout' is None, it is read using stream_info.
+
+        If 'structured' is False, all data is converted to float64
+        and returned in a flat 2D array.  Otherwise, data is returned
+        as a structured dtype in a 1D array.
+        """
+        if layout is None:
+            streams = self.stream_list(path)
+            if len(streams) != 1:
+                raise ClientError("can't get layout for path: " + path)
+            layout = streams[0][1]
+        dtype = layout_to_dtype(layout)
+
+        def to_numpy(data):
+            a = numpy.fromstring(data, dtype)
+            if structured:
+                return a
+            return numpy.c_[a['timestamp'], a['data']]
+
+        chunks = []
+        total_len = 0
+        maxsize = dtype.itemsize * maxrows
+        for data in self.stream_extract(path, start, end, binary = True):
+            # Add this block of binary data
+            chunks.append(data)
+            total_len += len(data)
+
+            # See if we have enough to make the requested Numpy array
+            while total_len >= maxsize:
+                assembled = "".join(chunks)
+                total_len -= maxsize
+                chunks = [ assembled[maxsize:] ]
+                block = assembled[:maxsize]
+                yield to_numpy(block)
+
+        if total_len:
+            yield to_numpy("".join(chunks))
--- a/nilmdb/server/bulkdata.py
+++ b/nilmdb/server/bulkdata.py
@@ -479,7 +479,7 @@ class Table(object):
            # Success, so update self.nrows accordingly
            self.nrows = tot_rows

-    def get_data(self, start, stop):
+    def get_data(self, start, stop, binary = False):
        """Extract data corresponding to Python range [n:m],
        and returns a formatted string"""
        if (start is None or
@@ -497,10 +497,13 @@ class Table(object):
            if count > remaining:
                count = remaining
            f = self.file_open(subdir, filename)
-            ret.append(f.extract_string(offset, count))
+            if binary:
+                ret.append(f.extract_binary(offset, count))
+            else:
+                ret.append(f.extract_string(offset, count))
            remaining -= count
            row += count
-        return "".join(ret)
+        return b"".join(ret)

    def __getitem__(self, row):
        """Extract timestamps from a row, with table[n] notation."""
--- a/nilmdb/server/nilmdb.py
+++ b/nilmdb/server/nilmdb.py
@@ -538,7 +538,7 @@ class NilmDB(object):
                                  dbinterval.db_endpos)

    def stream_extract(self, path, start = None, end = None,
-                       count = False, markup = False):
+                       count = False, markup = False, binary = False):
        """
        Returns (data, restart) tuple.

@@ -559,6 +559,9 @@ class NilmDB(object):
        'markup', if true, indicates that returned data should be
        marked with a comment denoting when a particular interval
        starts, and another comment when an interval ends.
+
+        'binary', if true, means to return raw binary rather than
+        ASCII-formatted data.
        """
        stream_id = self._stream_id(path)
        table = self.data.getnode(path)
@@ -569,6 +572,8 @@ class NilmDB(object):
        matched = 0
        remaining = self.max_results
        restart = None
+        if binary and (markup or count):
+            raise NilmDBError("binary mode can't be used with markup or count")
        for interval in intervals.intersection(requested):
            # Reading single rows from the table is too slow, so
            # we use two bisections to find both the starting and
@@ -593,7 +598,7 @@ class NilmDB(object):
                              timestamp_to_string(interval.start) + "\n")

            # Gather these results up
-            result.append(table.get_data(row_start, row_end))
+            result.append(table.get_data(row_start, row_end, binary))

            # Count them
            remaining -= row_end - row_start
--- a/nilmdb/server/rocket.c
+++ b/nilmdb/server/rocket.c
@@ -527,6 +527,46 @@ err:
 	return NULL;
 }

+/****
+ * Extract to binary string containing raw little-endian binary data
+ */
+static PyObject *Rocket_extract_binary(Rocket *self, PyObject *args)
+{
+	long count;
+	long offset;
+
+	if (!PyArg_ParseTuple(args, "ll", &offset, &count))
+		return NULL;
+	if (!self->file) {
+		PyErr_SetString(PyExc_Exception, "no file");
+		return NULL;
+	}
+	/* Seek to target location */
+	if (fseek(self->file, offset, SEEK_SET) < 0) {
+		PyErr_SetFromErrno(PyExc_OSError);
+		return NULL;
+	}
+
+        uint8_t *str;
+        int len = count * self->binary_size;
+        str = malloc(len);
+        if (str == NULL) {
+                PyErr_SetFromErrno(PyExc_OSError);
+                return NULL;
+        }
+
+        /* Data in the file is already in the desired little-endian
+           binary format, so just read it directly. */
+        if (fread(str, self->binary_size, count, self->file) != count) {
+                free(str);
+                PyErr_SetFromErrno(PyExc_OSError);
+                return NULL;
+        }
+
+	PyObject *pystr = PyBytes_FromStringAndSize((char *)str, len);
+	free(str);
+	return pystr;
+}

 /****
 * Extract timestamp
@@ -600,6 +640,12 @@ static PyMethodDef Rocket_methods[] = {
 	  "Extract count rows of data from the file at offset offset.\n"
 	  "Return an ascii formatted string according to the layout" },

+	{ "extract_binary",
+	  (PyCFunction)Rocket_extract_binary, METH_VARARGS,
+	  "extract_binary(self, offset, count)\n\n"
+	  "Extract count rows of data from the file at offset offset.\n"
+	  "Return a raw binary string of data matching the data layout." },
+
 	{ "extract_timestamp",
 	  (PyCFunction)Rocket_extract_timestamp, METH_VARARGS,
 	  "extract_timestamp(self, offset)\n\n"
--- a/nilmdb/server/server.py
+++ b/nilmdb/server/server.py
@@ -400,7 +400,7 @@ class Stream(NilmApp):
    @chunked_response
    @response_type("text/plain")
    def extract(self, path, start = None, end = None,
-                count = False, markup = False):
+                count = False, markup = False, binary = False):
        """
        Extract data from backend database.  Streams the resulting
        entries as ASCII text lines separated by newlines.  This may
@@ -411,6 +411,11 @@ class Stream(NilmApp):

        If 'markup' is True, adds comments to the stream denoting each
        interval's start and end timestamp.
+
+        If 'binary' is True, return raw binary data, rather than lines
+        of ASCII-formatted data.  Raw binary data is always
+        little-endian and matches the database types (including a
+        uint64 timestamp).
        """
        (start, end) = self._get_times(start, end)

@@ -418,6 +423,13 @@ class Stream(NilmApp):
        if len(self.db.stream_list(path = path)) != 1:
            raise cherrypy.HTTPError("404", "No such stream: " + path)

+        if binary:
+            cherrypy.response.headers['Content-Type'] = (
+                "application/octet-stream")
+            if markup or count:
+                raise cherrypy.HTTPError("400", "can't mix binary and "
+                                         "markup or count modes")
+
        @workaround_cp_bug_1200
        def content(start, end):
            # Note: disable chunked responses to see tracebacks from here.
@@ -429,7 +441,8 @@ class Stream(NilmApp):

            while True:
                (data, restart) = self.db.stream_extract(
-                    path, start, end, count = False, markup = markup)
+                    path, start, end, count = False,
+                    markup = markup, binary = binary)
                yield data

                if restart is None:
--- a/nilmdb/utils/time.py
+++ b/nilmdb/utils/time.py
@@ -70,7 +70,7 @@ def parse_time(toparse):
    try:
        if toparse[0] == '@':
            return int(toparse[1:])
-    except (ValueError, KeyError):
+    except (ValueError, KeyError, IndexError):
        pass

    # If string isn't "now" and doesn't contain at least 4 digits,
--- a/setup.py
+++ b/setup.py
@@ -107,6 +107,7 @@ setup(name='nilmdb',
      author_email = 'jim@jtan.com',
      tests_require = [ 'nose',
                        'coverage',
+                        'numpy',
                        ],
      setup_requires = [ 'distribute',
                         ],
--- a/tests/test.order
+++ b/tests/test.order
@@ -12,6 +12,7 @@ test_interval.py
 test_bulkdata.py
 test_nilmdb.py
 test_client.py
+test_numpyclient.py
 test_cmdline.py

 test_*.py
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -23,6 +23,7 @@ import warnings
 import resource
 import time
 import re
+import struct

 from testutil.helpers import *

@@ -293,6 +294,23 @@ class TestClient(object):
        # Test count
        eq_(client.stream_count("/newton/prep"), 14400)

+        # Test binary output
+        with assert_raises(ClientError) as e:
+            list(client.stream_extract("/newton/prep",
+                                       markup = True, binary = True))
+        with assert_raises(ClientError) as e:
+            list(client.stream_extract("/newton/prep",
+                                       count = True, binary = True))
+        data = "".join(client.stream_extract("/newton/prep", binary = True))
+        # Quick check using struct
+        unpacker = struct.Struct("<qffffffff")
+        out = []
+        for i in range(14400):
+            out.append(unpacker.unpack_from(data, i * unpacker.size))
+        eq_(out[0], (1332511200000000, 266568.0, 224029.0, 5161.39990234375,
+                     2525.169921875, 8350.83984375, 3724.699951171875,
+                     1355.3399658203125, 2039.0))
+
        client.close()

    def test_client_06_generators(self):
--- a/tests/test_nilmdb.py
+++ b/tests/test_nilmdb.py
@@ -90,13 +90,16 @@ class Test00Nilmdb(object):  # named 00 so it runs first
        eq_(db.stream_get_metadata("/newton/prep"), meta1)
        eq_(db.stream_get_metadata("/newton/raw"), meta1)

-        # fill in some test coverage for start >= end
+        # fill in some misc. test coverage
        with assert_raises(nilmdb.server.NilmDBError):
            db.stream_remove("/newton/prep", 0, 0)
        with assert_raises(nilmdb.server.NilmDBError):
            db.stream_remove("/newton/prep", 1, 0)
        db.stream_remove("/newton/prep", 0, 1)

+        with assert_raises(nilmdb.server.NilmDBError):
+            db.stream_extract("/newton/prep", count = True, binary = True)
+
        db.close()

 class TestBlockingServer(object):
--- a/tests/test_numpyclient.py
+++ b/tests/test_numpyclient.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+import nilmdb.server
+import nilmdb.client
+import nilmdb.client.numpyclient
+
+from nilmdb.utils.printf import *
+from nilmdb.utils import timestamper
+from nilmdb.client import ClientError, ServerError
+from nilmdb.utils import datetime_tz
+
+from nose.plugins.skip import SkipTest
+from nose.tools import *
+from nose.tools import assert_raises
+import itertools
+import distutils.version
+
+from testutil.helpers import *
+
+import numpy as np
+
+testdb = "tests/numpyclient-testdb"
+testurl = "http://localhost:32180/"
+
+def setup_module():
+    global test_server, test_db
+    # Clear out DB
+    recursive_unlink(testdb)
+
+    # Start web app on a custom port
+    test_db = nilmdb.utils.serializer_proxy(nilmdb.server.NilmDB)(testdb)
+    test_server = nilmdb.server.Server(test_db, host = "127.0.0.1",
+                                       port = 32180, stoppable = False,
+                                       fast_shutdown = True,
+                                       force_traceback = True)
+    test_server.start(blocking = False)
+
+def teardown_module():
+    global test_server, test_db
+    # Close web app
+    test_server.stop()
+    test_db.close()
+
+class TestNumpyClient(object):
+
+    def test_numpyclient_01_basic(self):
+        # Test basic connection
+        client = nilmdb.client.numpyclient.NumpyClient(url = testurl)
+        version = client.version()
+        eq_(distutils.version.LooseVersion(version),
+            distutils.version.LooseVersion(test_server.version))
+
+        # Verify subclassing
+        assert(isinstance(client, nilmdb.client.Client))
+
+        # Layouts
+        for layout in "int8_t", "something_8", "integer_1":
+            with assert_raises(ValueError):
+                for x in client.stream_extract_numpy("/foo", layout=layout):
+                    pass
+        for layout in "int8_1", "uint8_30", "int16_20", "float64_100":
+            with assert_raises(ClientError) as e:
+                for x in client.stream_extract_numpy("/foo", layout=layout):
+                    pass
+            in_("No such stream", str(e.exception))
+
+        with assert_raises(ClientError) as e:
+            for x in client.stream_extract_numpy("/foo"):
+                pass
+        in_("can't get layout for path", str(e.exception))
+
+        client.close()
+
+    def test_numpyclient_02_extract(self):
+        client = nilmdb.client.numpyclient.NumpyClient(url = testurl)
+
+        # Insert some data as text
+        client.stream_create("/newton/prep", "float32_8")
+        testfile = "tests/data/prep-20120323T1000"
+        start = nilmdb.utils.time.parse_time("20120323T1000")
+        rate = 120
+        data = timestamper.TimestamperRate(testfile, start, rate)
+        result = client.stream_insert("/newton/prep", data,
+                                      start, start + 119999777)
+
+        # Extract Numpy arrays
+        array = None
+        pieces = 0
+        for chunk in client.stream_extract_numpy("/newton/prep", maxrows=1000):
+            pieces += 1
+            if array is not None:
+                array = np.vstack((array, chunk))
+            else:
+                array = chunk
+        eq_(array.shape, (14400, 9))
+        eq_(pieces, 15)
+
+        # Try structured
+        s = list(client.stream_extract_numpy("/newton/prep", structured = True))
+        assert(np.array_equal(np.c_[s[0]['timestamp'], s[0]['data']], array))
+
+        # Compare.  Will be close but not exact because the conversion
+        # to and from ASCII was lossy.
+        data = timestamper.TimestamperRate(testfile, start, rate)
+        actual = np.fromstring(" ".join(data), sep=' ').reshape(14400, 9)
+        assert(np.allclose(array, actual))
+
+        client.close()
Author	SHA1	Message	Date
Jim Paris	f0304b4c00	Merge branch 'binary' into HEAD	2013-04-07 18:08:10 -04:00
Jim Paris	60594ca58e	Numpy is required for tests now, due to nilmdb.client.numpyclient Still allow installation without it, though.	2013-04-07 18:05:43 -04:00
Jim Paris	c7f2df4abc	Add nilmdb.client.numpyclient.NumpyClient with stream_extract_numpy This is a subclass of nilmdb.client.client.Client that adds numpy specific routines, which should be a lot faster.	2013-04-07 17:43:52 -04:00
Jim Paris	5b7409f802	Add binary extract to client, server, nilmdb, bulkdata, and rocket.	2013-04-07 16:06:52 -04:00
Jim Paris	06038062a2	Fix error in time parsing	2013-04-06 19:12:17 -04:00