Make the whole nilmdb.layout parser Cython, and rewrite the parsing
bits so it's hopefully quite a bit faster now.

git-svn-id: https://bucket.mit.edu/svn/nilm/nilmdb@10798 ddd99763-3ecb-0310-9145-efcb8ce7c51f
2012-04-24 21:00:26 +00:00 · 2012-04-24 21:00:26 +00:00 · c07670ac3e
commit c07670ac3e
parent 37b4376b4c
4 changed files with 172 additions and 170 deletions
--- a/nilmdb/init.py
+++ b/nilmdb/init.py
@ -5,7 +5,9 @@ from .server import Server
 from .client import Client
 from .timer import Timer

+import pyximport; pyximport.install()
 import layout
+
 import serializer
 import cmdline
 import timestamper
--- a/nilmdb/layout.py
+++ b/nilmdb/layout.py
@ -1,143 +0,0 @@
-from __future__ import absolute_import
-import nilmdb
-from nilmdb.printf import *
-
-import tables
-import time
-import sys
-import inspect
-import cStringIO
-
-class ParserError(Exception):
-    def __init__(self, line, message):
-        self.message = sprintf("line %d: %s", line, message)
-        Exception.__init__(self, self.message)
-
-class Layout(object):
-    """Represents a NILM database layout"""
-    def description(self):
-        """Return the PyTables description of this layout"""
-        desc = {}
-        for (n, (name, type)) in enumerate(self.fields):
-            desc[name] = tables.Col.from_type(type, pos=n+1)
-        return tables.Description(desc)
-
-    def parse(self, inp):
-        """Given inp as text, return a list of values
-        converted to the correct types"""
-        # Consider overriding this in subclasses for speed.
-        # In general it takes about 2/3 the time that way.
-        out = []
-        for (n, (name, type)) in enumerate(self.fields):
-            if name == 'timestamp':
-                out.append(int(float(inp[n]) * 1e6))
-            elif type == 'float32':
-                out.append(float(inp[n]))
-            elif type == 'uint16':
-                x = int(inp[n], 10)
-                if x < 0 or x > 65535:
-                    raise ValueError("data out of range")
-                out.append(x)
-            else:
-                raise TypeError("can't parse type " + repr(type))
-        return out
-
-def parse_timestamp(string):
-    return int(float(string) * 1e6)
-
-class PrepData(Layout):
-    rate_hz = 120
-    fields = [ ( 'timestamp', 'int64' ),
-               ( 'p1', 'float32' ),
-               ( 'q1', 'float32' ),
-               ( 'p3', 'float32' ),
-               ( 'q3', 'float32' ),
-               ( 'p5', 'float32' ),
-               ( 'q5', 'float32' ),
-               ( 'p7', 'float32' ),
-               ( 'q7', 'float32' ) ]
-    def parse(self, inp):
-        return [ parse_timestamp(inp[0]),
-                 float(inp[1]), float(inp[2]),
-                 float(inp[3]), float(inp[4]),
-                 float(inp[5]), float(inp[6]),
-                 float(inp[7]), float(inp[8]) ]
-
-class RawData(Layout):
-    rate_hz = 8000
-    fields = [ ( 'timestamp', 'int64' ),
-               ( 'va', 'uint16' ),
-               ( 'vb', 'uint16' ),
-               ( 'vc', 'uint16' ),
-               ( 'ia', 'uint16' ),
-               ( 'ib', 'uint16' ),
-               ( 'ic', 'uint16' ) ]
-
-class RawNotchedData(RawData):
-    rate_hz = 8000
-    fields = RawData.fields + [
-        ( 'notch_ia', 'uint16' ),
-        ( 'notch_ib', 'uint16' ),
-        ( 'notch_ic', 'uint16' ) ]
-
-# Instantiate all layouts, indexed by their name
-named = {}
-for name, obj in inspect.getmembers(sys.modules[__name__]):
-    if inspect.isclass(obj) and issubclass(obj, Layout):
-        named[name] = obj()
-
-class Parser(object):
-    """Object that parses and stores ASCII data for inclusion into the database"""
-    def __init__(self, layout):
-        if issubclass(layout.__class__, Layout):
-            self.layout = layout
-        else:
-            try:
-                self.layout = named[layout]
-            except KeyError:
-                raise TypeError("unknown layout")
-
-        self.data = []
-        self.min_timestamp = None
-        self.max_timestamp = None
-        # Assume timestamp is always the first field, for now
-        self.ts_field = 0
-
-    def parse(self, textdata):
-        """Parse the data, provided as lines of text, using the current
-        layout, into an internal data structure."""
-
-        indata = cStringIO.StringIO(textdata)
-        n = 0
-        # Assume any parsing error is a real error.
-        # In the future we might want to skip completely empty lines,
-        # or partial lines right before EOF?
-        try:
-            last_ts = None
-            for line in indata:
-                n += 1
-
-                # Parse and append
-                fields = line.partition('#')[0].split()
-                if len(fields) != len(self.layout.fields):
-                    raise IndexError(sprintf("wanted %d fields, got %d",
-                                             len(self.layout.fields), len(fields)))
-                out = self.layout.parse(fields)
-                self.data.append(out)
-
-                # Verify timestamp
-                if self.ts_field is not None:
-                    if last_ts is not None and out[self.ts_field] < last_ts:
-                        raise ValueError("timestamp is not monotonically increasing")
-                    last_ts = out[self.ts_field]
-        except ValueError as e:
-            raise ParserError(n, "value error: " + e.message)
-        except IndexError as e:
-            raise ParserError(n, "index error: " + e.message)
-        except TypeError as e:
-            raise ParserError(n, "type error: " + e.message)
-
-        # Mark timestamp ranges
-        if len(self.data) and self.ts_field is not None:
-            self.min_timestamp = self.data[0][self.ts_field]
-            self.max_timestamp = self.data[-1][self.ts_field]
--- a/nilmdb/layout.pyx
+++ b/nilmdb/layout.pyx
@ -0,0 +1,159 @@
+import tables
+import time
+import sys
+import inspect
+import cStringIO
+import numpy as np
+
+cimport cython
+cimport libc.stdlib
+cimport libc.stdio
+cimport libc.string
+
+class ParserError(Exception):
+    """Parse failure whose text is prefixed with the offending line number."""
+    def __init__(self, line, message):
+        # Build the text once; it is stored on the instance and also
+        # handed to the base class so str(exc) shows the same thing.
+        text = ": ".join(("line " + str(line), message))
+        self.message = text
+        Exception.__init__(self, text)
+
+class Layout:
+    """Represents a NILM database layout.
+
+    Subclasses supply a 'fields' list of (name, type) pairs and
+    override parse() with a parser for one line of text.
+    """
+    def description(self):
+        """Return the PyTables description of this layout"""
+        desc = {}
+        # Column positions are 1-based and follow the order of self.fields.
+        for (n, (name, coltype)) in enumerate(self.fields):
+            desc[name] = tables.Col.from_type(coltype, pos=n+1)
+        return tables.Description(desc)
+
+    def parse(self, char *text):
+        """Placeholder for the per-layout line parser; always raises.
+
+        Raises TypeError.  Parser.parse() catches TypeError and turns
+        it into a ParserError that carries the current line number.
+        """
+        # ParserError needs (line, message) and no line number is known
+        # here; building one from the message alone failed with an
+        # unrelated argument-count TypeError and lost this text.
+        raise TypeError("no parser for this layout")
+
+class PrepData(Layout):
+    """Layout with a float64 timestamp followed by eight float32
+    columns (p1, q1, p3, q3, p5, q5, p7, q7)."""
+    rate_hz = 120
+    fields = [ ( 'timestamp', 'float64' ),
+               ( 'p1', 'float32' ), ( 'q1', 'float32' ),
+               ( 'p3', 'float32' ), ( 'q3', 'float32' ),
+               ( 'p5', 'float32' ), ( 'q5', 'float32' ),
+               ( 'p7', 'float32' ), ( 'q7', 'float32' ) ]
+
+    def parse(self, char *text):
+        """Convert one line of text into (timestamp, row).
+
+        row is a list holding the timestamp and then the eight values.
+        Raises ValueError if fewer than nine numbers are found, or if
+        the first character found after them is neither '#' nor a
+        newline.
+        """
+        cdef int count
+        cdef double stamp
+        # Values are scanned as doubles rather than float32: they end
+        # up in a Python list, which would widen them to double anyway.
+        cdef double val[8]
+        cdef char extra
+        # The final %c is only converted when something follows the
+        # ninth number; 'extra' is meaningful only if count > 9.
+        count = libc.stdio.sscanf(text, " %lf %lf %lf %lf %lf %lf %lf %lf %lf %c",
+                                  &stamp, &val[0], &val[1], &val[2], &val[3],
+                                  &val[4], &val[5], &val[6], &val[7], &extra)
+        if count < 9:
+            raise ValueError("wrong number of values: wanted 9, got " + str(count))
+        if count > 9 and not (extra == '#' or extra == '\n'):
+            raise ValueError("wrong number of values: wanted 9, got " + str(count))
+        row = [stamp, val[0], val[1], val[2], val[3],
+               val[4], val[5], val[6], val[7]]
+        return (stamp, row)
+
+class RawData(Layout):
+    """Layout with a float64 timestamp followed by six uint16 columns
+    (va, vb, vc, ia, ib, ic)."""
+    rate_hz = 8000
+    fields = [ ( 'timestamp', 'float64' ),
+               ( 'va', 'uint16' ),
+               ( 'vb', 'uint16' ),
+               ( 'vc', 'uint16' ),
+               ( 'ia', 'uint16' ),
+               ( 'ib', 'uint16' ),
+               ( 'ic', 'uint16' ) ]
+
+    def parse(self, char *text):
+        """Convert one line of text into (timestamp, row).
+
+        row is a list holding the timestamp and then the six integer
+        values.  Raises ValueError if fewer than seven numbers are
+        found, if the first character found after them is neither '#'
+        nor a newline, or if any integer is outside 0..65535.
+        """
+        # i is typed so the range check below runs as a plain C loop.
+        cdef int n, i
+        cdef double ts
+        cdef int v[6]
+        cdef char dummy
+        # %d matches the 'int' storage of v (the previous %u expected
+        # 'unsigned int *'), so a negative input is stored as a negative
+        # value by definition and is caught by the range check.
+        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %c",
+                              &ts, &v[0], &v[1], &v[2],
+                              &v[3], &v[4], &v[5], &dummy)
+        # dummy is only set when something follows the seventh number.
+        if (n < 7) or (n > 7 and (dummy != '#' and dummy != '\n')):
+            raise ValueError("wrong number of values: wanted 7, got " + str(n))
+        for i in range(6):
+            if v[i] < 0 or v[i] > 65535:
+                raise ValueError("value out of range: " + str(v[i]))
+        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5]])
+
+class RawNotchedData(RawData):
+    """RawData layout extended with three more uint16 columns
+    (notch_ia, notch_ib, notch_ic)."""
+    rate_hz = 8000
+    fields = RawData.fields + [
+        ( 'notch_ia', 'uint16' ),
+        ( 'notch_ib', 'uint16' ),
+        ( 'notch_ic', 'uint16' ) ]
+
+    def parse(self, char *text):
+        """Convert one line of text into (timestamp, row).
+
+        row is a list holding the timestamp and then the nine integer
+        values.  Raises ValueError if fewer than ten numbers are found,
+        if the first character found after them is neither '#' nor a
+        newline, or if any integer is outside 0..65535.
+        """
+        # i is typed so the range check below runs as a plain C loop.
+        cdef int n, i
+        cdef double ts
+        cdef int v[9]
+        cdef char dummy
+        # %d matches the 'int' storage of v (the previous %u expected
+        # 'unsigned int *'), so a negative input is stored as a negative
+        # value by definition and is caught by the range check.
+        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %d %d %d %c",
+                              &ts, &v[0], &v[1], &v[2], &v[3], &v[4],
+                              &v[5], &v[6], &v[7], &v[8], &dummy)
+        # dummy is only set when something follows the tenth number.
+        if (n < 10) or (n > 10 and (dummy != '#' and dummy != '\n')):
+            raise ValueError("wrong number of values: wanted 10, got " + str(n))
+        for i in range(9):
+            if v[i] < 0 or v[i] > 65535:
+                raise ValueError("value out of range: " + str(v[i]))
+        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8]])
+
+# Instantiate all layouts, indexed by their name.
+# Every member of this module that is a class and passes
+# issubclass(obj, Layout) gets one instance; that test is also true
+# for the Layout base class itself, so it is registered as well.
+# Note that 'name' and 'obj' remain defined at module level afterwards.
+named = {}
+for name, obj in inspect.getmembers(sys.modules[__name__]):
+    if inspect.isclass(obj) and issubclass(obj, Layout):
+        named[name] = obj()
+
+class Parser(object):
+    """Object that parses and stores ASCII data for inclusion into the
+    database"""
+
+    def __init__(self, layout):
+        """Select the layout used for parsing.
+
+        layout is either a Layout instance or a key of the 'named'
+        dictionary.  Raises TypeError("unknown layout") if the key is
+        not present.
+        """
+        if isinstance(layout, Layout):
+            self.layout = layout
+        else:
+            try:
+                self.layout = named[layout]
+            except KeyError:
+                raise TypeError("unknown layout")
+
+        self.data = []
+        self.min_timestamp = None
+        self.max_timestamp = None
+
+    def parse(self, textdata):
+        """
+        Parse the data, provided as lines of text, using the current
+        layout, into an internal data structure suitable for a
+        pytables 'table.append(parser.data)'.
+
+        Replaces self.data with the rows parsed by this call, and sets
+        self.min_timestamp / self.max_timestamp from the first and
+        last row (both None when no rows were parsed).  Raises
+        ParserError, tagged with the 1-based line number, when the
+        layout rejects a line or a timestamp is lower than the one
+        before it.
+        """
+        cdef double last_ts, ts
+        cdef int n = 0
+        cdef char *line
+
+        # Start below every real timestamp, so that a negative first
+        # timestamp is not rejected as non-monotonic.
+        last_ts = float("-inf")
+
+        indata = cStringIO.StringIO(textdata)
+        # Assume any parsing error is a real error.
+        # In the future we might want to skip completely empty lines,
+        # or partial lines right before EOF?
+        try:
+            self.data = []
+            # Forget the range from any earlier call, so that input
+            # without rows does not leave stale timestamps behind.
+            self.min_timestamp = None
+            self.max_timestamp = None
+            for pyline in indata:
+                # 'line' points into pyline, which stays referenced for
+                # the rest of this iteration.
+                line = pyline
+                n += 1
+                # Skip lines that start with a comment marker.  Written
+                # as a plain '#': in Python string syntax '\#' is not an
+                # escape and denotes two characters, not one.
+                if line[0] == '#':
+                    continue
+                (ts, row) = self.layout.parse(line)
+                if ts < last_ts:
+                    raise ValueError("timestamp is not "
+                                     "monotonically increasing")
+                last_ts = ts
+                self.data.append(row)
+        except (ValueError, IndexError, TypeError) as e:
+            # str(e) instead of the deprecated e.message attribute
+            raise ParserError(n, "error: " + str(e))
+
+        # Mark timestamp ranges
+        if self.data:
+            self.min_timestamp = self.data[0][0]
+            self.max_timestamp = self.data[-1][0]
--- a/tests/test_layout.py
+++ b/tests/test_layout.py
@ -4,7 +4,7 @@ from nose.tools import *
 from nose.tools import assert_raises
 import distutils.version
 import json
-import itertools 
+import itertools
 import os
 import shutil
 import sys
@ -36,7 +36,7 @@ class TestLayouts(object):
                 "1234567890.100000 1.1 2.2 3.3 4.4 5.5\n")
        with assert_raises(ParserError) as e:
            parser.parse(data)
-        in_("index error", str(e.exception))
+        in_("error", str(e.exception))

        # too much data
        parser = Parser("PrepData")
@ -44,21 +44,25 @@ class TestLayouts(object):
                 "1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8 9.9\n")
        with assert_raises(ParserError) as e:
            parser.parse(data)
-        in_("index error", str(e.exception))
+        in_("error", str(e.exception))

        # just right
        parser = Parser("PrepData")
        data = ( "1234567890.000000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n" +
                 "1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n")
        parser.parse(data)
-        eq_(parser.min_timestamp, 1234567890000000)
-        eq_(parser.max_timestamp, 1234567890100000)
-        
+        eq_(parser.min_timestamp, 1234567890.0)
+        eq_(parser.max_timestamp, 1234567890.1)
+        eq_(parser.data, [[1234567890.0,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8],
+                          [1234567890.1,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8]])
+
        # try RawData too, with clamping
        parser = Parser("RawData")
        data = ( "1234567890.000000 1 2 3 4 5 6\n" +
                 "1234567890.100000 1 2 3 4 5 6\n" )
        parser.parse(data)
+        eq_(parser.data, [[1234567890.0,1,2,3,4,5,6],
+                          [1234567890.1,1,2,3,4,5,6]])

        # pass an instantiated class
        parser = Parser(RawNotchedData())
@ -66,26 +70,6 @@ class TestLayouts(object):
                 "1234567890.100000 1 2 3 4 5 6 7 8 9\n" )
        parser.parse(data)

-        # using generic parse function, rawdata
-        class CrappyLayout(RawData):
-            pass
-        x = CrappyLayout()
-        x.fields = x.fields + [("foo", "float32")]
-        x.parse = super(RawData, x).parse
-        parser = Parser(x)
-        data = ( "1234567890.000000 1 2 3 4 5 6 1.1\n" +
-                 "1234567890.100000 1 2 3 4 5 6 2.2\n" )
-        parser.parse(data)
-
-        # some invalid type
-        x.fields = x.fields + [("faketype", "faketype")]
-        parser = Parser(x)
-        data = ( "1234567890.000000 1 2 3 4 5 6 1.1 fake\n" +
-                 "1234567890.100000 1 2 3 4 5 6 2.2 fake\n" )
-        with assert_raises(ParserError) as e:
-            parser.parse(data)
-        in_("can't parse type", str(e.exception))
-
        # non-monotonic
        parser = Parser("RawData")
        data = ( "1234567890.100000 1 2 3 4 5 6\n" +
@ -100,7 +84,7 @@ class TestLayouts(object):
                 "1234567890.100000 1 2 3 4 5 6\n" )
        with assert_raises(ParserError) as e:
            parser.parse(data)
-        in_("data out of range", str(e.exception))
+        in_("value out of range", str(e.exception))

        # Empty data should work but is useless
        parser = Parser("RawData")