Convert the whole nilmdb.layout parser to Cython, and rewrite the parsing bits so that it is hopefully quite a bit faster now.


git-svn-id: https://bucket.mit.edu/svn/nilm/nilmdb@10798 ddd99763-3ecb-0310-9145-efcb8ce7c51f
This commit is contained in:
Jim Paris 2012-04-24 21:00:26 +00:00
parent 37b4376b4c
commit c07670ac3e
4 changed files with 172 additions and 170 deletions

View File

@ -5,7 +5,9 @@ from .server import Server
from .client import Client
from .timer import Timer
import pyximport; pyximport.install()
import layout
import serializer
import cmdline
import timestamper

View File

@ -1,143 +0,0 @@
from __future__ import absolute_import
import nilmdb
from nilmdb.printf import *
import tables
import time
import sys
import inspect
import cStringIO
class ParserError(Exception):
    """Parsing failure whose text is prefixed with the offending line number."""

    def __init__(self, line, message):
        """Store and report ``"line <line>: <message>"``.

        The formatted text is kept on ``self.message`` and is also passed
        to the base ``Exception`` so that ``str(exc)`` shows it.
        """
        text = sprintf("line %d: %s", line, message)
        self.message = text
        Exception.__init__(self, text)
class Layout(object):
    """Base class for a NILM database layout.

    Subclasses provide ``fields``, a list of ``(name, type)`` pairs.
    """

    def description(self):
        """Build and return the PyTables description for ``self.fields``."""
        columns = {}
        position = 1  # column positions passed to PyTables start at 1
        for name, kind in self.fields:
            columns[name] = tables.Col.from_type(kind, pos=position)
            position += 1
        return tables.Description(columns)

    def parse(self, inp):
        """Convert a sequence of text tokens into a list of typed values.

        ``inp[k]`` is converted according to the k-th entry of
        ``self.fields``.  Raises ``ValueError`` for an out-of-range
        uint16 and ``TypeError`` for a field type with no converter.
        """
        # Generic, table-driven conversion; subclasses may override it
        # with a hand-written version (noted in the original as taking
        # about 2/3 the time).
        converted = []
        for index, (name, kind) in enumerate(self.fields):
            if name == 'timestamp':
                # The field name, not its type, selects the scaling by 1e6
                value = int(float(inp[index]) * 1e6)
            elif kind == 'float32':
                value = float(inp[index])
            elif kind == 'uint16':
                value = int(inp[index], 10)
                if not 0 <= value <= 65535:
                    raise ValueError("data out of range")
            else:
                raise TypeError("can't parse type " + repr(kind))
            converted.append(value)
        return converted
def parse_timestamp(string):
    """Parse ``string`` as a float, scale it by 1e6 and truncate to an int.

    Presumably this turns seconds into integer microseconds; the unit of
    the input is not stated here.
    """
    value = float(string)
    return int(value * 1e6)
class PrepData(Layout):
    """Layout with an int64 timestamp followed by eight float32 columns."""

    rate_hz = 120
    fields = [
        ('timestamp', 'int64'),
        ('p1', 'float32'),
        ('q1', 'float32'),
        ('p3', 'float32'),
        ('q3', 'float32'),
        ('p5', 'float32'),
        ('q5', 'float32'),
        ('p7', 'float32'),
        ('q7', 'float32'),
    ]

    def parse(self, inp):
        """Hand-written replacement for the generic ``Layout.parse``.

        Converts ``inp[0]`` with ``parse_timestamp`` and ``inp[1]``
        through ``inp[8]`` with ``float``, returning the nine values as
        a list.
        """
        row = [parse_timestamp(inp[0])]
        # Index explicitly (rather than slice) so that a short input
        # still raises IndexError.
        for column in range(1, 9):
            row.append(float(inp[column]))
        return row
class RawData(Layout):
    """Layout with an int64 timestamp followed by six uint16 columns.

    No ``parse`` override is defined, so the generic ``Layout.parse``
    is used.
    """

    rate_hz = 8000
    fields = [
        ('timestamp', 'int64'),
        ('va', 'uint16'),
        ('vb', 'uint16'),
        ('vc', 'uint16'),
        ('ia', 'uint16'),
        ('ib', 'uint16'),
        ('ic', 'uint16'),
    ]
class RawNotchedData(RawData):
    """RawData's columns followed by three additional uint16 notch columns."""

    rate_hz = 8000
    # Extend (a copy of) the parent's field list rather than repeating it
    fields = RawData.fields + [
        ('notch_ia', 'uint16'),
        ('notch_ib', 'uint16'),
        ('notch_ic', 'uint16'),
    ]
# Registry of layout instances, keyed by class name.  Every class in
# this module that passes issubclass(obj, Layout) is instantiated once;
# that test is also true for Layout itself.
named = {}
for name, obj in inspect.getmembers(sys.modules[__name__]):
    if not inspect.isclass(obj):
        continue
    if not issubclass(obj, Layout):
        continue
    named[name] = obj()
class Parser(object):
    """Parses ASCII data and keeps the result for inclusion into the database"""

    def __init__(self, layout):
        """Create a parser for ``layout``.

        ``layout`` is either a ``Layout`` instance or a key of the
        module-level ``named`` registry.  An unknown key raises
        ``TypeError``.
        """
        if not issubclass(layout.__class__, Layout):
            # Not a Layout instance: look it up by name instead
            try:
                layout = named[layout]
            except KeyError:
                raise TypeError("unknown layout")
        self.layout = layout

        self.data = []
        self.min_timestamp = None
        self.max_timestamp = None

        # Assume timestamp is always the first field, for now
        self.ts_field = 0

    def parse(self, textdata):
        """Parse lines of text with the current layout into ``self.data``.

        Each line has everything from ``#`` onward dropped and is split
        on whitespace before being handed to the layout.  Rows are
        appended to ``self.data``; ``min_timestamp`` and
        ``max_timestamp`` are then taken from the first and last stored
        rows.  ``ValueError``, ``IndexError`` and ``TypeError`` raised
        while processing a line are re-raised as ``ParserError`` carrying
        that line's number.
        """
        indata = cStringIO.StringIO(textdata)
        lineno = 0

        # Any parsing error is treated as a real error; empty lines and
        # partial lines before EOF are not skipped.
        try:
            previous = None
            for line in indata:
                lineno += 1

                # Tokenize, then check the field count before converting
                content = line.partition('#')[0]
                tokens = content.split()
                if len(tokens) != len(self.layout.fields):
                    raise IndexError(sprintf("wanted %d fields, got %d",
                                             len(self.layout.fields),
                                             len(tokens)))

                row = self.layout.parse(tokens)
                self.data.append(row)

                # The row is stored before its timestamp is checked
                # against the previous one.
                if self.ts_field is None:
                    continue
                stamp = row[self.ts_field]
                if previous is not None and stamp < previous:
                    raise ValueError("timestamp is not monotonically increasing")
                previous = stamp
        except (ValueError, IndexError, TypeError) as e:
            if isinstance(e, ValueError):
                prefix = "value error: "
            elif isinstance(e, IndexError):
                prefix = "index error: "
            else:
                prefix = "type error: "
            raise ParserError(lineno, prefix + e.message)

        # Record the timestamp range from the first and last stored rows
        if self.data and self.ts_field is not None:
            self.min_timestamp = self.data[0][self.ts_field]
            self.max_timestamp = self.data[-1][self.ts_field]

159
nilmdb/layout.pyx Normal file
View File

@ -0,0 +1,159 @@
import tables
import time
import sys
import inspect
import cStringIO
import numpy as np
cimport cython
cimport libc.stdlib
cimport libc.stdio
cimport libc.string
class ParserError(Exception):
    """Parsing failure whose text is prefixed with the offending line number."""

    def __init__(self, line, message):
        """Store and report ``"line <line>: <message>"``.

        The text is kept on ``self.message`` and is also passed to the
        base ``Exception`` so that ``str(exc)`` shows it.
        """
        prefix = "line " + str(line)
        self.message = prefix + ": " + message
        Exception.__init__(self, self.message)
class Layout:
    """Represents a NILM database layout.

    Subclasses provide ``fields``, a list of ``(name, type)`` pairs, and
    override ``parse``.
    """

    def description(self):
        """Return the PyTables description of this layout"""
        desc = {}
        for (n, (name, kind)) in enumerate(self.fields):
            # Column positions passed to PyTables start at 1
            desc[name] = tables.Col.from_type(kind, pos=n+1)
        return tables.Description(desc)

    def parse(self, char *text):
        """Placeholder for the per-layout line parser; always raises.

        Subclasses in this module override this to turn one line of
        text into a ``(timestamp, row)`` tuple.

        Raises:
            TypeError: always, since the base layout has no parser.
        """
        # ParserError requires a line number, which is not known here,
        # and calling it with only a message failed with an unrelated
        # arity TypeError.  Raise TypeError with the intended message
        # instead; Parser.parse converts it into a ParserError that
        # carries the line number.
        raise TypeError("no parser for this layout")
class PrepData(Layout):
    """Layout with a float64 timestamp followed by eight float32 columns."""

    rate_hz = 120
    fields = [
        ('timestamp', 'float64'),
        ('p1', 'float32'),
        ('q1', 'float32'),
        ('p3', 'float32'),
        ('q3', 'float32'),
        ('p5', 'float32'),
        ('q5', 'float32'),
        ('p7', 'float32'),
        ('q7', 'float32'),
    ]

    def parse(self, char *text):
        """Scan one line of text into ``(timestamp, row)``.

        ``row`` is a list of the timestamp followed by the eight values.
        Raises ``ValueError`` when fewer than nine numbers are converted,
        or when a further non-whitespace character follows them that is
        not ``#``.
        """
        cdef int matched
        cdef double ts
        # The values are held as doubles rather than float32: they go
        # into a Python list, where they would be widened to double
        # anyway.
        cdef double vals[8]
        cdef char extra

        # The trailing " %c" picks up the first non-whitespace character
        # after the ninth number, if any.
        matched = libc.stdio.sscanf(
            text, " %lf %lf %lf %lf %lf %lf %lf %lf %lf %c",
            &ts, &vals[0], &vals[1], &vals[2], &vals[3], &vals[4],
            &vals[5], &vals[6], &vals[7], &extra)

        if matched < 9:
            raise ValueError("wrong number of values: wanted 9, got "
                             + str(matched))
        # `extra` is only meaningful when the tenth conversion happened
        if matched > 9 and not (extra == '#' or extra == '\n'):
            raise ValueError("wrong number of values: wanted 9, got "
                             + str(matched))

        row = [ts, vals[0], vals[1], vals[2], vals[3],
               vals[4], vals[5], vals[6], vals[7]]
        return (ts, row)
class RawData(Layout):
    """Layout with a float64 timestamp followed by six uint16 columns."""

    rate_hz = 8000
    fields = [ ( 'timestamp', 'float64' ),
               ( 'va', 'uint16' ),
               ( 'vb', 'uint16' ),
               ( 'vc', 'uint16' ),
               ( 'ia', 'uint16' ),
               ( 'ib', 'uint16' ),
               ( 'ic', 'uint16' ) ]

    def parse(self, char *text):
        """Scan one line of text into ``(timestamp, row)``.

        ``row`` is a list of the timestamp followed by the six integer
        values.

        Raises:
            ValueError: if fewer than seven numbers are converted, if a
                further non-whitespace character other than ``#``
                follows them, or if any integer is outside 0..65535.
        """
        cdef int n
        cdef int i
        cdef double ts
        cdef int v[6]
        cdef char dummy

        # The integers are scanned with %d so that the conversion
        # matches the signed `int` targets (the original %u requires
        # `unsigned int *` arguments).  Negative input is still accepted
        # by the scan and rejected by the range check below.
        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %c",
                              &ts, &v[0], &v[1], &v[2],
                              &v[3], &v[4], &v[5], &dummy)
        # `dummy` is only set (and only examined) when an eighth
        # conversion happened, i.e. something followed the values.
        if (n < 7) or (n > 7 and (dummy != '#' and dummy != '\n')):
            raise ValueError("wrong number of values: wanted 7, got " + str(n))
        for i in range(6):
            if v[i] < 0 or v[i] > 65535:
                raise ValueError("value out of range: " + str(v[i]))
        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5]])
class RawNotchedData(RawData):
    """RawData's columns followed by three additional uint16 notch columns."""

    rate_hz = 8000
    fields = RawData.fields + [
               ( 'notch_ia', 'uint16' ),
               ( 'notch_ib', 'uint16' ),
               ( 'notch_ic', 'uint16' ) ]

    def parse(self, char *text):
        """Scan one line of text into ``(timestamp, row)``.

        ``row`` is a list of the timestamp followed by the nine integer
        values.

        Raises:
            ValueError: if fewer than ten numbers are converted, if a
                further non-whitespace character other than ``#``
                follows them, or if any integer is outside 0..65535.
        """
        cdef int n
        cdef int i
        cdef double ts
        cdef int v[9]
        cdef char dummy

        # The integers are scanned with %d so that the conversion
        # matches the signed `int` targets (the original %u requires
        # `unsigned int *` arguments).  Negative input is still accepted
        # by the scan and rejected by the range check below.
        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %d %d %d %c",
                              &ts, &v[0], &v[1], &v[2], &v[3], &v[4],
                              &v[5], &v[6], &v[7], &v[8], &dummy)
        # `dummy` is only set (and only examined) when an eleventh
        # conversion happened, i.e. something followed the values.
        if (n < 10) or (n > 10 and (dummy != '#' and dummy != '\n')):
            raise ValueError("wrong number of values: wanted 10, got " + str(n))
        for i in range(9):
            if v[i] < 0 or v[i] > 65535:
                raise ValueError("value out of range: " + str(v[i]))
        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8]])
# Registry of layout instances, keyed by class name.  Every class in
# this module that passes issubclass(obj, Layout) is instantiated once;
# that test is also true for Layout itself.
named = {}
for name, obj in inspect.getmembers(sys.modules[__name__]):
    if not inspect.isclass(obj):
        continue
    if not issubclass(obj, Layout):
        continue
    named[name] = obj()
class Parser(object):
    """Object that parses and stores ASCII data for inclusion into the
    database"""

    def __init__(self, layout):
        """Create a parser for ``layout``.

        ``layout`` is either a ``Layout`` instance or a key of the
        module-level ``named`` registry.

        Raises:
            TypeError: if ``layout`` is not an instance and is not a
                known registry key.
        """
        if isinstance(layout, Layout):
            self.layout = layout
        else:
            try:
                self.layout = named[layout]
            except KeyError:
                raise TypeError("unknown layout")

        self.data = []
        self.min_timestamp = None
        self.max_timestamp = None

    def parse(self, textdata):
        """
        Parse the data, provided as lines of text, using the current
        layout, into an internal data structure suitable for a
        pytables 'table.append(parser.data)'.

        ``self.data`` is replaced on every call.  Lines whose first
        character is ``#`` are skipped.  ``min_timestamp`` and
        ``max_timestamp`` are taken from the first and last parsed rows.

        Raises:
            ParserError: wrapping any ValueError, IndexError or
                TypeError raised while handling a line, tagged with
                that line's number.
        """
        # NOTE(review): last_ts starts at 0, so a negative timestamp on
        # the first row is reported as non-monotonic.
        cdef double last_ts = 0, ts
        cdef int n = 0
        cdef char *line

        indata = cStringIO.StringIO(textdata)

        # Assume any parsing error is a real error.
        # In the future we might want to skip completely empty lines,
        # or partial lines right before EOF?
        try:
            self.data = []
            for pyline in indata:
                # C view of the current line's bytes for the layout's
                # C-level scanner
                line = pyline
                n += 1
                # Compare against the single character '#': the
                # previous literal '\#' is not a recognised escape and
                # denotes two characters (backslash, hash).
                if line[0] == '#':
                    continue
                (ts, row) = self.layout.parse(line)
                if ts < last_ts:
                    raise ValueError("timestamp is not "
                                     "monotonically increasing")
                last_ts = ts
                self.data.append(row)
        except (ValueError, IndexError, TypeError) as e:
            raise ParserError(n, "error: " + e.message)

        # Mark timestamp ranges
        if len(self.data):
            self.min_timestamp = self.data[0][0]
            self.max_timestamp = self.data[-1][0]

View File

@ -4,7 +4,7 @@ from nose.tools import *
from nose.tools import assert_raises
import distutils.version
import json
import itertools
import itertools
import os
import shutil
import sys
@ -36,7 +36,7 @@ class TestLayouts(object):
"1234567890.100000 1.1 2.2 3.3 4.4 5.5\n")
with assert_raises(ParserError) as e:
parser.parse(data)
in_("index error", str(e.exception))
in_("error", str(e.exception))
# too much data
parser = Parser("PrepData")
@ -44,21 +44,25 @@ class TestLayouts(object):
"1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8 9.9\n")
with assert_raises(ParserError) as e:
parser.parse(data)
in_("index error", str(e.exception))
in_("error", str(e.exception))
# just right
parser = Parser("PrepData")
data = ( "1234567890.000000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n" +
"1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n")
parser.parse(data)
eq_(parser.min_timestamp, 1234567890000000)
eq_(parser.max_timestamp, 1234567890100000)
eq_(parser.min_timestamp, 1234567890.0)
eq_(parser.max_timestamp, 1234567890.1)
eq_(parser.data, [[1234567890.0,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8],
[1234567890.1,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8]])
# try RawData too, with clamping
parser = Parser("RawData")
data = ( "1234567890.000000 1 2 3 4 5 6\n" +
"1234567890.100000 1 2 3 4 5 6\n" )
parser.parse(data)
eq_(parser.data, [[1234567890.0,1,2,3,4,5,6],
[1234567890.1,1,2,3,4,5,6]])
# pass an instantiated class
parser = Parser(RawNotchedData())
@ -66,26 +70,6 @@ class TestLayouts(object):
"1234567890.100000 1 2 3 4 5 6 7 8 9\n" )
parser.parse(data)
# using generic parse function, rawdata
class CrappyLayout(RawData):
pass
x = CrappyLayout()
x.fields = x.fields + [("foo", "float32")]
x.parse = super(RawData, x).parse
parser = Parser(x)
data = ( "1234567890.000000 1 2 3 4 5 6 1.1\n" +
"1234567890.100000 1 2 3 4 5 6 2.2\n" )
parser.parse(data)
# some invalid type
x.fields = x.fields + [("faketype", "faketype")]
parser = Parser(x)
data = ( "1234567890.000000 1 2 3 4 5 6 1.1 fake\n" +
"1234567890.100000 1 2 3 4 5 6 2.2 fake\n" )
with assert_raises(ParserError) as e:
parser.parse(data)
in_("can't parse type", str(e.exception))
# non-monotonic
parser = Parser("RawData")
data = ( "1234567890.100000 1 2 3 4 5 6\n" +
@ -100,7 +84,7 @@ class TestLayouts(object):
"1234567890.100000 1 2 3 4 5 6\n" )
with assert_raises(ParserError) as e:
parser.parse(data)
in_("data out of range", str(e.exception))
in_("value out of range", str(e.exception))
# Empty data should work but is useless
parser = Parser("RawData")