Make the whole nilmdb.layout parser Cython, and rewrite the parsing
bits so it's hopefully quite a bit faster now.

git-svn-id: https://bucket.mit.edu/svn/nilm/nilmdb@10798 ddd99763-3ecb-0310-9145-efcb8ce7c51f
This commit is contained in:
parent
37b4376b4c
commit
c07670ac3e
|
@ -5,7 +5,9 @@ from .server import Server
|
|||
from .client import Client
|
||||
from .timer import Timer
|
||||
|
||||
import pyximport; pyximport.install()
|
||||
import layout
|
||||
|
||||
import serializer
|
||||
import cmdline
|
||||
import timestamper
|
||||
|
|
143
nilmdb/layout.py
143
nilmdb/layout.py
|
@ -1,143 +0,0 @@
|
|||
from __future__ import absolute_import
|
||||
import nilmdb
|
||||
from nilmdb.printf import *
|
||||
|
||||
import tables
|
||||
import time
|
||||
import sys
|
||||
import inspect
|
||||
import cStringIO
|
||||
|
||||
class ParserError(Exception):
    """Parse failure whose message is built from a line number and a
    description."""

    def __init__(self, line, message):
        # Build the text once, keep it on .message, and hand the same
        # text to the Exception base class.
        formatted = sprintf("line %d: %s", line, message)
        self.message = formatted
        super(ParserError, self).__init__(formatted)
|
||||
|
||||
class Layout(object):
    """Represents a NILM database layout"""

    def description(self):
        """Return the PyTables description of this layout"""
        desc = {}
        # Column positions are 1-based and follow the order of self.fields.
        for (n, (name, coltype)) in enumerate(self.fields):
            desc[name] = tables.Col.from_type(coltype, pos=n+1)
        return tables.Description(desc)

    def parse(self, inp):
        """Convert one row of text fields into typed values.

        inp is an indexable sequence of strings, one per entry in
        self.fields.  Returns a list of converted values in field order.

        Raises ValueError if a value cannot be converted or a uint16 is
        outside 0..65535, IndexError if inp is shorter than self.fields,
        and TypeError if a field has a type this parser does not know.
        """
        # Consider overriding this in subclasses for speed.
        # In general it takes about 2/3 the time that way.
        out = []
        for (n, (name, coltype)) in enumerate(self.fields):
            if name == 'timestamp':
                # Seconds -> integer microseconds.  Round rather than
                # truncate: float("2.01") * 1e6 is 2009999.9999999998,
                # which int() alone would turn into 2009999.
                out.append(int(round(float(inp[n]) * 1e6)))
            elif coltype == 'float32':
                out.append(float(inp[n]))
            elif coltype == 'uint16':
                x = int(inp[n], 10)
                if x < 0 or x > 65535:
                    raise ValueError("data out of range")
                out.append(x)
            else:
                raise TypeError("can't parse type " + repr(coltype))
        return out
|
||||
|
||||
def parse_timestamp(string):
    """Convert a decimal seconds string into integer microseconds.

    The product is rounded to the nearest integer; plain truncation is
    off by one whenever the floating-point product lands just below a
    whole number (e.g. "2.01" -> 2009999.9999999998).

    Raises ValueError if string is not a valid float literal.
    """
    return int(round(float(string) * 1e6))
|
||||
|
||||
class PrepData(Layout):
    # A timestamp column followed by eight float32 columns.
    rate_hz = 120
    fields = [
        ('timestamp', 'int64'),
        ('p1', 'float32'),
        ('q1', 'float32'),
        ('p3', 'float32'),
        ('q3', 'float32'),
        ('p5', 'float32'),
        ('q5', 'float32'),
        ('p7', 'float32'),
        ('q7', 'float32'),
    ]

    def parse(self, inp):
        """Convert the first nine entries of inp: the first through
        parse_timestamp, the remaining eight through float."""
        values = [parse_timestamp(inp[0])]
        for idx in range(1, 9):
            values.append(float(inp[idx]))
        return values
|
||||
|
||||
class RawData(Layout):
    # A timestamp column followed by six uint16 columns
    # (va, vb, vc, ia, ib, ic).  No parse() is defined here, so the
    # generic Layout.parse is inherited.
    rate_hz = 8000
    fields = [ ( 'timestamp', 'int64' ),
               ( 'va', 'uint16' ),
               ( 'vb', 'uint16' ),
               ( 'vc', 'uint16' ),
               ( 'ia', 'uint16' ),
               ( 'ib', 'uint16' ),
               ( 'ic', 'uint16' ) ]
|
||||
|
||||
class RawNotchedData(RawData):
    # RawData's columns plus three more uint16 columns
    # (notch_ia, notch_ib, notch_ic) appended at the end.
    rate_hz = 8000
    fields = RawData.fields + [
               ( 'notch_ia', 'uint16' ),
               ( 'notch_ib', 'uint16' ),
               ( 'notch_ic', 'uint16' ) ]
|
||||
|
||||
# Registry of layout instances keyed by class name: one instance of
# every class in this module that is Layout or derives from it.
named = {}
for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
    if issubclass(obj, Layout):
        named[name] = obj()
|
||||
|
||||
class Parser(object):
    """Object that parses and stores ASCII data for inclusion into the database"""

    def __init__(self, layout):
        # Anything that is not already a Layout instance is treated as a
        # key into the module-level `named` registry.
        if not isinstance(layout, Layout):
            try:
                layout = named[layout]
            except KeyError:
                raise TypeError("unknown layout")
        self.layout = layout

        self.data = []
        self.min_timestamp = None
        self.max_timestamp = None
        # Assume timestamp is always the first field, for now
        self.ts_field = 0

    def parse(self, textdata):
        """Split textdata into lines, convert each with the current layout
        and append the converted rows to self.data.

        Text after a '#' on a line is ignored.  ValueError, IndexError and
        TypeError raised while handling a line are re-raised as
        ParserError carrying that line's number.
        """
        indata = cStringIO.StringIO(textdata)
        n = 0
        # Assume any parsing error is a real error.
        # In the future we might want to skip completely empty lines,
        # or partial lines right before EOF?
        try:
            previous = None
            for n, line in enumerate(indata, 1):
                # Drop the comment part, then split on whitespace
                fields = line.partition('#')[0].split()
                wanted = len(self.layout.fields)
                if len(fields) != wanted:
                    raise IndexError(sprintf("wanted %d fields, got %d",
                                             wanted, len(fields)))
                out = self.layout.parse(fields)
                self.data.append(out)

                # Timestamp check is skipped when no timestamp field is set
                if self.ts_field is None:
                    continue
                current = out[self.ts_field]
                if previous is not None and current < previous:
                    raise ValueError("timestamp is not monotonically increasing")
                previous = current
        except ValueError as e:
            raise ParserError(n, "value error: " + e.message)
        except IndexError as e:
            raise ParserError(n, "index error: " + e.message)
        except TypeError as e:
            raise ParserError(n, "type error: " + e.message)

        # Mark timestamp ranges from the first and last stored rows
        if self.data and self.ts_field is not None:
            self.min_timestamp = self.data[0][self.ts_field]
            self.max_timestamp = self.data[-1][self.ts_field]
|
159
nilmdb/layout.pyx
Normal file
159
nilmdb/layout.pyx
Normal file
|
@ -0,0 +1,159 @@
|
|||
import tables
|
||||
import time
|
||||
import sys
|
||||
import inspect
|
||||
import cStringIO
|
||||
import numpy as np
|
||||
|
||||
cimport cython
|
||||
cimport libc.stdlib
|
||||
cimport libc.stdio
|
||||
cimport libc.string
|
||||
|
||||
class ParserError(Exception):
    """Parse failure whose text has the form "line <line>: <message>"."""

    def __init__(self, line, message):
        # Build the text once, keep it on .message, and hand the same
        # text to the Exception base class.
        text = "line " + str(line) + ": " + message
        self.message = text
        super(ParserError, self).__init__(text)
|
||||
|
||||
class Layout:
    """Represents a NILM database layout"""

    def description(self):
        """Return the PyTables description of this layout"""
        desc = {}
        # Column positions are 1-based and follow the order of self.fields.
        for (n, (name, coltype)) in enumerate(self.fields):
            desc[name] = tables.Col.from_type(coltype, pos=n+1)
        return tables.Description(desc)

    def parse(self, char *text):
        """Placeholder parser; subclasses in this module override it.

        Always raises TypeError.  ParserError is not used here because
        its constructor needs a line number that this method does not
        have; Parser.parse catches TypeError and re-raises it as a
        ParserError carrying the line number.
        """
        raise TypeError("no parser for this layout")
|
||||
|
||||
class PrepData(Layout):
    # A float64 timestamp column followed by eight float32 columns.
    rate_hz = 120
    fields = [ ( 'timestamp', 'float64' ),
               ( 'p1', 'float32' ),
               ( 'q1', 'float32' ),
               ( 'p3', 'float32' ),
               ( 'q3', 'float32' ),
               ( 'p5', 'float32' ),
               ( 'q5', 'float32' ),
               ( 'p7', 'float32' ),
               ( 'q7', 'float32' ) ]

    def parse(self, char *text):
        """Parse one line of text holding nine floating-point values.

        Returns a tuple (ts, row), where ts is the first value and row is
        a list of all nine values (ts included).  Raises ValueError when
        fewer than nine values convert, or when something other than a
        '#' follows the ninth value.
        """
        cdef int n
        cdef double ts
        # return doubles instead of float32, since they're going into
        # a Python array which would upconvert to double anyway.
        cdef double v[8]
        cdef char dummy
        # The trailing " %c" picks up the first non-whitespace character
        # after the ninth number, so a return value of 10 means there was
        # extra content on the line.
        n = libc.stdio.sscanf(text, " %lf %lf %lf %lf %lf %lf %lf %lf %lf %c",
                              &ts, &v[0], &v[1], &v[2], &v[3], &v[4],
                              &v[5], &v[6], &v[7], &dummy)
        # dummy is only examined when sscanf actually stored it (n > 9).
        if (n < 9) or (n > 9 and (dummy != '#' and dummy != '\n')):
            raise ValueError("wrong number of values: wanted 9, got " + str(n))
        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]])
|
||||
|
||||
class RawData(Layout):
    # A float64 timestamp column followed by six uint16 columns.
    rate_hz = 8000
    fields = [ ( 'timestamp', 'float64' ),
               ( 'va', 'uint16' ),
               ( 'vb', 'uint16' ),
               ( 'vc', 'uint16' ),
               ( 'ia', 'uint16' ),
               ( 'ib', 'uint16' ),
               ( 'ic', 'uint16' ) ]

    def parse(self, char *text):
        """Parse one line of text: a floating-point timestamp followed by
        six integers.

        Returns a tuple (ts, row), where row is a list of the timestamp
        and the six integers.  Raises ValueError when fewer than seven
        values convert, when something other than a '#' follows the
        seventh value, or when an integer is outside 0..65535.
        """
        cdef int n
        cdef int i
        cdef double ts
        cdef int v[6]
        cdef char dummy
        # %d matches the signed int targets, so a negative input is
        # stored as a negative number and caught by the range check.
        # The trailing " %c" picks up the first non-whitespace character
        # after the last number.
        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %c",
                              &ts, &v[0], &v[1], &v[2],
                              &v[3], &v[4], &v[5], &dummy)
        # dummy is only examined when sscanf actually stored it (n > 7).
        if (n < 7) or (n > 7 and (dummy != '#' and dummy != '\n')):
            raise ValueError("wrong number of values: wanted 7, got " + str(n))
        for i in range(6):
            if v[i] < 0 or v[i] > 65535:
                raise ValueError("value out of range: " + str(v[i]))
        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5]])
|
||||
|
||||
class RawNotchedData(RawData):
    # RawData's columns plus three more uint16 columns appended at the end.
    rate_hz = 8000
    fields = RawData.fields + [
               ( 'notch_ia', 'uint16' ),
               ( 'notch_ib', 'uint16' ),
               ( 'notch_ic', 'uint16' ) ]

    def parse(self, char *text):
        """Parse one line of text: a floating-point timestamp followed by
        nine integers.

        Returns a tuple (ts, row), where row is a list of the timestamp
        and the nine integers.  Raises ValueError when fewer than ten
        values convert, when something other than a '#' follows the
        tenth value, or when an integer is outside 0..65535.
        """
        cdef int n
        cdef int i
        cdef double ts
        cdef int v[9]
        cdef char dummy
        # %d matches the signed int targets, so a negative input is
        # stored as a negative number and caught by the range check.
        # The trailing " %c" picks up the first non-whitespace character
        # after the last number.
        n = libc.stdio.sscanf(text, " %lf %d %d %d %d %d %d %d %d %d %c",
                              &ts, &v[0], &v[1], &v[2], &v[3], &v[4],
                              &v[5], &v[6], &v[7], &v[8], &dummy)
        # dummy is only examined when sscanf actually stored it (n > 10).
        if (n < 10) or (n > 10 and (dummy != '#' and dummy != '\n')):
            raise ValueError("wrong number of values: wanted 10, got " + str(n))
        for i in range(9):
            if v[i] < 0 or v[i] > 65535:
                raise ValueError("value out of range: " + str(v[i]))
        return (ts, [ts, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8]])
|
||||
|
||||
# Registry of layout instances keyed by class name: one instance of
# every class in this module that is Layout or derives from it.
named = {}
for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
    if issubclass(obj, Layout):
        named[name] = obj()
|
||||
|
||||
class Parser(object):
    """Object that parses and stores ASCII data for inclusion into the
    database"""

    def __init__(self, layout):
        """Accept either a Layout instance or a key of the module-level
        `named` registry.  Raises TypeError for an unknown key."""
        if isinstance(layout, Layout):
            self.layout = layout
        else:
            try:
                self.layout = named[layout]
            except KeyError:
                raise TypeError("unknown layout")

        self.data = []
        self.min_timestamp = None
        self.max_timestamp = None

    def parse(self, textdata):
        """
        Parse the data, provided as lines of text, using the current
        layout, into an internal data structure suitable for a
        pytables 'table.append(parser.data)'.

        self.data is replaced on every call.  Lines whose first
        character is '#' are skipped.  When at least one row was parsed,
        min_timestamp and max_timestamp are set from the first and last
        rows.  ValueError, IndexError and TypeError raised while handling
        a line are re-raised as ParserError carrying that line's number.
        """
        # Start below every real timestamp so the first row, including
        # one with a negative timestamp, always passes the ordering check.
        cdef double last_ts = float("-inf")
        cdef double ts
        cdef int n = 0
        cdef char *line

        indata = cStringIO.StringIO(textdata)
        # Assume any parsing error is a real error.
        # In the future we might want to skip completely empty lines,
        # or partial lines right before EOF?
        try:
            self.data = []
            for pyline in indata:
                # pyline stays referenced for the whole iteration, so the
                # char pointer borrowed from it remains valid.
                line = pyline
                n += 1
                # Plain '#': a single-character literal that compares as
                # a C char.  '\#' is not a recognised escape sequence.
                if line[0] == '#':
                    continue
                (ts, row) = self.layout.parse(line)
                if ts < last_ts:
                    raise ValueError("timestamp is not "
                                     "monotonically increasing")
                last_ts = ts
                self.data.append(row)
        except (ValueError, IndexError, TypeError) as e:
            raise ParserError(n, "error: " + e.message)

        # Mark timestamp ranges
        if len(self.data):
            self.min_timestamp = self.data[0][0]
            self.max_timestamp = self.data[-1][0]
|
|
@ -4,7 +4,7 @@ from nose.tools import *
|
|||
from nose.tools import assert_raises
|
||||
import distutils.version
|
||||
import json
|
||||
import itertools
|
||||
import itertools
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
@ -36,7 +36,7 @@ class TestLayouts(object):
|
|||
"1234567890.100000 1.1 2.2 3.3 4.4 5.5\n")
|
||||
with assert_raises(ParserError) as e:
|
||||
parser.parse(data)
|
||||
in_("index error", str(e.exception))
|
||||
in_("error", str(e.exception))
|
||||
|
||||
# too much data
|
||||
parser = Parser("PrepData")
|
||||
|
@ -44,21 +44,25 @@ class TestLayouts(object):
|
|||
"1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8 9.9\n")
|
||||
with assert_raises(ParserError) as e:
|
||||
parser.parse(data)
|
||||
in_("index error", str(e.exception))
|
||||
in_("error", str(e.exception))
|
||||
|
||||
# just right
|
||||
parser = Parser("PrepData")
|
||||
data = ( "1234567890.000000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n" +
|
||||
"1234567890.100000 1.1 2.2 3.3 4.4 5.5 6.6 7.7 8.8\n")
|
||||
parser.parse(data)
|
||||
eq_(parser.min_timestamp, 1234567890000000)
|
||||
eq_(parser.max_timestamp, 1234567890100000)
|
||||
|
||||
eq_(parser.min_timestamp, 1234567890.0)
|
||||
eq_(parser.max_timestamp, 1234567890.1)
|
||||
eq_(parser.data, [[1234567890.0,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8],
|
||||
[1234567890.1,1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8]])
|
||||
|
||||
# try RawData too, with clamping
|
||||
parser = Parser("RawData")
|
||||
data = ( "1234567890.000000 1 2 3 4 5 6\n" +
|
||||
"1234567890.100000 1 2 3 4 5 6\n" )
|
||||
parser.parse(data)
|
||||
eq_(parser.data, [[1234567890.0,1,2,3,4,5,6],
|
||||
[1234567890.1,1,2,3,4,5,6]])
|
||||
|
||||
# pass an instantiated class
|
||||
parser = Parser(RawNotchedData())
|
||||
|
@ -66,26 +70,6 @@ class TestLayouts(object):
|
|||
"1234567890.100000 1 2 3 4 5 6 7 8 9\n" )
|
||||
parser.parse(data)
|
||||
|
||||
# using generic parse function, rawdata
|
||||
class CrappyLayout(RawData):
|
||||
pass
|
||||
x = CrappyLayout()
|
||||
x.fields = x.fields + [("foo", "float32")]
|
||||
x.parse = super(RawData, x).parse
|
||||
parser = Parser(x)
|
||||
data = ( "1234567890.000000 1 2 3 4 5 6 1.1\n" +
|
||||
"1234567890.100000 1 2 3 4 5 6 2.2\n" )
|
||||
parser.parse(data)
|
||||
|
||||
# some invalid type
|
||||
x.fields = x.fields + [("faketype", "faketype")]
|
||||
parser = Parser(x)
|
||||
data = ( "1234567890.000000 1 2 3 4 5 6 1.1 fake\n" +
|
||||
"1234567890.100000 1 2 3 4 5 6 2.2 fake\n" )
|
||||
with assert_raises(ParserError) as e:
|
||||
parser.parse(data)
|
||||
in_("can't parse type", str(e.exception))
|
||||
|
||||
# non-monotonic
|
||||
parser = Parser("RawData")
|
||||
data = ( "1234567890.100000 1 2 3 4 5 6\n" +
|
||||
|
@ -100,7 +84,7 @@ class TestLayouts(object):
|
|||
"1234567890.100000 1 2 3 4 5 6\n" )
|
||||
with assert_raises(ParserError) as e:
|
||||
parser.parse(data)
|
||||
in_("data out of range", str(e.exception))
|
||||
in_("value out of range", str(e.exception))
|
||||
|
||||
# Empty data should work but is useless
|
||||
parser = Parser("RawData")
|
||||
|
|
Loading…
Reference in New Issue
Block a user