Browse Source

First pass at Python implementation of rocket

tags/nilmdb-1.3.0
Jim Paris 9 years ago
parent
commit
4406d51a98
4 changed files with 121 additions and 95 deletions
  1. +1
    -0
      Makefile
  2. +66
    -93
      nilmdb/server/bulkdata.py
  3. +54
    -0
      nilmdb/server/pyrocket.py
  4. +0
    -2
      tests/test_bulkdata.py

+ 1
- 0
Makefile View File

@@ -23,6 +23,7 @@ lint:
test:
ifeq ($(INSIDE_EMACS), t)
# Use the slightly more flexible script
python setup.py build_ext --inplace
python tests/runtests.py
else
# Let setup.py check dependencies, build stuff, and run the test


+ 66
- 93
nilmdb/server/bulkdata.py View File

@@ -9,19 +9,9 @@ from nilmdb.utils.printf import *

import os
import cPickle as pickle
import struct
import mmap
import re

# If we have the faulthandler module, use it. All of the mmap stuff
# might trigger a SIGSEGV or SIGBUS if we're not careful, and
# faulthandler will give a traceback in that case. (the Python
# interpreter will still die either way).
try: # pragma: no cover
import faulthandler
faulthandler.enable()
except: # pragma: no cover
pass
from . import pyrocket as rocket

# Up to 256 open file descriptors at any given time.
# These variables are global so they can be used in the decorator arguments.
@@ -83,26 +73,6 @@ class BulkData(object):
raise ValueError("invalid path; path must contain at least one "
"folder")

# Get layout, and build format string for struct module
try:
layout = nilmdb.server.layout.get_named(layout_name)
struct_fmt = '<d' # Little endian, double timestamp
struct_mapping = {
"int8": 'b',
"uint8": 'B',
"int16": 'h',
"uint16": 'H',
"int32": 'i',
"uint32": 'I',
"int64": 'q',
"uint64": 'Q',
"float32": 'f',
"float64": 'd',
}
struct_fmt += struct_mapping[layout.datatype] * layout.count
except KeyError:
raise ValueError("no such layout, or bad data types")

# Create the table. Note that we make a distinction here
# between NilmDB paths (always Unix style, split apart
# manually) and OS paths (built up with os.path.join)
@@ -122,11 +92,16 @@ class BulkData(object):
raise ValueError("subdirs of this path already exist")
os.mkdir(ospath)

# Write format string to file
Table.create(ospath, struct_fmt, self.file_size, self.files_per_dir)
try:
# Write format string to file
Table.create(ospath, layout_name, self.file_size,
self.files_per_dir)

# Open and cache it
self.getnode(unicodepath)
# Open and cache it
self.getnode(unicodepath)
except:
os.rmdir(ospath)
raise

# Success
return
@@ -173,8 +148,8 @@ class BulkData(object):

@nilmdb.utils.must_close(wrap_verify = False)
class File(object):
"""Object representing a single file on disk. Data can be appended,
or the self.mmap handle can be used for random reads."""
"""Object representing a single file on disk. Data can be appended
or extracted using the rocket functions."""

def __init__(self, root, subdir, filename):
# Create path if it doesn't exist
@@ -190,46 +165,22 @@ class File(object):
self._f.seek(0, 2)
self.size = self._f.tell()

# Open mmap object
self.mmap = None
self._mmap_reopen()

def _mmap_reopen(self):
if self.size == 0:
# Don't mmap if the file is empty; it would fail
pass
elif self.mmap is None:
# Not opened yet, so open it
self.mmap = mmap.mmap(self._f.fileno(), 0)
else:
# Already opened, so just resize it
self.mmap.resize(self.size)

def close(self):
if self.mmap is not None:
self.mmap.close()
self._f.close()

def append(self, data): # pragma: no cover (below version used instead)
# Write data, flush it, and resize our mmap accordingly
self._f.write(data)
self._f.flush()
self.size += len(data)
self._mmap_reopen()

def append_pack_iter(self, count, packer, dataiter):
# An optimized verison of append, to avoid flushing the file
# and resizing the mmap after each data point.
def append_rocket_iter(self, rocket, rows, dataiter):
try:
rows = []
for i in xrange(count):
row = dataiter.next()
rows.append(packer(*row))
self._f.write("".join(rows))
for i in xrange(rows):
rocket.append_list(self._f, [dataiter.next()])
finally:
self._f.flush()
self.size = self._f.tell()
self._mmap_reopen()

def extract_rocket_list(self, rocket, offset, rows):
return rocket.extract_list(self._f, offset, rows)

def extract_rocket_string(self, rocket, offset, rows):
return rocket.extract_string(self._f, offset, rows)

@nilmdb.utils.must_close(wrap_verify = False)
class Table(object):
@@ -243,19 +194,19 @@ class Table(object):
return os.path.isfile(os.path.join(root, "_format"))

@classmethod
def create(cls, root, struct_fmt, file_size, files_per_dir):
"""Initialize a table at the given OS path.
'struct_fmt' is a Struct module format description"""
def create(cls, root, layout, file_size, files_per_dir):
"""Initialize a table at the given OS path with the
given layout string"""

# Calculate rows per file so that each file is approximately
# file_size bytes.
packer = struct.Struct(struct_fmt)
rows_per_file = max(file_size // packer.size, 1)
rkt = rocket.Rocket(layout)
rows_per_file = max(file_size // rkt.binary_size, 1)

fmt = { "rows_per_file": rows_per_file,
"files_per_dir": files_per_dir,
"struct_fmt": struct_fmt,
"version": 1 }
"layout": layout,
"version": 2 }
with open(os.path.join(root, "_format"), "wb") as f:
pickle.dump(fmt, f, 2)

@@ -264,18 +215,18 @@ class Table(object):
"""'root' is the full OS path to the directory of this table"""
self.root = root

# Load the format and build packer
# Load the format and build rocket
with open(os.path.join(self.root, "_format"), "rb") as f:
fmt = pickle.load(f)

if fmt["version"] != 1: # pragma: no cover (just future proofing)
if fmt["version"] != 2: # pragma: no cover (just future proofing)
raise NotImplementedError("version " + fmt["version"] +
" bulk data store not supported")

self.rows_per_file = fmt["rows_per_file"]
self.files_per_dir = fmt["files_per_dir"]
self.packer = struct.Struct(fmt["struct_fmt"])
self.file_size = self.packer.size * self.rows_per_file
self.rocket = rocket.Rocket(fmt["layout"])
self.file_size = self.rocket.binary_size * self.rows_per_file

# Find nrows
self.nrows = self._get_nrows()
@@ -330,17 +281,19 @@ class Table(object):
# will just get longer but will still sort correctly.
dirname = sprintf("%04x", filenum // self.files_per_dir)
filename = sprintf("%04x", filenum % self.files_per_dir)
offset = (row % self.rows_per_file) * self.packer.size
offset = (row % self.rows_per_file) * self.rocket.binary_size
count = self.rows_per_file - (row % self.rows_per_file)
return (dirname, filename, offset, count)

def _row_from_offset(self, subdir, filename, offset):
"""Return the row number that corresponds to the given
'subdir/filename' and byte-offset within that file."""
if (offset % self.packer.size) != 0: # pragma: no cover; shouldn't occur
if (offset % self.rocket.binary_size) != 0: # pragma: no cover
# shouldn't occur, unless there is some corruption somewhere
raise ValueError("file offset is not a multiple of data size")
filenum = int(subdir, 16) * self.files_per_dir + int(filename, 16)
row = (filenum * self.rows_per_file) + (offset // self.packer.size)
row = ((filenum * self.rows_per_file) +
(offset // self.rocket.binary_size))
return row

# Cache open files
@@ -365,10 +318,33 @@ class Table(object):
f = self.file_open(subdir, fname)

# Write the data
f.append_pack_iter(count, self.packer.pack, dataiter)
f.append_rocket_iter(self.rocket, count, dataiter)
remaining -= count
self.nrows += count

def get_string(self, start, stop):
"""Extract data corresponding to Python range [n:m],
and returns a formatted string"""
if (start is None or
stop is None or
start >= stop or
start < 0 or
stop > self.nrows):
raise IndexError("Index out of range")

ret_chunks = []
row = start
remaining = stop - start
while remaining:
(subdir, filename, offset, count) = self._offset_from_row(row)
if count > remaining:
count = remaining
f = self.file_open(subdir, filename)
ret.append(f.extract_rocket_string(self.rocket, offset, count))
remaining -= count
row += count
return "".join(ret)

def __getitem__(self, key):
"""Extract data and return it. Supports simple indexing
(table[n]) and range slices (table[n:m]). Returns a nested
@@ -392,10 +368,8 @@ class Table(object):
(subdir, filename, offset, count) = self._offset_from_row(row)
if count > remaining:
count = remaining
mm = self.file_open(subdir, filename).mmap
for i in xrange(count):
ret.append(list(self.packer.unpack_from(mm, offset)))
offset += self.packer.size
f = self.file_open(subdir, filename)
ret.extend(f.extract_rocket_list(self.rocket, offset, count))
remaining -= count
row += count
return ret
@@ -404,9 +378,8 @@ class Table(object):
if key < 0 or key >= self.nrows:
raise IndexError("Index out of range")
(subdir, filename, offset, count) = self._offset_from_row(key)
mm = self.file_open(subdir, filename).mmap
# unpack_from ignores the mmap object's current seek position
return list(self.packer.unpack_from(mm, offset))
f = self.file_open(subdir, filename)
return f.extract_rocket_list(self.rocket, offset, 1)[0]

def _remove_rows(self, subdir, filename, start, stop):
"""Helper to mark specific rows as being removed from a
@@ -485,7 +458,7 @@ class Table(object):
(subdir, filename, offset, count) = self._offset_from_row(row)
if count > remaining:
count = remaining
row_offset = offset // self.packer.size
row_offset = offset // self.rocket.binary_size
# Mark the rows as being removed
self._remove_rows(subdir, filename, row_offset, row_offset + count)
remaining -= count


+ 54
- 0
nilmdb/server/pyrocket.py View File

@@ -0,0 +1,54 @@
# Python implementation of the "rocket" data parsing interface

import struct

class Rocket(object):
def __init__(self, layout):
self.layout = layout

# For packing/unpacking into a binary file.
# This will change in the C version
(self.ltype, lcount) = layout.split('_', 2)
self.lcount = int(lcount)
try:
struct_fmt = '<d' # Little endian, double timestamp
struct_mapping = {
"int8": 'b',
"uint8": 'B',
"int16": 'h',
"uint16": 'H',
"int32": 'i',
"uint32": 'I',
"int64": 'q',
"uint64": 'Q',
"float32": 'f',
"float64": 'd',
}
struct_fmt += struct_mapping[self.ltype] * self.lcount
except KeyError:
raise ValueError("no such layout, or bad data types")
self.packer = struct.Struct(struct_fmt)

@property
def binary_size(self):
"""Return size of one row of data in the binary file, in bytes"""
return self.packer.size

def append_list(self, file, data):
"""Append the list data to the file"""
for row in data:
file.write(self.packer.pack(*row))
file.flush()

def extract_list(self, file, offset, count):
"""Extract count rows of data from the file at offset offset.
Return a list of lists [[row],[row],...]"""
ret = []
file.seek(offset)
for i in xrange(count):
data = file.read(self.binary_size)
ret.append(list(self.packer.unpack(data)))
return ret

def extract_string(self, rocket, offset, rows):
pass

+ 0
- 2
tests/test_bulkdata.py View File

@@ -35,8 +35,6 @@ class TestBulkData(object):
data.create("/foo", "uint16_8")
with assert_raises(ValueError):
data.create("foo/bar", "uint16_8")
with assert_raises(ValueError):
data.create("/foo/bar", "uint8_8")
data.create("/foo/bar", "uint16_8")
data.create(u"/foo/baz/quux", "float64_16")
with assert_raises(ValueError):


Loading…
Cancel
Save