First pass at Python implementation of rocket

This commit is contained in:
Jim Paris 2013-03-03 13:27:47 -05:00
parent 9b6de6ecb7
commit 4406d51a98
4 changed files with 121 additions and 95 deletions

View File

@ -23,6 +23,7 @@ lint:
test: test:
ifeq ($(INSIDE_EMACS), t) ifeq ($(INSIDE_EMACS), t)
# Use the slightly more flexible script # Use the slightly more flexible script
python setup.py build_ext --inplace
python tests/runtests.py python tests/runtests.py
else else
# Let setup.py check dependencies, build stuff, and run the test # Let setup.py check dependencies, build stuff, and run the test

View File

@ -9,19 +9,9 @@ from nilmdb.utils.printf import *
import os import os
import cPickle as pickle import cPickle as pickle
import struct
import mmap
import re import re
# If we have the faulthandler module, use it. All of the mmap stuff from . import pyrocket as rocket
# might trigger a SIGSEGV or SIGBUS if we're not careful, and
# faulthandler will give a traceback in that case. (the Python
# interpreter will still die either way).
try: # pragma: no cover
import faulthandler
faulthandler.enable()
except: # pragma: no cover
pass
# Up to 256 open file descriptors at any given time. # Up to 256 open file descriptors at any given time.
# These variables are global so they can be used in the decorator arguments. # These variables are global so they can be used in the decorator arguments.
@ -83,26 +73,6 @@ class BulkData(object):
raise ValueError("invalid path; path must contain at least one " raise ValueError("invalid path; path must contain at least one "
"folder") "folder")
# Get layout, and build format string for struct module
try:
layout = nilmdb.server.layout.get_named(layout_name)
struct_fmt = '<d' # Little endian, double timestamp
struct_mapping = {
"int8": 'b',
"uint8": 'B',
"int16": 'h',
"uint16": 'H',
"int32": 'i',
"uint32": 'I',
"int64": 'q',
"uint64": 'Q',
"float32": 'f',
"float64": 'd',
}
struct_fmt += struct_mapping[layout.datatype] * layout.count
except KeyError:
raise ValueError("no such layout, or bad data types")
# Create the table. Note that we make a distinction here # Create the table. Note that we make a distinction here
# between NilmDB paths (always Unix style, split apart # between NilmDB paths (always Unix style, split apart
# manually) and OS paths (built up with os.path.join) # manually) and OS paths (built up with os.path.join)
@ -122,11 +92,16 @@ class BulkData(object):
raise ValueError("subdirs of this path already exist") raise ValueError("subdirs of this path already exist")
os.mkdir(ospath) os.mkdir(ospath)
# Write format string to file try:
Table.create(ospath, struct_fmt, self.file_size, self.files_per_dir) # Write format string to file
Table.create(ospath, layout_name, self.file_size,
self.files_per_dir)
# Open and cache it # Open and cache it
self.getnode(unicodepath) self.getnode(unicodepath)
except:
os.rmdir(ospath)
raise
# Success # Success
return return
@ -173,8 +148,8 @@ class BulkData(object):
@nilmdb.utils.must_close(wrap_verify = False) @nilmdb.utils.must_close(wrap_verify = False)
class File(object): class File(object):
"""Object representing a single file on disk. Data can be appended, """Object representing a single file on disk. Data can be appended
or the self.mmap handle can be used for random reads.""" or extracted using the rocket functions."""
def __init__(self, root, subdir, filename): def __init__(self, root, subdir, filename):
# Create path if it doesn't exist # Create path if it doesn't exist
@ -190,46 +165,22 @@ class File(object):
self._f.seek(0, 2) self._f.seek(0, 2)
self.size = self._f.tell() self.size = self._f.tell()
# Open mmap object
self.mmap = None
self._mmap_reopen()
def _mmap_reopen(self):
if self.size == 0:
# Don't mmap if the file is empty; it would fail
pass
elif self.mmap is None:
# Not opened yet, so open it
self.mmap = mmap.mmap(self._f.fileno(), 0)
else:
# Already opened, so just resize it
self.mmap.resize(self.size)
def close(self): def close(self):
if self.mmap is not None:
self.mmap.close()
self._f.close() self._f.close()
def append(self, data): # pragma: no cover (below version used instead) def append_rocket_iter(self, rocket, rows, dataiter):
# Write data, flush it, and resize our mmap accordingly
self._f.write(data)
self._f.flush()
self.size += len(data)
self._mmap_reopen()
def append_pack_iter(self, count, packer, dataiter):
# An optimized verison of append, to avoid flushing the file
# and resizing the mmap after each data point.
try: try:
rows = [] for i in xrange(rows):
for i in xrange(count): rocket.append_list(self._f, [dataiter.next()])
row = dataiter.next()
rows.append(packer(*row))
self._f.write("".join(rows))
finally: finally:
self._f.flush() self._f.flush()
self.size = self._f.tell() self.size = self._f.tell()
self._mmap_reopen()
def extract_rocket_list(self, rocket, offset, rows):
return rocket.extract_list(self._f, offset, rows)
def extract_rocket_string(self, rocket, offset, rows):
return rocket.extract_string(self._f, offset, rows)
@nilmdb.utils.must_close(wrap_verify = False) @nilmdb.utils.must_close(wrap_verify = False)
class Table(object): class Table(object):
@ -243,19 +194,19 @@ class Table(object):
return os.path.isfile(os.path.join(root, "_format")) return os.path.isfile(os.path.join(root, "_format"))
@classmethod @classmethod
def create(cls, root, struct_fmt, file_size, files_per_dir): def create(cls, root, layout, file_size, files_per_dir):
"""Initialize a table at the given OS path. """Initialize a table at the given OS path with the
'struct_fmt' is a Struct module format description""" given layout string"""
# Calculate rows per file so that each file is approximately # Calculate rows per file so that each file is approximately
# file_size bytes. # file_size bytes.
packer = struct.Struct(struct_fmt) rkt = rocket.Rocket(layout)
rows_per_file = max(file_size // packer.size, 1) rows_per_file = max(file_size // rkt.binary_size, 1)
fmt = { "rows_per_file": rows_per_file, fmt = { "rows_per_file": rows_per_file,
"files_per_dir": files_per_dir, "files_per_dir": files_per_dir,
"struct_fmt": struct_fmt, "layout": layout,
"version": 1 } "version": 2 }
with open(os.path.join(root, "_format"), "wb") as f: with open(os.path.join(root, "_format"), "wb") as f:
pickle.dump(fmt, f, 2) pickle.dump(fmt, f, 2)
@ -264,18 +215,18 @@ class Table(object):
"""'root' is the full OS path to the directory of this table""" """'root' is the full OS path to the directory of this table"""
self.root = root self.root = root
# Load the format and build packer # Load the format and build rocket
with open(os.path.join(self.root, "_format"), "rb") as f: with open(os.path.join(self.root, "_format"), "rb") as f:
fmt = pickle.load(f) fmt = pickle.load(f)
if fmt["version"] != 1: # pragma: no cover (just future proofing) if fmt["version"] != 2: # pragma: no cover (just future proofing)
raise NotImplementedError("version " + fmt["version"] + raise NotImplementedError("version " + fmt["version"] +
" bulk data store not supported") " bulk data store not supported")
self.rows_per_file = fmt["rows_per_file"] self.rows_per_file = fmt["rows_per_file"]
self.files_per_dir = fmt["files_per_dir"] self.files_per_dir = fmt["files_per_dir"]
self.packer = struct.Struct(fmt["struct_fmt"]) self.rocket = rocket.Rocket(fmt["layout"])
self.file_size = self.packer.size * self.rows_per_file self.file_size = self.rocket.binary_size * self.rows_per_file
# Find nrows # Find nrows
self.nrows = self._get_nrows() self.nrows = self._get_nrows()
@ -330,17 +281,19 @@ class Table(object):
# will just get longer but will still sort correctly. # will just get longer but will still sort correctly.
dirname = sprintf("%04x", filenum // self.files_per_dir) dirname = sprintf("%04x", filenum // self.files_per_dir)
filename = sprintf("%04x", filenum % self.files_per_dir) filename = sprintf("%04x", filenum % self.files_per_dir)
offset = (row % self.rows_per_file) * self.packer.size offset = (row % self.rows_per_file) * self.rocket.binary_size
count = self.rows_per_file - (row % self.rows_per_file) count = self.rows_per_file - (row % self.rows_per_file)
return (dirname, filename, offset, count) return (dirname, filename, offset, count)
def _row_from_offset(self, subdir, filename, offset): def _row_from_offset(self, subdir, filename, offset):
"""Return the row number that corresponds to the given """Return the row number that corresponds to the given
'subdir/filename' and byte-offset within that file.""" 'subdir/filename' and byte-offset within that file."""
if (offset % self.packer.size) != 0: # pragma: no cover; shouldn't occur if (offset % self.rocket.binary_size) != 0: # pragma: no cover
# shouldn't occur, unless there is some corruption somewhere
raise ValueError("file offset is not a multiple of data size") raise ValueError("file offset is not a multiple of data size")
filenum = int(subdir, 16) * self.files_per_dir + int(filename, 16) filenum = int(subdir, 16) * self.files_per_dir + int(filename, 16)
row = (filenum * self.rows_per_file) + (offset // self.packer.size) row = ((filenum * self.rows_per_file) +
(offset // self.rocket.binary_size))
return row return row
# Cache open files # Cache open files
@ -365,10 +318,33 @@ class Table(object):
f = self.file_open(subdir, fname) f = self.file_open(subdir, fname)
# Write the data # Write the data
f.append_pack_iter(count, self.packer.pack, dataiter) f.append_rocket_iter(self.rocket, count, dataiter)
remaining -= count remaining -= count
self.nrows += count self.nrows += count
def get_string(self, start, stop):
    """Extract data corresponding to Python range [start:stop) and
    return it as a single formatted string.

    The string is assembled from per-file chunks produced by the
    rocket's extract_string; rows may span several on-disk files,
    so we walk file-by-file until the range is exhausted.

    Raises IndexError if either bound is None or the range is empty
    or falls outside [0, self.nrows].
    """
    if (start is None or
        stop is None or
        start >= stop or
        start < 0 or
        stop > self.nrows):
        raise IndexError("Index out of range")
    # BUG FIX: the original initialized 'ret_chunks' but then used an
    # undefined name 'ret', raising NameError on first use.  Use one
    # consistent local for the accumulated chunks.
    chunks = []
    row = start
    remaining = stop - start
    while remaining:
        (subdir, filename, offset, count) = self._offset_from_row(row)
        # One file may not cover the whole remaining range; clamp.
        if count > remaining:
            count = remaining
        f = self.file_open(subdir, filename)
        chunks.append(f.extract_rocket_string(self.rocket, offset, count))
        remaining -= count
        row += count
    return "".join(chunks)
def __getitem__(self, key): def __getitem__(self, key):
"""Extract data and return it. Supports simple indexing """Extract data and return it. Supports simple indexing
(table[n]) and range slices (table[n:m]). Returns a nested (table[n]) and range slices (table[n:m]). Returns a nested
@ -392,10 +368,8 @@ class Table(object):
(subdir, filename, offset, count) = self._offset_from_row(row) (subdir, filename, offset, count) = self._offset_from_row(row)
if count > remaining: if count > remaining:
count = remaining count = remaining
mm = self.file_open(subdir, filename).mmap f = self.file_open(subdir, filename)
for i in xrange(count): ret.extend(f.extract_rocket_list(self.rocket, offset, count))
ret.append(list(self.packer.unpack_from(mm, offset)))
offset += self.packer.size
remaining -= count remaining -= count
row += count row += count
return ret return ret
@ -404,9 +378,8 @@ class Table(object):
if key < 0 or key >= self.nrows: if key < 0 or key >= self.nrows:
raise IndexError("Index out of range") raise IndexError("Index out of range")
(subdir, filename, offset, count) = self._offset_from_row(key) (subdir, filename, offset, count) = self._offset_from_row(key)
mm = self.file_open(subdir, filename).mmap f = self.file_open(subdir, filename)
# unpack_from ignores the mmap object's current seek position return f.extract_rocket_list(self.rocket, offset, 1)[0]
return list(self.packer.unpack_from(mm, offset))
def _remove_rows(self, subdir, filename, start, stop): def _remove_rows(self, subdir, filename, start, stop):
"""Helper to mark specific rows as being removed from a """Helper to mark specific rows as being removed from a
@ -485,7 +458,7 @@ class Table(object):
(subdir, filename, offset, count) = self._offset_from_row(row) (subdir, filename, offset, count) = self._offset_from_row(row)
if count > remaining: if count > remaining:
count = remaining count = remaining
row_offset = offset // self.packer.size row_offset = offset // self.rocket.binary_size
# Mark the rows as being removed # Mark the rows as being removed
self._remove_rows(subdir, filename, row_offset, row_offset + count) self._remove_rows(subdir, filename, row_offset, row_offset + count)
remaining -= count remaining -= count

54
nilmdb/server/pyrocket.py Normal file
View File

@ -0,0 +1,54 @@
# Python implementation of the "rocket" data parsing interface
import struct
class Rocket(object):
    """Pack and unpack binary rows of data for a single table layout.

    A layout string has the form "<datatype>_<count>", e.g. "uint16_8".
    Each binary row is a little-endian double timestamp followed by
    <count> values of <datatype>.
    """

    # Mapping from layout datatype names to struct format characters.
    _STRUCT_MAPPING = {
        "int8": 'b',
        "uint8": 'B',
        "int16": 'h',
        "uint16": 'H',
        "int32": 'i',
        "uint32": 'I',
        "int64": 'q',
        "uint64": 'Q',
        "float32": 'f',
        "float64": 'd',
    }

    def __init__(self, layout):
        """Build a packer for the given layout string.

        Raises ValueError if the layout string is malformed or names
        an unknown data type.
        """
        self.layout = layout
        # For packing/unpacking into a binary file.
        # This will change in the C version.
        try:
            # maxsplit=1: a valid layout has exactly one underscore.
            # (The original used split('_', 2), which let a two-
            # underscore string raise an unpack error outside the try.)
            (self.ltype, lcount) = layout.split('_', 1)
            self.lcount = int(lcount)
            # Little endian, double timestamp, then the data columns
            struct_fmt = ('<d' +
                          self._STRUCT_MAPPING[self.ltype] * self.lcount)
        except (KeyError, ValueError):
            raise ValueError("no such layout, or bad data types")
        self.packer = struct.Struct(struct_fmt)

    @property
    def binary_size(self):
        """Return size of one row of data in the binary file, in bytes"""
        return self.packer.size

    def append_list(self, file, data):
        """Pack each row in the list 'data' and append it to 'file',
        then flush so the file size is immediately accurate."""
        for row in data:
            file.write(self.packer.pack(*row))
        file.flush()

    def extract_list(self, file, offset, count):
        """Extract 'count' rows of data from 'file' at byte offset
        'offset'.  Return a list of lists [[row],[row],...]"""
        ret = []
        file.seek(offset)
        # 'range' rather than 'xrange': equivalent in a loop, and
        # works on both Python 2 and 3.
        for i in range(count):
            data = file.read(self.binary_size)
            ret.append(list(self.packer.unpack(data)))
        return ret

    def extract_string(self, file, offset, rows):
        """Extract 'rows' rows of data from 'file' at byte offset
        'offset' and return them as a formatted string.

        BUG FIX: the original signature misnamed the first parameter
        'rocket' (the caller passes a file object) and silently
        returned None via 'pass', which would crash later in a string
        join.  Fail loudly until this is implemented.
        """
        raise NotImplementedError("extract_string not yet implemented")

View File

@ -35,8 +35,6 @@ class TestBulkData(object):
data.create("/foo", "uint16_8") data.create("/foo", "uint16_8")
with assert_raises(ValueError): with assert_raises(ValueError):
data.create("foo/bar", "uint16_8") data.create("foo/bar", "uint16_8")
with assert_raises(ValueError):
data.create("/foo/bar", "uint8_8")
data.create("/foo/bar", "uint16_8") data.create("/foo/bar", "uint16_8")
data.create(u"/foo/baz/quux", "float64_16") data.create(u"/foo/baz/quux", "float64_16")
with assert_raises(ValueError): with assert_raises(ValueError):