@@ -9,19 +9,9 @@ from nilmdb.utils.printf import *
 
 import os
 import cPickle as pickle
-import struct
-import mmap
 import re
 
-# If we have the faulthandler module, use it. All of the mmap stuff
-# might trigger a SIGSEGV or SIGBUS if we're not careful, and
-# faulthandler will give a traceback in that case. (the Python
-# interpreter will still die either way).
-try: # pragma: no cover
-    import faulthandler
-    faulthandler.enable()
-except: # pragma: no cover
-    pass
+from . import pyrocket as rocket
 
 # Up to 256 open file descriptors at any given time.
 # These variables are global so they can be used in the decorator arguments.
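
The block dropped here was a debugging aid: faulthandler prints a Python
traceback when the process takes a fatal signal such as SIGSEGV or SIGBUS,
which mmap access can raise (for example, if a mapped file is truncated
underneath the process). With the mmap code gone, the guard goes with it.
For reference, a minimal sketch of the optional-import idiom, with the bare
except narrowed to ImportError:

    try:
        import faulthandler       # stdlib in Python 3.3+; a backport on 2.x
        faulthandler.enable()     # print a traceback on SIGSEGV/SIGBUS/etc.
    except ImportError:
        pass                      # purely diagnostic; safe to run without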
|
|
@@ -83,26 +73,6 @@ class BulkData(object):
             raise ValueError("invalid path; path must contain at least one "
                              "folder")
 
-        # Get layout, and build format string for struct module
-        try:
-            layout = nilmdb.server.layout.get_named(layout_name)
-            struct_fmt = '<d' # Little endian, double timestamp
-            struct_mapping = {
-                "int8": 'b',
-                "uint8": 'B',
-                "int16": 'h',
-                "uint16": 'H',
-                "int32": 'i',
-                "uint32": 'I',
-                "int64": 'q',
-                "uint64": 'Q',
-                "float32": 'f',
-                "float64": 'd',
-                }
-            struct_fmt += struct_mapping[layout.datatype] * layout.count
-        except KeyError:
-            raise ValueError("no such layout, or bad data types")
-
         # Create the table. Note that we make a distinction here
         # between NilmDB paths (always Unix style, split apart
         # manually) and OS paths (built up with os.path.join)
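
The deleted mapping is easiest to follow with a concrete layout. Taking
uint16_6 as an assumed example (six unsigned 16-bit data columns behind a
little-endian double timestamp), the format string and row size work out as:

    import struct

    struct_fmt = '<d' + 'H' * 6                  # '<dHHHHHH'
    packer = struct.Struct(struct_fmt)
    print packer.size                            # 20 bytes: 8 (double) + 6*2
    packer.pack(1234567890.0, 1, 2, 3, 4, 5, 6)  # one packed binary row

The equivalent layout validation presumably moves into the rocket
implementation, which first sees the layout string in Table.create below.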
|
|
@@ -122,11 +92,16 @@ class BulkData(object):
             raise ValueError("subdirs of this path already exist")
         os.mkdir(ospath)
 
-        # Write format string to file
-        Table.create(ospath, struct_fmt, self.file_size, self.files_per_dir)
+        try:
+            # Write format string to file
+            Table.create(ospath, layout_name, self.file_size,
+                         self.files_per_dir)
 
-            # Open and cache it
-            self.getnode(unicodepath)
+            # Open and cache it
+            self.getnode(unicodepath)
+        except:
+            os.rmdir(ospath)
+            raise
 
         # Success
         return
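
The net effect of this hunk: table creation is no longer left half-done on
failure. If Table.create or the getnode cache fill raises (e.g. on a bad
layout name), the just-created ospath is removed before the exception
propagates, so no empty table directory is left behind.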
|
|
@@ -173,8 +148,8 @@ class BulkData(object):
 
 @nilmdb.utils.must_close(wrap_verify = False)
 class File(object):
-    """Object representing a single file on disk. Data can be appended,
-    or the self.mmap handle can be used for random reads."""
+    """Object representing a single file on disk. Data can be appended
+    or extracted using the rocket functions."""
 
     def __init__(self, root, subdir, filename):
         # Create path if it doesn't exist
|
|
@@ -190,46 +165,22 @@ class File(object):
         self._f.seek(0, 2)
         self.size = self._f.tell()
 
-        # Open mmap object
-        self.mmap = None
-        self._mmap_reopen()
-
-    def _mmap_reopen(self):
-        if self.size == 0:
-            # Don't mmap if the file is empty; it would fail
-            pass
-        elif self.mmap is None:
-            # Not opened yet, so open it
-            self.mmap = mmap.mmap(self._f.fileno(), 0)
-        else:
-            # Already opened, so just resize it
-            self.mmap.resize(self.size)
-
     def close(self):
-        if self.mmap is not None:
-            self.mmap.close()
         self._f.close()
 
-    def append(self, data): # pragma: no cover (below version used instead)
-        # Write data, flush it, and resize our mmap accordingly
-        self._f.write(data)
-        self._f.flush()
-        self.size += len(data)
-        self._mmap_reopen()
-
-    def append_pack_iter(self, count, packer, dataiter):
-        # An optimized verison of append, to avoid flushing the file
-        # and resizing the mmap after each data point.
+    def append_rocket_iter(self, rocket, rows, dataiter):
         try:
-            rows = []
-            for i in xrange(count):
-                row = dataiter.next()
-                rows.append(packer(*row))
-            self._f.write("".join(rows))
+            for i in xrange(rows):
+                rocket.append_list(self._f, [dataiter.next()])
         finally:
             self._f.flush()
             self.size = self._f.tell()
-            self._mmap_reopen()
+
+    def extract_rocket_list(self, rocket, offset, rows):
+        return rocket.extract_list(self._f, offset, rows)
+
+    def extract_rocket_string(self, rocket, offset, rows):
+        return rocket.extract_string(self._f, offset, rows)
 
 @nilmdb.utils.must_close(wrap_verify = False)
 class Table(object):
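
Taken together, these File methods define the entire surface this module
needs from a rocket: construction from a layout string, a binary_size
attribute, append_list(file, rows), extract_list(file, offset, rows)
returning nested lists, and extract_string(file, offset, rows) returning
pre-formatted text. A hypothetical round trip under exactly those
assumptions (names inferred from the calls above, not from pyrocket itself,
and the import path is a guess):

    from nilmdb.server import pyrocket as rocket   # path assumed

    rkt = rocket.Rocket("uint16_6")                # assumed layout name
    with open("0000", "w+b") as f:
        rkt.append_list(f, [[1234567890.0, 1, 2, 3, 4, 5, 6]])
        print rkt.extract_list(f, 0, 1)            # [[1234567890.0, 1, ...]]

Note that append_rocket_iter still feeds append_list one row per loop
iteration; only the flush and size update are deferred to the finally
block, which is where the old version's per-append mmap resize used to be.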
|
|
@@ -243,19 +194,19 @@ class Table(object):
         return os.path.isfile(os.path.join(root, "_format"))
 
     @classmethod
-    def create(cls, root, struct_fmt, file_size, files_per_dir):
-        """Initialize a table at the given OS path.
-        'struct_fmt' is a Struct module format description"""
+    def create(cls, root, layout, file_size, files_per_dir):
+        """Initialize a table at the given OS path with the
+        given layout string"""
 
         # Calculate rows per file so that each file is approximately
         # file_size bytes.
-        packer = struct.Struct(struct_fmt)
-        rows_per_file = max(file_size // packer.size, 1)
+        rkt = rocket.Rocket(layout)
+        rows_per_file = max(file_size // rkt.binary_size, 1)
 
         fmt = { "rows_per_file": rows_per_file,
                 "files_per_dir": files_per_dir,
-                "struct_fmt": struct_fmt,
-                "version": 1 }
+                "layout": layout,
+                "version": 2 }
         with open(os.path.join(root, "_format"), "wb") as f:
             pickle.dump(fmt, f, 2)
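
To make the sizing concrete: each data file holds a whole number of
fixed-size rows, so the real file size rounds down from the target. With
the assumed 20-byte uint16_6 rows from earlier and a hypothetical 16 MiB
target:

    binary_size = 20                        # assumed layout, as above
    file_size = 16 * 1024 * 1024            # hypothetical target
    rows_per_file = max(file_size // binary_size, 1)
    print rows_per_file                     # 838860
    print rows_per_file * binary_size       # 16777200, just under 16 MiB

Bumping "version" to 2 means an existing version-1 (struct_fmt) store is
rejected at open time rather than silently misread; this patch provides no
in-place migration path.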
|
|
|
|
|
|
@@ -264,18 +215,18 @@ class Table(object):
         """'root' is the full OS path to the directory of this table"""
         self.root = root
 
-        # Load the format and build packer
+        # Load the format and build rocket
         with open(os.path.join(self.root, "_format"), "rb") as f:
            fmt = pickle.load(f)
 
-        if fmt["version"] != 1: # pragma: no cover (just future proofing)
+        if fmt["version"] != 2: # pragma: no cover (just future proofing)
             raise NotImplementedError("version " + fmt["version"] +
                                       " bulk data store not supported")
 
         self.rows_per_file = fmt["rows_per_file"]
         self.files_per_dir = fmt["files_per_dir"]
-        self.packer = struct.Struct(fmt["struct_fmt"])
-        self.file_size = self.packer.size * self.rows_per_file
+        self.rocket = rocket.Rocket(fmt["layout"])
+        self.file_size = self.rocket.binary_size * self.rows_per_file
 
         # Find nrows
         self.nrows = self._get_nrows()
|
|
@@ -330,17 +281,19 @@ class Table(object):
         # will just get longer but will still sort correctly.
         dirname = sprintf("%04x", filenum // self.files_per_dir)
         filename = sprintf("%04x", filenum % self.files_per_dir)
-        offset = (row % self.rows_per_file) * self.packer.size
+        offset = (row % self.rows_per_file) * self.rocket.binary_size
         count = self.rows_per_file - (row % self.rows_per_file)
         return (dirname, filename, offset, count)
 
     def _row_from_offset(self, subdir, filename, offset):
         """Return the row number that corresponds to the given
         'subdir/filename' and byte-offset within that file."""
-        if (offset % self.packer.size) != 0: # pragma: no cover; shouldn't occur
+        if (offset % self.rocket.binary_size) != 0: # pragma: no cover
+            # shouldn't occur, unless there is some corruption somewhere
             raise ValueError("file offset is not a multiple of data size")
         filenum = int(subdir, 16) * self.files_per_dir + int(filename, 16)
-        row = (filenum * self.rows_per_file) + (offset // self.packer.size)
+        row = ((filenum * self.rows_per_file) +
+               (offset // self.rocket.binary_size))
         return row
 
     # Cache open files
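
The two helpers are exact inverses: a global row number factors into
(file number, row within file), and the file number factors again into
(directory, file), both rendered as four-digit hex. A standalone sketch
with assumed parameters:

    rows_per_file, files_per_dir, binary_size = 1000, 32768, 20

    row = 2500000
    filenum = row // rows_per_file                    # 2500
    dirname = "%04x" % (filenum // files_per_dir)     # '0000'
    filename = "%04x" % (filenum % files_per_dir)     # '09c4'
    offset = (row % rows_per_file) * binary_size      # 0

    # ...and back:
    filenum = int(dirname, 16) * files_per_dir + int(filename, 16)
    assert filenum * rows_per_file + offset // binary_size == row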
|
|
@@ -365,10 +318,33 @@ class Table(object): |
|
|
|
f = self.file_open(subdir, fname) |
|
|
|
|
|
|
|
# Write the data |
|
|
|
f.append_pack_iter(count, self.packer.pack, dataiter) |
|
|
|
f.append_rocket_iter(self.rocket, count, dataiter) |
|
|
|
remaining -= count |
|
|
|
self.nrows += count |
|
|
|
|
|
|
|
def get_string(self, start, stop): |
|
|
|
"""Extract data corresponding to Python range [n:m], |
|
|
|
and returns a formatted string""" |
|
|
|
if (start is None or |
|
|
|
stop is None or |
|
|
|
start >= stop or |
|
|
|
start < 0 or |
|
|
|
stop > self.nrows): |
|
|
|
raise IndexError("Index out of range") |
|
|
|
|
|
|
|
ret_chunks = [] |
|
|
|
row = start |
|
|
|
remaining = stop - start |
|
|
|
while remaining: |
|
|
|
(subdir, filename, offset, count) = self._offset_from_row(row) |
|
|
|
if count > remaining: |
|
|
|
count = remaining |
|
|
|
f = self.file_open(subdir, filename) |
|
|
|
ret.append(f.extract_rocket_string(self.rocket, offset, count)) |
|
|
|
remaining -= count |
|
|
|
row += count |
|
|
|
return "".join(ret) |
|
|
|
|
|
|
|
def __getitem__(self, key): |
|
|
|
"""Extract data and return it. Supports simple indexing |
|
|
|
(table[n]) and range slices (table[n:m]). Returns a nested |
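
get_string exists so the server can hand formatted text straight to a
client without first building Python lists row by row; the loop walks file
by file because a [start, stop) range can span several data files, clamping
count to whatever remains in the current one. A hypothetical use, assuming
a table already on disk at the given path:

    table = Table("/db/data/newton/raw")   # hypothetical OS path
    text = table.get_string(0, 10)         # rows [0:10) as one string
    first = table[0]                       # same data, one row as a list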
|
|
@@ -392,10 +368,8 @@ class Table(object):
             (subdir, filename, offset, count) = self._offset_from_row(row)
             if count > remaining:
                 count = remaining
-            mm = self.file_open(subdir, filename).mmap
-            for i in xrange(count):
-                ret.append(list(self.packer.unpack_from(mm, offset)))
-                offset += self.packer.size
+            f = self.file_open(subdir, filename)
+            ret.extend(f.extract_rocket_list(self.rocket, offset, count))
             remaining -= count
             row += count
         return ret
|
|
@@ -404,9 +378,8 @@ class Table(object):
         if key < 0 or key >= self.nrows:
             raise IndexError("Index out of range")
         (subdir, filename, offset, count) = self._offset_from_row(key)
-        mm = self.file_open(subdir, filename).mmap
-        # unpack_from ignores the mmap object's current seek position
-        return list(self.packer.unpack_from(mm, offset))
+        f = self.file_open(subdir, filename)
+        return f.extract_rocket_list(self.rocket, offset, 1)[0]
 
     def _remove_rows(self, subdir, filename, start, stop):
         """Helper to mark specific rows as being removed from a
|
|
@@ -485,7 +458,7 @@ class Table(object):
             (subdir, filename, offset, count) = self._offset_from_row(row)
             if count > remaining:
                 count = remaining
-            row_offset = offset // self.packer.size
+            row_offset = offset // self.rocket.binary_size
             # Mark the rows as being removed
             self._remove_rows(subdir, filename, row_offset, row_offset + count)
             remaining -= count
|
|
|