# Fixed record size bulk data storage

# Need absolute_import so that "import nilmdb" won't pull in
# nilmdb.py, but will pull the parent nilmdb module instead.
from __future__ import absolute_import
from __future__ import division
import nilmdb
from nilmdb.utils.printf import *

import os
import cPickle as pickle
import struct
import mmap
import re

# If we have the faulthandler module, use it.  All of the mmap stuff
# might trigger a SIGSEGV or SIGBUS if we're not careful, and
# faulthandler will give a traceback in that case.  (The Python
# interpreter will still die either way.)
try: # pragma: no cover
    import faulthandler
    faulthandler.enable()
except: # pragma: no cover
    pass

# Up to 256 open file descriptors at any given time.
# These variables are global so they can be used in the decorator arguments.
table_cache_size = 16
fd_cache_size = 16

@nilmdb.utils.must_close(wrap_verify = True)
class BulkData(object):
    def __init__(self, basepath, **kwargs):
        self.basepath = basepath
        self.root = os.path.join(self.basepath, "data")

        # Tuneables
        if "file_size" in kwargs:
            self.file_size = kwargs["file_size"]
        else:
            # Default to approximately 128 MiB per file
            self.file_size = 128 * 1024 * 1024

        if "files_per_dir" in kwargs:
            self.files_per_dir = kwargs["files_per_dir"]
        else:
            # 32768 files per dir should work even on FAT32
            self.files_per_dir = 32768

        # Make root path
        if not os.path.isdir(self.root):
            os.mkdir(self.root)

    def close(self):
        self.getnode.cache_remove_all()

    def _encode_filename(self, path):
        # Encode all paths to UTF-8, regardless of sys.getfilesystemencoding(),
        # because we want to be able to represent all code points and the user
        # will never be directly exposed to filenames.  We can then do path
        # manipulations on the UTF-8 directly.
        if isinstance(path, unicode):
            return path.encode('utf-8')
        return path

    def create(self, unicodepath, layout_name):
        """
        unicodepath: path to the data (e.g. u'/newton/prep').
        Paths must contain at least two elements, e.g.:
            /newton/prep
            /newton/raw
            /newton/upstairs/prep
            /newton/upstairs/raw

        layout_name: string for nilmdb.layout.get_named(), e.g. 'float32_8'
        """
        path = self._encode_filename(unicodepath)

        if path[0] != '/':
            raise ValueError("paths must start with /")
        [ group, node ] = path.rsplit("/", 1)
        if group == '':
            raise ValueError("invalid path; path must contain at least one "
                             "folder")

        # Get layout, and build format string for the struct module
        try:
            layout = nilmdb.server.layout.get_named(layout_name)
            struct_fmt = '<d'  # Little-endian, double timestamp
            struct_mapping = {
                "int8":    'b',
                "uint8":   'B',
                "int16":   'h',
                "uint16":  'H',
                "int32":   'i',
                "uint32":  'I',
                "int64":   'q',
                "uint64":  'Q',
                "float32": 'f',
                "float64": 'd',
                }
            struct_fmt += struct_mapping[layout.datatype] * layout.count
        except KeyError:
            raise ValueError("no such layout, or bad data types")
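
        # As a sketch of what the mapping above produces: the layout
        # name 'float32_8' (eight float32 columns) would yield
        # struct_fmt = '<dffffffff', i.e. a little-endian double
        # timestamp followed by eight little-endian floats.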

        # Create the table.  Note that we make a distinction here
        # between NilmDB paths (always Unix style, split apart
        # manually) and OS paths (built up with os.path.join).

        # Make directories leading up to this one
        elements = path.lstrip('/').split('/')
        for i in range(len(elements)):
            ospath = os.path.join(self.root, *elements[0:i])
            if Table.exists(ospath):
                raise ValueError("path is subdir of existing node")
            if not os.path.isdir(ospath):
                os.mkdir(ospath)

        # Make the final dir
        ospath = os.path.join(self.root, *elements)
        if os.path.isdir(ospath):
            raise ValueError("subdirs of this path already exist")
        os.mkdir(ospath)

        # Write format string to file
        Table.create(ospath, struct_fmt, self.file_size, self.files_per_dir)

        # Open and cache it
        self.getnode(unicodepath)

        # Success
        return
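
    # A minimal usage sketch (the database path and stream path here
    # are illustrative, not part of this module):
    #   data = BulkData("/path/to/db")
    #   data.create(u"/newton/prep", "float32_8")
    #   table = data.getnode(u"/newton/prep")
    #   ...
    #   data.close()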

    def destroy(self, unicodepath):
        """Fully remove all data at a particular path.  No way to undo
        it!  The group/path structure is removed, too."""
        path = self._encode_filename(unicodepath)

        # Get OS path
        elements = path.lstrip('/').split('/')
        ospath = os.path.join(self.root, *elements)

        # Remove Table object from cache
        self.getnode.cache_remove(self, unicodepath)

        # Remove the contents of the target directory
        if not Table.exists(ospath):
            raise ValueError("nothing at that path")
        for (root, dirs, files) in os.walk(ospath, topdown = False):
            for name in files:
                os.remove(os.path.join(root, name))
            for name in dirs:
                os.rmdir(os.path.join(root, name))

        # Remove empty parent directories
        for i in reversed(range(len(elements))):
            ospath = os.path.join(self.root, *elements[0:i+1])
            try:
                os.rmdir(ospath)
            except OSError:
                break

    # Cache open tables
    @nilmdb.utils.lru_cache(size = table_cache_size,
                            onremove = lambda x: x.close())
    def getnode(self, unicodepath):
        """Return a Table object corresponding to the given database
        path, which must exist."""
        path = self._encode_filename(unicodepath)
        elements = path.lstrip('/').split('/')
        ospath = os.path.join(self.root, *elements)
        return Table(ospath)

@nilmdb.utils.must_close(wrap_verify = True)
class File(object):
    """Object representing a single file on disk.  Data can be appended,
    or the self.mmap handle can be used for random reads."""

    def __init__(self, root, subdir, filename):
        # Create path if it doesn't exist
        try:
            os.mkdir(os.path.join(root, subdir))
        except OSError:
            pass

        # Open/create file
        self._f = open(os.path.join(root, subdir, filename), "a+b", 0)

        # Seek to end, and get size
        self._f.seek(0, 2)
        self.size = self._f.tell()

        # Open mmap object
        self.mmap = None
        self._mmap_reopen()

    def _mmap_reopen(self):
        if self.size == 0:
            # Don't mmap if the file is empty; it would fail
            pass
        elif self.mmap is None:
            # Not opened yet, so open it
            self.mmap = mmap.mmap(self._f.fileno(), 0)
        else:
            # Already opened, so just resize it
            self.mmap.resize(self.size)

    def close(self):
        if self.mmap is not None:
            self.mmap.close()
        self._f.close()

    def append(self, data):
        # Write data, flush it, and resize our mmap accordingly
        self._f.write(data)
        self._f.flush()
        self.size += len(data)
        self._mmap_reopen()

@nilmdb.utils.must_close(wrap_verify = True)
class Table(object):
    """Tools to help access a single table (data at a specific OS path)."""
    # See design.md for design details

    # Class methods, to help keep format details in this class.
    @classmethod
    def exists(cls, root):
        """Return True if a table appears to exist at this OS path"""
        return os.path.isfile(os.path.join(root, "_format"))

    @classmethod
    def create(cls, root, struct_fmt, file_size, files_per_dir):
        """Initialize a table at the given OS path.
        'struct_fmt' is a struct module format description"""

        # Calculate rows per file so that each file is approximately
        # file_size bytes.
        packer = struct.Struct(struct_fmt)
        rows_per_file = max(file_size // packer.size, 1)

        fmt = { "rows_per_file": rows_per_file,
                "files_per_dir": files_per_dir,
                "struct_fmt": struct_fmt,
                "version": 1 }
        with open(os.path.join(root, "_format"), "wb") as f:
            pickle.dump(fmt, f, 2)
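
        # To make the arithmetic concrete: 'float32_8' packs to
        # 8 + 4*8 = 40 bytes per row, so with the default 128 MiB
        # file_size, rows_per_file = 134217728 // 40 = 3355443 and
        # each full data file is just under 128 MiB.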

    # Normal methods
    def __init__(self, root):
        """'root' is the full OS path to the directory of this table"""
        self.root = root

        # Load the format and build packer
        with open(os.path.join(self.root, "_format"), "rb") as f:
            fmt = pickle.load(f)
        if fmt["version"] != 1: # pragma: no cover (just future proofing)
            raise NotImplementedError("version " + str(fmt["version"]) +
                                      " bulk data store not supported")
        self.rows_per_file = fmt["rows_per_file"]
        self.files_per_dir = fmt["files_per_dir"]
        self.packer = struct.Struct(fmt["struct_fmt"])
        self.file_size = self.packer.size * self.rows_per_file

        # Find nrows
        self.nrows = self._get_nrows()

    def close(self):
        self.file_open.cache_remove_all()

    # Internal helpers
    def _get_nrows(self):
        """Find nrows by locating the numerically last filename
        and using its size"""
        # Note that this just finds a 'nrows' that is guaranteed to be
        # greater than the row number of any piece of data that
        # currently exists, not necessarily all data that _ever_
        # existed.
        regex = re.compile("^[0-9a-f]{4,}$")

        # Find the last directory.  We sort and loop through all of them,
        # starting with the numerically greatest, because the dirs could be
        # empty if something was deleted.
        subdirs = sorted(filter(regex.search, os.listdir(self.root)),
                         key = lambda x: int(x, 16), reverse = True)

        for subdir in subdirs:
            # Now find the last file in that dir
            path = os.path.join(self.root, subdir)
            files = filter(regex.search, os.listdir(path))
            if not files: # pragma: no cover (shouldn't occur)
                # Empty dir: try the next one
                continue

            # Find the numerical max
            filename = max(files, key = lambda x: int(x, 16))
            offset = os.path.getsize(os.path.join(self.root, subdir, filename))

            # Convert to row number
            return self._row_from_offset(subdir, filename, offset)

        # No files, so no data
        return 0

    def _offset_from_row(self, row):
        """Return a (subdir, filename, offset, count) tuple:

          subdir: subdirectory for the file
          filename: the filename that contains the specified row
          offset: byte offset of the specified row within the file
          count: number of rows (starting at offset) that fit in the file
        """
        filenum = row // self.rows_per_file
        # It's OK if these format specifiers are too short; the filenames
        # will just get longer but will still sort correctly.
        dirname = sprintf("%04x", filenum // self.files_per_dir)
        filename = sprintf("%04x", filenum % self.files_per_dir)
        offset = (row % self.rows_per_file) * self.packer.size
        count = self.rows_per_file - (row % self.rows_per_file)
        return (dirname, filename, offset, count)

    def _row_from_offset(self, subdir, filename, offset):
        """Return the row number that corresponds to the given
        'subdir/filename' and byte-offset within that file."""
        if (offset % self.packer.size) != 0: # pragma: no cover; shouldn't occur
            raise ValueError("file offset is not a multiple of data size")
        filenum = int(subdir, 16) * self.files_per_dir + int(filename, 16)
        row = (filenum * self.rows_per_file) + (offset // self.packer.size)
        return row
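
    # A worked example of the two helpers above, using hypothetical
    # small tuneables for readability: with rows_per_file = 100 and
    # files_per_dir = 16, row 3456 lands in filenum 34, so subdir is
    # "0002" (34 // 16), filename is "0002" (34 % 16), and the byte
    # offset is (3456 % 100) * packer.size.  _row_from_offset()
    # inverts this: (2 * 16 + 2) * 100 + 56 == 3456.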

    # Cache open files
    @nilmdb.utils.lru_cache(size = fd_cache_size,
                            onremove = lambda f: f.close())
    def file_open(self, subdir, filename):
        """Open and map a given 'subdir/filename' (relative to self.root).
        Will be automatically closed when evicted from the cache."""
        return File(self.root, subdir, filename)

    def append(self, data):
        """Append the data and flush it to disk.
        data is a nested Python list [[row],[row],[...]]"""
        remaining = len(data)
        dataiter = iter(data)
        while remaining:
            # See how many rows we can fit into the current file, and open it
            (subdir, fname, offset, count) = self._offset_from_row(self.nrows)
            if count > remaining:
                count = remaining

            f = self.file_open(subdir, fname)

            # Write the data
            for i in xrange(count):
                row = dataiter.next()
                f.append(self.packer.pack(*row))

            remaining -= count
            self.nrows += count
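
    # For a 'float32_8' table, each appended row is a timestamp plus
    # eight values, e.g. (the numbers here are made up):
    #   table.append([[1234567890.0, 1, 2, 3, 4, 5, 6, 7, 8],
    #                 [1234567890.1, 1, 2, 3, 4, 5, 6, 7, 8]])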

    def __getitem__(self, key):
        """Extract data and return it.  Supports simple indexing
        (table[n]) and range slices (table[n:m]).  Returns a nested
        Python list [[row],[row],[...]]"""

        # Handle simple slices
        if isinstance(key, slice):
            # Fall back to brute force if the slice isn't simple
            if ((key.step is not None and key.step != 1) or
                key.start is None or
                key.stop is None or
                key.start >= key.stop or
                key.start < 0 or
                key.stop > self.nrows):
                return [ self[x] for x in xrange(*key.indices(self.nrows)) ]

            ret = []
            row = key.start
            remaining = key.stop - key.start
            while remaining:
                (subdir, filename, offset, count) = self._offset_from_row(row)
                if count > remaining:
                    count = remaining

                mm = self.file_open(subdir, filename).mmap
                for i in xrange(count):
                    ret.append(list(self.packer.unpack_from(mm, offset)))
                    offset += self.packer.size

                remaining -= count
                row += count
            return ret

        # Handle single points
        if key < 0 or key >= self.nrows:
            raise IndexError("Index out of range")
        (subdir, filename, offset, count) = self._offset_from_row(key)
        mm = self.file_open(subdir, filename).mmap
        # unpack_from ignores the mmap object's current seek position
        return list(self.packer.unpack_from(mm, offset))
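
    # For example:
    #   table[5]      # single row: [timestamp, col1, ..., colN]
    #   table[5:8]    # three rows, crossing file boundaries as needed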

    def _remove_rows(self, subdir, filename, start, stop):
        """Helper to mark specific rows as being removed from a file,
        and potentially to remove or truncate the file itself."""

        # Import an existing list of deleted rows for this file
        datafile = os.path.join(self.root, subdir, filename)
        cachefile = datafile + ".removed"
        try:
            with open(cachefile, "rb") as f:
                ranges = pickle.load(f)
            cachefile_present = True
        except:
            ranges = []
            cachefile_present = False

        # Append our new range and sort
        ranges.append((start, stop))
        ranges.sort()

        # Merge adjacent ranges into "merged"
        merged = []
        prev = None
        for new in ranges:
            if prev is None:
                # No previous range, so remember this one
                prev = new
            elif prev[1] == new[0]:
                # Previous range connected to this new one; extend prev
                prev = (prev[0], new[1])
            else:
                # Not connected; append previous and start again
                merged.append(prev)
                prev = new
        if prev is not None:
            merged.append(prev)

        # If the range covered the whole file, we can delete it now.
        # Note that the last file in a table may be only partially
        # full (smaller than self.rows_per_file).  We purposely leave
        # those files around rather than deleting them, because the
        # remainder will be filled on a subsequent append(), and things
        # are generally easier if we don't have to special-case that.
        if (len(merged) == 1 and
            merged[0][0] == 0 and merged[0][1] == self.rows_per_file):
            # Close potentially open file in file_open LRU cache
            self.file_open.cache_remove(self, subdir, filename)

            # Delete files
            os.remove(datafile)
            if cachefile_present:
                os.remove(cachefile)

            # Try deleting the subdir, too
            try:
                os.rmdir(os.path.join(self.root, subdir))
            except:
                pass
        else:
            # Update cache.  Try to do it atomically.
            nilmdb.utils.atomic.replace_file(cachefile,
                                             pickle.dumps(merged, 2))
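
    # To illustrate the merge: if the ".removed" file already holds
    # [(0, 10), (20, 30)] and rows [10, 20) are then removed, the
    # sorted list collapses to [(0, 30)].  Once a single range covers
    # [0, rows_per_file), the data file itself is deleted.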

    def remove(self, start, stop):
        """Remove specified rows [start, stop) from this table.
        If a file is left empty, it is fully removed.  Otherwise, a
        parallel data file is used to remember which rows have been
        removed, and the file is otherwise untouched."""
        if start < 0 or start > stop or stop > self.nrows:
            raise IndexError("Index out of range")

        row = start
        remaining = stop - start
        while remaining:
            # Loop through each file that we need to touch
            (subdir, filename, offset, count) = self._offset_from_row(row)
            if count > remaining:
                count = remaining
            row_offset = offset // self.packer.size

            # Mark the rows as being removed
            self._remove_rows(subdir, filename, row_offset, row_offset + count)

            remaining -= count
            row += count

class TimestampOnlyTable(object):
    """Helper that lets us pass a Table object into bisect, by
    returning only the timestamp when a particular row is requested."""
    def __init__(self, table):
        self.table = table
    def __getitem__(self, index):
        return self.table[index][0]
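
# For example, to find where a timestamp would fall in a table using
# the standard bisect module (variable names are illustrative):
#   import bisect
#   pos = bisect.bisect_left(TimestampOnlyTable(table), timestamp,
#                            0, table.nrows)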