From e5d3deb6fe1b4ca5b61b1b018d0b786bd5cfb28b Mon Sep 17 00:00:00 2001 From: Jim Paris Date: Wed, 9 Jan 2013 23:26:59 -0500 Subject: [PATCH] Removal support is complete. `nrows` may change if you restart the server; documented why this is the case in the design.md file. It's not a problem. --- .gitignore | 1 + Makefile | 3 + design.md | 103 +++++++++++++-------- nilmdb/bulkdata.py | 13 +-- tests/data/prep-20120323T1002-first19lines | 19 ++++ tests/test.order | 5 - tests/test_cmdline.py | 17 ++-- 7 files changed, 105 insertions(+), 56 deletions(-) create mode 100644 tests/data/prep-20120323T1002-first19lines diff --git a/.gitignore b/.gitignore index e4b9a4a..62445e2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ db/ tests/*testdb/ .coverage *.pyc +design.html diff --git a/Makefile b/Makefile index a8862df..933fe1a 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,9 @@ tool: lint: pylint -f parseable nilmdb +%.html: %.md + pandoc -s $< > $@ + test: python runtests.py diff --git a/design.md b/design.md index 0f0cb88..e461250 100644 --- a/design.md +++ b/design.md @@ -104,21 +104,21 @@ Speed - First approach was quadratic. Adding four hours of data: - $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-110000 /bpnilm/1/raw - real 24m31.093s - $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-120001 /bpnilm/1/raw - real 43m44.528s - $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-130002 /bpnilm/1/raw - real 93m29.713s - $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-140003 /bpnilm/1/raw - real 166m53.007s + $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-110000 /bpnilm/1/raw + real 24m31.093s + $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-120001 /bpnilm/1/raw + real 43m44.528s + $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-130002 /bpnilm/1/raw + real 93m29.713s + $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-140003 /bpnilm/1/raw + real 166m53.007s - Disabling pytables indexing didn't help: - real 31m21.492s - real 52m51.963s - real 102m8.151s - real 176m12.469s + real 31m21.492s + real 52m51.963s + real 102m8.151s + real 176m12.469s - Server RAM usage is constant. @@ -139,10 +139,12 @@ Speed - Next slowdown target is nilmdb.layout.Parser.parse(). - Rewrote parsers using cython and sscanf - Stats (rev 10831), with _add_interval disabled - layout.pyx.Parser.parse:128 6303 sec, 262k calls - layout.pyx.parse:63 13913 sec, 5.1g calls - numpy:records.py.fromrecords:569 7410 sec, 262k calls - - Probably OK for now. + + layout.pyx.Parser.parse:128 6303 sec, 262k calls + layout.pyx.parse:63 13913 sec, 5.1g calls + numpy:records.py.fromrecords:569 7410 sec, 262k calls + +- Probably OK for now. - After all updates, now takes about 8.5 minutes to insert an hour of data, constant after adding 171 hours (4.9 billion data points) @@ -157,12 +159,12 @@ IntervalSet speed sorted list - Replaced with bxInterval; now takes about log n time for an insertion - - TestIntervalSpeed with range(17,18) and profiling - - 85 μs each - - 131072 calls to `__iadd__` - - 131072 to bx.insert_interval - - 131072 to bx.insert:395 - - 2355835 to bx.insert:106 (18x as many?) + - TestIntervalSpeed with range(17,18) and profiling + - 85 μs each + - 131072 calls to `__iadd__` + - 131072 to bx.insert_interval + - 131072 to bx.insert:395 + - 2355835 to bx.insert:106 (18x as many?) - Tried blist too, worse than bxinterval. @@ -173,14 +175,14 @@ IntervalSet speed insert for 2**17 insertions, followed by total wall time and RAM usage for running "make test" with `test_rbtree` and `test_interval` with range(5,20): - - old values with bxinterval: - 20.2 μS, total 20 s, 177 MB RAM - - rbtree, plain python: - 97 μS, total 105 s, 846 MB RAM - - rbtree converted to cython: - 26 μS, total 29 s, 320 MB RAM - - rbtree and interval converted to cython: - 8.4 μS, total 12 s, 134 MB RAM + - old values with bxinterval: + 20.2 μS, total 20 s, 177 MB RAM + - rbtree, plain python: + 97 μS, total 105 s, 846 MB RAM + - rbtree converted to cython: + 26 μS, total 29 s, 320 MB RAM + - rbtree and interval converted to cython: + 8.4 μS, total 12 s, 134 MB RAM Layouts ------- @@ -220,20 +222,45 @@ Each table contains: parameters of how the data is broken up, like files per directory, rows per file, and the binary data format -- A changing `_nrows` file (Python pickle format) that contains the - number of the next row that will be inserted into the database. - This number only increases, even if rows are deleted, and is - overwritten atomically. (Note that it may not really be atomic on - all OSes, and it may not be fully durable on power loss or other - failures.) - - Hex named subdirectories `("%04x", although more than 65536 can exist)` - Hex named files within those subdirectories, like: /nilmdb/data/newton/raw/000b/010a + The data format of these files is raw binary, interpreted by the + Python `struct` module according to the format string in the + `_format` file. + - Same as above, with `.removed` suffix, is an optional file (Python pickle format) containing a list of row numbers that have been logically removed from the file. If this range covers the entire - file, the entire file can be removed. + file, the entire file will be removed. + +- Note that the `bulkdata.nrows` variable is calculated once in + `BulkData.__init__()`, and only ever incremented during use. Thus, + even if all data is removed, `nrows` can remain high. However, if + the server is restarted, the newly calculated `nrows` may be lower + than in a previous run due to deleted data. To be specific, this + sequence of events: + + - insert data + - remove all data + - insert data + + will result in having different row numbers in the database, and + differently numbered files on the filesystem, than the sequence: + + - insert data + - remove all data + - restart server + - insert data + + This is okay! Everything should remain consistent both in the + `BulkData` and `NilmDB`. Not attempting to readjust `nrows` during + deletion makes the code quite a bit simpler. + +- Similarly, data files are never truncated shorter. Removing data + from the end of the file will not shorten it; it will only be + deleted when it has been fully filled and all of the data has been + subsequently removed. diff --git a/nilmdb/bulkdata.py b/nilmdb/bulkdata.py index 3faae59..9123c85 100644 --- a/nilmdb/bulkdata.py +++ b/nilmdb/bulkdata.py @@ -217,8 +217,10 @@ class Table(object): def _get_nrows(self): """Find nrows by locating the lexicographically last filename and using its size""" - # Find nrows by locating the lexicographically last filename - # and using its size. + # Note that this just finds a 'nrows' that is guaranteed to be + # greater than the row number of any piece of data that + # currently exists, not necessarily all data that _ever_ + # existed. regex = re.compile("^[0-9a-f]{4,}$") # Find the last directory. We sort and loop through all of them, @@ -226,14 +228,12 @@ class Table(object): # empty if something was deleted. subdirs = sorted(filter(regex.search, os.listdir(self.root)), key = lambda x: int(x, 16), reverse = True) - if not subdirs: - return 0 for subdir in subdirs: # Now find the last file in that dir path = os.path.join(self.root, subdir) files = filter(regex.search, os.listdir(path)) - if not files: + if not files: # pragma: no cover (shouldn't occur) # Empty dir: try the next one continue @@ -243,7 +243,8 @@ class Table(object): # Convert to row number return self._row_from_offset(subdir, filename, offset) - # No files in any of the subdirs, so no data + + # No files, so no data return 0 def _offset_from_row(self, row): diff --git a/tests/data/prep-20120323T1002-first19lines b/tests/data/prep-20120323T1002-first19lines new file mode 100644 index 0000000..f61c08f --- /dev/null +++ b/tests/data/prep-20120323T1002-first19lines @@ -0,0 +1,19 @@ +2.56437e+05 2.24430e+05 4.01161e+03 3.47534e+03 7.49589e+03 3.38894e+03 2.61397e+02 3.73126e+03 +2.53963e+05 2.24167e+05 5.62107e+03 1.54801e+03 9.16517e+03 3.52293e+03 1.05893e+03 2.99696e+03 +2.58508e+05 2.24930e+05 6.01140e+03 8.18866e+02 9.03995e+03 4.48244e+03 2.49039e+03 2.67934e+03 +2.59627e+05 2.26022e+05 4.47450e+03 2.42302e+03 7.41419e+03 5.07197e+03 2.43938e+03 2.96296e+03 +2.55187e+05 2.24632e+05 4.73857e+03 3.39804e+03 7.39512e+03 4.72645e+03 1.83903e+03 3.39353e+03 +2.57102e+05 2.21623e+05 6.14413e+03 1.44109e+03 8.75648e+03 3.49532e+03 1.86994e+03 3.75253e+03 +2.63653e+05 2.21770e+05 6.22177e+03 7.38962e+02 9.54760e+03 2.66682e+03 1.46266e+03 3.33257e+03 +2.63613e+05 2.25256e+05 4.47712e+03 2.43745e+03 8.51021e+03 3.85563e+03 9.59442e+02 2.38718e+03 +2.55350e+05 2.26264e+05 4.28372e+03 3.92394e+03 7.91247e+03 5.46652e+03 1.28499e+03 2.09372e+03 +2.52727e+05 2.24609e+05 5.85193e+03 2.49198e+03 8.54063e+03 5.62305e+03 2.33978e+03 3.00714e+03 +2.58475e+05 2.23578e+05 5.92487e+03 1.39448e+03 8.77962e+03 4.54418e+03 2.13203e+03 3.84976e+03 +2.61563e+05 2.24609e+05 4.33614e+03 2.45575e+03 8.05538e+03 3.46911e+03 6.27873e+02 3.66420e+03 +2.56401e+05 2.24441e+05 4.18715e+03 3.45717e+03 7.90669e+03 3.53355e+03 -5.84482e+00 2.96687e+03 +2.54745e+05 2.22644e+05 6.02005e+03 1.94721e+03 9.28939e+03 3.80020e+03 1.34820e+03 2.37785e+03 +2.60723e+05 2.22660e+05 6.69719e+03 1.03048e+03 9.26124e+03 4.34917e+03 2.84530e+03 2.73619e+03 +2.63089e+05 2.25711e+05 4.77887e+03 2.60417e+03 7.39660e+03 4.59811e+03 2.17472e+03 3.40729e+03 +2.55843e+05 2.27128e+05 4.02413e+03 4.39323e+03 6.79336e+03 4.62535e+03 7.52009e+02 3.44647e+03 +2.51904e+05 2.24868e+05 5.82289e+03 3.02127e+03 8.46160e+03 3.80298e+03 8.07212e+02 3.53468e+03 +2.57670e+05 2.22974e+05 6.73436e+03 1.60956e+03 9.92960e+03 2.98028e+03 1.44168e+03 3.05351e+03 diff --git a/tests/test.order b/tests/test.order index d727b6e..7c646a4 100644 --- a/tests/test.order +++ b/tests/test.order @@ -1,8 +1,3 @@ -test_cmdline.py -########## - -test_client.py - test_printf.py test_lrucache.py test_mustclose.py diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py index c35abb6..a9ef494 100644 --- a/tests/test_cmdline.py +++ b/tests/test_cmdline.py @@ -786,13 +786,18 @@ class TestCmdline(object): self.ok("remove /newton/prep " + "--start '23 Mar 2012 10:02:00' --end '2020-01-01'") - # Shut down and restart server, to force nrows to get refreshed -# global test_server, test_db -# raise Exception() -# print test_db.data.getnode("/newton/prep") + # Re-insert 19 lines from that file, then remove them again. + # With the specific file_size above, this will cause the last + # file in the bulk data storage to be exactly file_size large, + # so removing the data should also remove that last file. + self.ok("insert --rate 120 /newton/prep " + + "tests/data/prep-20120323T1002-first19lines") + self.ok("remove /newton/prep " + + "--start '23 Mar 2012 10:02:00' --end '2020-01-01'") + + # Shut down and restart server, to force nrows to get refreshed. server_stop() server_start() -# print test_db.data.getnode("/newton/prep") # Re-add the full 10:02 data file. This tests adding new data once # we removed data near the end. @@ -801,5 +806,3 @@ class TestCmdline(object): # See if we can extract it all self.ok("extract /newton/prep --start 2000-01-01 --end 2020-01-01") lines_(self.captured, 15600) - -# raise Exception()