From e5d3deb6fe1b4ca5b61b1b018d0b786bd5cfb28b Mon Sep 17 00:00:00 2001
From: Jim Paris <jim@jtan.com>
Date: Wed, 9 Jan 2013 23:26:59 -0500
Subject: [PATCH] Removal support is complete.

`nrows` may change if you restart the server; documented why this is
the case in the design.md file.  It's not a problem.
---
 .gitignore                                 |   1 +
 Makefile                                   |   3 +
 design.md                                  | 103 +++++++++++++--------
 nilmdb/bulkdata.py                         |  13 +--
 tests/data/prep-20120323T1002-first19lines |  19 ++++
 tests/test.order                           |   5 -
 tests/test_cmdline.py                      |  17 ++--
 7 files changed, 105 insertions(+), 56 deletions(-)
 create mode 100644 tests/data/prep-20120323T1002-first19lines

diff --git a/.gitignore b/.gitignore
index e4b9a4a..62445e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ db/
 tests/*testdb/
 .coverage
 *.pyc
+design.html
diff --git a/Makefile b/Makefile
index a8862df..933fe1a 100644
--- a/Makefile
+++ b/Makefile
@@ -8,6 +8,9 @@ tool:
 lint:
 	pylint -f parseable nilmdb
 
+%.html: %.md
+	pandoc -s $< > $@
+
 test:
 	python runtests.py
 
diff --git a/design.md b/design.md
index 0f0cb88..e461250 100644
--- a/design.md
+++ b/design.md
@@ -104,21 +104,21 @@ Speed
 
 - First approach was quadratic.  Adding four hours of data:
 
-    $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-110000 /bpnilm/1/raw
-	real    24m31.093s
-	$ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-120001 /bpnilm/1/raw
-	real    43m44.528s
-	$ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-130002 /bpnilm/1/raw
-	real    93m29.713s
-	$ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-140003 /bpnilm/1/raw
-	real    166m53.007s
+        $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-110000 /bpnilm/1/raw
+        real    24m31.093s
+        $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-120001 /bpnilm/1/raw
+        real    43m44.528s
+        $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-130002 /bpnilm/1/raw
+        real    93m29.713s
+        $ time zcat /home/jim/bpnilm-data/snapshot-1-20110513-110002.raw.gz | ./nilmtool.py insert -s 20110513-140003 /bpnilm/1/raw
+        real    166m53.007s
 
 - Disabling pytables indexing didn't help:
 
-    real    31m21.492s
-	real    52m51.963s
-	real    102m8.151s
-	real    176m12.469s
+        real    31m21.492s
+        real    52m51.963s
+        real    102m8.151s
+        real    176m12.469s
 
 - Server RAM usage is constant.
 
@@ -139,10 +139,12 @@ Speed
 - Next slowdown target is nilmdb.layout.Parser.parse().
   - Rewrote parsers using cython and sscanf
   - Stats (rev 10831), with _add_interval disabled
-     layout.pyx.Parser.parse:128        6303 sec, 262k calls
-	 layout.pyx.parse:63               13913 sec, 5.1g calls
-	 numpy:records.py.fromrecords:569   7410 sec, 262k calls
-  - Probably OK for now.
+
+        layout.pyx.Parser.parse:128        6303 sec, 262k calls
+        layout.pyx.parse:63               13913 sec, 5.1g calls
+        numpy:records.py.fromrecords:569   7410 sec, 262k calls
+
+- Probably OK for now.
 
 - After all updates, now takes about 8.5 minutes to insert an hour of
   data, constant after adding 171 hours (4.9 billion data points)
@@ -157,12 +159,12 @@ IntervalSet speed
   sorted list
 
 - Replaced with bxInterval; now takes about log n time for an insertion
-  - TestIntervalSpeed with range(17,18) and profiling
-    - 85 μs each
-    - 131072 calls to `__iadd__`
-    - 131072 to bx.insert_interval
-    - 131072 to bx.insert:395
-    - 2355835 to bx.insert:106  (18x as many?)
+    - TestIntervalSpeed with range(17,18) and profiling
+        - 85 μs each
+        - 131072 calls to `__iadd__`
+        - 131072 to bx.insert_interval
+        - 131072 to bx.insert:395
+        - 2355835 to bx.insert:106  (18x as many?)
 
 - Tried blist too, worse than bxinterval.
 
@@ -173,14 +175,14 @@ IntervalSet speed
   insert for 2**17 insertions, followed by total wall time and RAM
   usage for running "make test" with `test_rbtree` and `test_interval`
   with range(5,20):
-  - old values with bxinterval:
-    20.2 μS, total 20 s, 177 MB RAM
-  - rbtree, plain python:
-    97 μS, total 105 s, 846 MB RAM
-  - rbtree converted to cython:
-    26 μS, total 29 s, 320 MB RAM
-  - rbtree and interval converted to cython:
-    8.4 μS, total 12 s, 134 MB RAM
+    - old values with bxinterval:
+      20.2 μs, total 20 s, 177 MB RAM
+    - rbtree, plain python:
+      97 μs, total 105 s, 846 MB RAM
+    - rbtree converted to cython:
+      26 μs, total 29 s, 320 MB RAM
+    - rbtree and interval converted to cython:
+      8.4 μs, total 12 s, 134 MB RAM
 
 Layouts
 -------
@@ -220,20 +222,45 @@ Each table contains:
   parameters of how the data is broken up, like files per directory,
   rows per file, and the binary data format
 
-- A changing `_nrows` file (Python pickle format) that contains the
-  number of the next row that will be inserted into the database.
-  This number only increases, even if rows are deleted, and is
-  overwritten atomically.  (Note that it may not really be atomic on
-  all OSes, and it may not be fully durable on power loss or other
-  failures.)
-
 - Hex named subdirectories `("%04x", although more than 65536 can exist)`
 
 - Hex named files within those subdirectories, like:
 
         /nilmdb/data/newton/raw/000b/010a
 
+    The data format of these files is raw binary, interpreted by the
+    Python `struct` module according to the format string in the
+    `_format` file.
+
 - Same as above, with `.removed` suffix, is an optional file (Python
   pickle format) containing a list of row numbers that have been
   logically removed from the file.  If this range covers the entire
-  file, the entire file can be removed.
+  file, the entire file will be removed.
+
+- Note that the `bulkdata.nrows` variable is calculated once in
+  `BulkData.__init__()`, and only ever incremented during use.  Thus,
+  even if all data is removed, `nrows` can remain high.  However, if
+  the server is restarted, the newly calculated `nrows` may be lower
+  than in a previous run due to deleted data.  To be specific, this
+  sequence of events:
+
+    - insert data
+    - remove all data
+    - insert data
+
+    will result in having different row numbers in the database, and
+    differently numbered files on the filesystem, than the sequence:
+
+    - insert data
+    - remove all data
+    - restart server
+    - insert data
+
+    This is okay!  Everything should remain consistent in both
+    `BulkData` and `NilmDB`.  Not attempting to readjust `nrows` during
+    deletion makes the code quite a bit simpler.
+
+- Similarly, data files are never truncated.  Removing data from the
+  end of a file will not shorten it; the file will only be deleted
+  once it has been completely filled and all of its data has
+  subsequently been removed.
diff --git a/nilmdb/bulkdata.py b/nilmdb/bulkdata.py
index 3faae59..9123c85 100644
--- a/nilmdb/bulkdata.py
+++ b/nilmdb/bulkdata.py
@@ -217,8 +217,10 @@ class Table(object):
     def _get_nrows(self):
         """Find nrows by locating the lexicographically last filename
         and using its size"""
-        # Find nrows by locating the lexicographically last filename
-        # and using its size.
+        # Note that this just finds a 'nrows' that is guaranteed to be
+        # greater than the row number of any piece of data that
+        # currently exists, not necessarily all data that _ever_
+        # existed.
         regex = re.compile("^[0-9a-f]{4,}$")
 
         # Find the last directory.  We sort and loop through all of them,
@@ -226,14 +228,12 @@ class Table(object):
         # empty if something was deleted.
         subdirs = sorted(filter(regex.search, os.listdir(self.root)),
                          key = lambda x: int(x, 16), reverse = True)
-        if not subdirs:
-            return 0
 
         for subdir in subdirs:
             # Now find the last file in that dir
             path = os.path.join(self.root, subdir)
             files = filter(regex.search, os.listdir(path))
-            if not files:
+            if not files: # pragma: no cover (shouldn't occur)
                 # Empty dir: try the next one
                 continue
 
@@ -243,7 +243,8 @@ class Table(object):
 
             # Convert to row number
             return self._row_from_offset(subdir, filename, offset)
-        # No files in any of the subdirs, so no data
+
+        # No files, so no data
         return 0
 
     def _offset_from_row(self, row):
diff --git a/tests/data/prep-20120323T1002-first19lines b/tests/data/prep-20120323T1002-first19lines
new file mode 100644
index 0000000..f61c08f
--- /dev/null
+++ b/tests/data/prep-20120323T1002-first19lines
@@ -0,0 +1,19 @@
+2.56437e+05  2.24430e+05  4.01161e+03  3.47534e+03  7.49589e+03  3.38894e+03  2.61397e+02  3.73126e+03  
+2.53963e+05  2.24167e+05  5.62107e+03  1.54801e+03  9.16517e+03  3.52293e+03  1.05893e+03  2.99696e+03  
+2.58508e+05  2.24930e+05  6.01140e+03  8.18866e+02  9.03995e+03  4.48244e+03  2.49039e+03  2.67934e+03  
+2.59627e+05  2.26022e+05  4.47450e+03  2.42302e+03  7.41419e+03  5.07197e+03  2.43938e+03  2.96296e+03  
+2.55187e+05  2.24632e+05  4.73857e+03  3.39804e+03  7.39512e+03  4.72645e+03  1.83903e+03  3.39353e+03  
+2.57102e+05  2.21623e+05  6.14413e+03  1.44109e+03  8.75648e+03  3.49532e+03  1.86994e+03  3.75253e+03  
+2.63653e+05  2.21770e+05  6.22177e+03  7.38962e+02  9.54760e+03  2.66682e+03  1.46266e+03  3.33257e+03  
+2.63613e+05  2.25256e+05  4.47712e+03  2.43745e+03  8.51021e+03  3.85563e+03  9.59442e+02  2.38718e+03  
+2.55350e+05  2.26264e+05  4.28372e+03  3.92394e+03  7.91247e+03  5.46652e+03  1.28499e+03  2.09372e+03  
+2.52727e+05  2.24609e+05  5.85193e+03  2.49198e+03  8.54063e+03  5.62305e+03  2.33978e+03  3.00714e+03  
+2.58475e+05  2.23578e+05  5.92487e+03  1.39448e+03  8.77962e+03  4.54418e+03  2.13203e+03  3.84976e+03  
+2.61563e+05  2.24609e+05  4.33614e+03  2.45575e+03  8.05538e+03  3.46911e+03  6.27873e+02  3.66420e+03  
+2.56401e+05  2.24441e+05  4.18715e+03  3.45717e+03  7.90669e+03  3.53355e+03  -5.84482e+00  2.96687e+03  
+2.54745e+05  2.22644e+05  6.02005e+03  1.94721e+03  9.28939e+03  3.80020e+03  1.34820e+03  2.37785e+03  
+2.60723e+05  2.22660e+05  6.69719e+03  1.03048e+03  9.26124e+03  4.34917e+03  2.84530e+03  2.73619e+03  
+2.63089e+05  2.25711e+05  4.77887e+03  2.60417e+03  7.39660e+03  4.59811e+03  2.17472e+03  3.40729e+03  
+2.55843e+05  2.27128e+05  4.02413e+03  4.39323e+03  6.79336e+03  4.62535e+03  7.52009e+02  3.44647e+03  
+2.51904e+05  2.24868e+05  5.82289e+03  3.02127e+03  8.46160e+03  3.80298e+03  8.07212e+02  3.53468e+03  
+2.57670e+05  2.22974e+05  6.73436e+03  1.60956e+03  9.92960e+03  2.98028e+03  1.44168e+03  3.05351e+03  
diff --git a/tests/test.order b/tests/test.order
index d727b6e..7c646a4 100644
--- a/tests/test.order
+++ b/tests/test.order
@@ -1,8 +1,3 @@
-test_cmdline.py
-##########
-
-test_client.py
-
 test_printf.py
 test_lrucache.py
 test_mustclose.py
diff --git a/tests/test_cmdline.py b/tests/test_cmdline.py
index c35abb6..a9ef494 100644
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -786,13 +786,18 @@ class TestCmdline(object):
         self.ok("remove /newton/prep " +
                 "--start '23 Mar 2012 10:02:00' --end '2020-01-01'")
 
-        # Shut down and restart server, to force nrows to get refreshed
-#        global test_server, test_db
-#        raise Exception()
-#        print test_db.data.getnode("/newton/prep")
+        # Re-insert 19 lines from that file, then remove them again.
+        # With the specific file_size above, this will cause the last
+        # file in the bulk data storage to be exactly file_size large,
+        # so removing the data should also remove that last file.
+        self.ok("insert --rate 120 /newton/prep " +
+                "tests/data/prep-20120323T1002-first19lines")
+        self.ok("remove /newton/prep " +
+                "--start '23 Mar 2012 10:02:00' --end '2020-01-01'")
+
+        # Shut down and restart server, to force nrows to get refreshed.
         server_stop()
         server_start()
-#        print test_db.data.getnode("/newton/prep")
 
         # Re-add the full 10:02 data file.  This tests adding new data once
         # we removed data near the end.
@@ -801,5 +806,3 @@ class TestCmdline(object):
         # See if we can extract it all
         self.ok("extract /newton/prep --start 2000-01-01 --end 2020-01-01")
         lines_(self.captured, 15600)
-
-#        raise Exception()