You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

cleanup.py 9.7 KiB

8 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #!/usr/bin/env python3
  2. from nilmdb.utils.printf import printf, fprintf, sprintf
  3. from nilmdb.utils.time import (timestamp_to_human,
  4. timestamp_to_seconds, seconds_to_timestamp)
  5. from nilmdb.utils.diskusage import human_size
  6. from nilmdb.utils.interval import Interval
  7. import nilmdb.client
  8. import nilmdb.client.numpyclient
  9. import nilmtools
  10. import argparse
  11. import configparser
  12. import sys
  13. import collections
  14. import fnmatch
  15. import re
  16. import os
  17. def warn(msg, *args):
  18. fprintf(sys.stderr, "warning: " + msg + "\n", *args)
  19. class TimePeriod(object):
  20. _units = {'h': ('hour', 60*60),
  21. 'd': ('day', 60*60*24),
  22. 'w': ('week', 60*60*24*7),
  23. 'm': ('month', 60*60*24*30),
  24. 'y': ('year', 60*60*24*365)}
  25. def __init__(self, val):
  26. for u in self._units:
  27. if val.endswith(u):
  28. self.unit = self._units[u][0]
  29. self.scale = self._units[u][1]
  30. self.count = float(val[:-len(u)])
  31. break
  32. else:
  33. raise ValueError("unknown units: " + val)
  34. def seconds(self):
  35. return self.count * self.scale
  36. def describe_seconds(self, seconds):
  37. count = seconds / self.scale
  38. units = self.unit if count == 1 else (self.unit + "s")
  39. if count == int(count):
  40. return sprintf("%d %s", count, units)
  41. else:
  42. return sprintf("%.2f %s", count, units)
  43. def __str__(self):
  44. return self.describe_seconds(self.seconds())
  45. class StreamCleanupConfig(object):
  46. def __init__(self, info):
  47. self.path = info[0]
  48. self.layout = info[1]
  49. if info[4] != 0 and info[5] != 0:
  50. self.rate = info[4] / timestamp_to_seconds(info[5])
  51. else:
  52. self.rate = None
  53. self.keep = None
  54. self.clean_decimated = True
  55. self.decimated_from = None
  56. self.also_clean_paths = []
  57. def main(argv=None):
  58. parser = argparse.ArgumentParser(
  59. formatter_class=argparse.RawDescriptionHelpFormatter,
  60. description="""\
  61. Clean up old data from streams using a configuration file to specify
  62. which data to remove.
  63. The format of the config file is as follows:
  64. [/stream/path]
  65. keep = 3w # keep up to 3 weeks of data
  66. rate = 8000 # optional, used for the --estimate option
  67. decimated = false # whether to delete decimated data too (default true)
  68. [*/prep]
  69. keep = 3.5m # or 2520h or 105d or 15w or 0.29y
  70. The suffix for 'keep' is 'h' for hours, 'd' for days, 'w' for weeks,
  71. 'm' for months, or 'y' for years.
  72. Streams paths may include wildcards. If a path is matched by more than
  73. one config section, data from the last config section counts.
  74. Decimated streams (paths containing '~decim-') are treated specially:
  75. - They don't match wildcards
  76. - When deleting data from a parent stream, data is also deleted
  77. from its decimated streams, unless decimated=false
  78. Rate is optional and is only used for the --estimate option.
  79. """)
  80. parser.add_argument("-v", "--version", action="version",
  81. version=nilmtools.__version__)
  82. def_url = os.environ.get("NILMDB_URL", "http://localhost/nilmdb/")
  83. parser.add_argument("-u", "--url", action="store", default=def_url,
  84. help="NilmDB server URL (default: %(default)s)")
  85. parser.add_argument("-y", "--yes", action="store_true",
  86. default=False,
  87. help="Actually remove the data (default: no)")
  88. parser.add_argument("-e", "--estimate", action="store_true",
  89. default=False,
  90. help="Estimate how much disk space will be used")
  91. parser.add_argument("configfile", type=argparse.FileType('r'),
  92. help="Configuration file")
  93. args = parser.parse_args(argv)
  94. # Parse config file
  95. config = configparser.RawConfigParser()
  96. config.readfp(args.configfile)
  97. # List all streams
  98. client = nilmdb.client.Client(args.url)
  99. streamlist = client.stream_list(extended=True)
  100. # Create config objects
  101. streams = collections.OrderedDict()
  102. for s in streamlist:
  103. streams[s[0]] = StreamCleanupConfig(s)
  104. m = re.search(r"^(.*)~decim-[0-9]+$", s[0])
  105. if m:
  106. streams[s[0]].decimated_from = m.group(1)
  107. # Build up configuration
  108. for section in config.sections():
  109. matched = False
  110. for path in streams.keys():
  111. # Decimated streams only allow exact matches
  112. if streams[path].decimated_from and path != section:
  113. continue
  114. if not fnmatch.fnmatch(path, section):
  115. continue
  116. matched = True
  117. options = config.options(section)
  118. # Keep period (days, weeks, months, years)
  119. if 'keep' in options:
  120. streams[path].keep = TimePeriod(config.get(section, 'keep'))
  121. options.remove('keep')
  122. # Rate
  123. if 'rate' in options:
  124. streams[path].rate = config.getfloat(section, 'rate')
  125. options.remove('rate')
  126. # Decimated
  127. if 'decimated' in options:
  128. val = config.getboolean(section, 'decimated')
  129. streams[path].clean_decimated = val
  130. options.remove('decimated')
  131. for leftover in options:
  132. warn("option '%s' for '%s' is unknown", leftover, section)
  133. if not matched:
  134. warn("config for '%s' did not match any existing streams", section)
  135. # List all decimated streams in the parent stream's info
  136. for path in list(streams.keys()):
  137. src = streams[path].decimated_from
  138. if src and src in streams:
  139. if streams[src].clean_decimated:
  140. streams[src].also_clean_paths.append(path)
  141. del streams[path]
  142. # Warn about streams that aren't getting cleaned up
  143. for path in list(streams.keys()):
  144. if streams[path].keep is None or streams[path].keep.seconds() < 0:
  145. warn("no config for existing stream '%s'", path)
  146. del streams[path]
  147. if args.estimate:
  148. # Estimate disk usage
  149. total = 0
  150. for path in list(streams.keys()):
  151. rate = streams[path].rate
  152. if not rate or rate < 0:
  153. warn("unable to estimate disk usage for stream '%s' because "
  154. "the data rate is unknown", path)
  155. continue
  156. printf("%s:\n", path)
  157. layout = streams[path].layout
  158. dtype = nilmdb.client.numpyclient.layout_to_dtype(layout)
  159. per_row = dtype.itemsize
  160. per_sec = per_row * rate
  161. printf("%17s: %s per row, %s rows per second\n",
  162. "base rate",
  163. human_size(per_row),
  164. round(rate, 1))
  165. printf("%17s: %s per hour, %s per day\n",
  166. "base size",
  167. human_size(per_sec * 3600),
  168. human_size(per_sec * 3600 * 24))
  169. # If we'll be cleaning up decimated data, add an
  170. # estimation for how much room decimated data takes up.
  171. if streams[path].clean_decimated:
  172. d_layout = "float32_" + str(3*(int(layout.split('_')[1])))
  173. d_dtype = nilmdb.client.numpyclient.layout_to_dtype(d_layout)
  174. # Assume the decimations will be a factor of 4
  175. # sum_{k=0..inf} (rate / (n^k)) * d_dtype.itemsize
  176. d_per_row = d_dtype.itemsize
  177. factor = 4.0
  178. d_per_sec = (d_per_row *
  179. (rate / factor) *
  180. (1 / (1 - (1/factor))))
  181. per_sec += d_per_sec
  182. printf("%17s: %s per hour, %s per day\n",
  183. "with decimation",
  184. human_size(per_sec * 3600),
  185. human_size(per_sec * 3600 * 24))
  186. keep = per_sec * streams[path].keep.seconds()
  187. printf("%17s: %s\n\n",
  188. "keep " + str(streams[path].keep), human_size(keep))
  189. total += keep
  190. printf("Total estimated disk usage for these streams:\n")
  191. printf(" %s\n", human_size(total))
  192. raise SystemExit(0)
  193. # Do the cleanup
  194. for path in streams:
  195. printf("%s: keep %s\n", path, streams[path].keep)
  196. # Figure out the earliest timestamp we should keep.
  197. intervals = [Interval(start, end) for (start, end) in
  198. reversed(list(client.stream_intervals(path)))]
  199. total = 0
  200. keep = seconds_to_timestamp(streams[path].keep.seconds())
  201. for i in intervals:
  202. total += i.end - i.start
  203. if total <= keep:
  204. continue
  205. remove_before = i.start + (total - keep)
  206. break
  207. else:
  208. printf(" nothing to do (only %s of data present)\n",
  209. streams[path].keep.describe_seconds(
  210. timestamp_to_seconds(total)))
  211. continue
  212. printf(" removing data before %s\n",
  213. timestamp_to_human(remove_before))
  214. # Clean in reverse order. Since we only use the primary stream and not
  215. # the decimated streams to figure out which data to remove, removing
  216. # the primary stream last means that we might recover more nicely if
  217. # we are interrupted and restarted.
  218. clean_paths = list(reversed(streams[path].also_clean_paths)) + [path]
  219. for p in clean_paths:
  220. printf(" removing from %s\n", p)
  221. if args.yes:
  222. client.stream_remove(p, None, remove_before)
  223. # All done
  224. if not args.yes:
  225. printf("Note: specify --yes to actually perform removals\n")
  226. return
  227. if __name__ == "__main__":
  228. main()