You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

384 lines
13 KiB

  1. """Interval, IntervalSet
  2. Represents an interval of time, and a set of such intervals.
  3. Intervals are half-open, ie. they include data points with timestamps
  4. [start, end)
  5. """
  6. # First implementation kept a sorted list of intervals and used
  7. # bisect() to optimize some operations, but this was too slow.
  8. # Second version was based on the quicksect implementation from
  9. # python-bx, modified slightly to handle floating point intervals.
  10. # This didn't support deletion.
  11. # Third version is more similar to the first version, using a rb-tree
  12. # instead of a simple sorted list to maintain O(log n) operations.
  13. # Fourth version is an optimized rb-tree that stores interval starts
  14. # and ends directly in the tree, like bxinterval did.
  15. from ..utils.time import float_time_to_string as ftts
  16. from ..utils.iterator import imerge
  17. import itertools
  18. cimport rbtree
  19. cdef extern from "stdint.h":
  20. ctypedef unsigned long long uint64_t
  21. class IntervalError(Exception):
  22. """Error due to interval overlap, etc"""
  23. pass
  24. cdef class Interval:
  25. """Represents an interval of time."""
  26. cdef public double start, end
  27. def __init__(self, double start, double end):
  28. """
  29. 'start' and 'end' are arbitrary floats that represent time
  30. """
  31. if start >= end:
  32. # Explicitly disallow zero-width intervals (since they're half-open)
  33. raise IntervalError("start %s must precede end %s" % (start, end))
  34. self.start = float(start)
  35. self.end = float(end)
  36. def __repr__(self):
  37. s = repr(self.start) + ", " + repr(self.end)
  38. return self.__class__.__name__ + "(" + s + ")"
  39. def __str__(self):
  40. return "[" + ftts(self.start) + " -> " + ftts(self.end) + ")"
  41. def __cmp__(self, Interval other):
  42. """Compare two intervals. If non-equal, order by start then end"""
  43. if not isinstance(other, Interval):
  44. raise TypeError("bad type")
  45. if self.start == other.start:
  46. if self.end < other.end:
  47. return -1
  48. if self.end > other.end:
  49. return 1
  50. return 0
  51. if self.start < other.start:
  52. return -1
  53. return 1
  54. cpdef intersects(self, Interval other):
  55. """Return True if two Interval objects intersect"""
  56. if (self.end <= other.start or self.start >= other.end):
  57. return False
  58. return True
  59. cpdef subset(self, double start, double end):
  60. """Return a new Interval that is a subset of this one"""
  61. # A subclass that tracks additional data might override this.
  62. if start < self.start or end > self.end:
  63. raise IntervalError("not a subset")
  64. return Interval(start, end)
  65. cdef class DBInterval(Interval):
  66. """
  67. Like Interval, but also tracks corresponding start/end times and
  68. positions within the database. These are not currently modified
  69. when subsets are taken, but can be used later to help zero in on
  70. database positions.
  71. The actual 'start' and 'end' will always fall within the database
  72. start and end, e.g.:
  73. db_start = 100, db_startpos = 10000
  74. start = 123
  75. end = 150
  76. db_end = 200, db_endpos = 20000
  77. """
  78. cpdef public double db_start, db_end
  79. cpdef public uint64_t db_startpos, db_endpos
  80. def __init__(self, start, end,
  81. db_start, db_end,
  82. db_startpos, db_endpos):
  83. """
  84. 'db_start' and 'db_end' are arbitrary floats that represent
  85. time. They must be a strict superset of the time interval
  86. covered by 'start' and 'end'. The 'db_startpos' and
  87. 'db_endpos' are arbitrary database position indicators that
  88. correspond to those points.
  89. """
  90. Interval.__init__(self, start, end)
  91. self.db_start = db_start
  92. self.db_end = db_end
  93. self.db_startpos = db_startpos
  94. self.db_endpos = db_endpos
  95. if db_start > start or db_end < end:
  96. raise IntervalError("database times must span the interval times")
  97. def __repr__(self):
  98. s = repr(self.start) + ", " + repr(self.end)
  99. s += ", " + repr(self.db_start) + ", " + repr(self.db_end)
  100. s += ", " + repr(self.db_startpos) + ", " + repr(self.db_endpos)
  101. return self.__class__.__name__ + "(" + s + ")"
  102. cpdef subset(self, double start, double end):
  103. """
  104. Return a new DBInterval that is a subset of this one
  105. """
  106. if start < self.start or end > self.end:
  107. raise IntervalError("not a subset")
  108. return DBInterval(start, end,
  109. self.db_start, self.db_end,
  110. self.db_startpos, self.db_endpos)
  111. cdef class IntervalSet:
  112. """
  113. A non-intersecting set of intervals.
  114. """
  115. cdef public rbtree.RBTree tree
  116. def __init__(self, source=None):
  117. """
  118. 'source' is an Interval or IntervalSet to add.
  119. """
  120. self.tree = rbtree.RBTree()
  121. if source is not None:
  122. self += source
  123. def __iter__(self):
  124. for node in self.tree:
  125. if node.obj:
  126. yield node.obj
  127. def __len__(self):
  128. return sum(1 for x in self)
  129. def __repr__(self):
  130. descs = [ repr(x) for x in self ]
  131. return self.__class__.__name__ + "([" + ", ".join(descs) + "])"
  132. def __str__(self):
  133. descs = [ str(x) for x in self ]
  134. return "[" + ", ".join(descs) + "]"
  135. def __match__(self, other):
  136. # This isn't particularly efficient, but it shouldn't get used in the
  137. # general case.
  138. """Test equality of two IntervalSets.
  139. Treats adjacent Intervals as equivalent to one long interval,
  140. so this function really tests whether the IntervalSets cover
  141. the same spans of time."""
  142. i = 0
  143. j = 0
  144. outside = True
  145. def is_adjacent(a, b):
  146. """Return True if two Intervals are adjacent (same end or start)"""
  147. if a.end == b.start or b.end == a.start:
  148. return True
  149. else:
  150. return False
  151. this = list(self)
  152. that = list(other)
  153. try:
  154. while True:
  155. if (outside):
  156. # To match, we need to be finished both sets
  157. if (i >= len(this) and j >= len(that)):
  158. return True
  159. # Or the starts need to match
  160. if (this[i].start != that[j].start):
  161. return False
  162. outside = False
  163. else:
  164. # We can move on if the two interval ends match
  165. if (this[i].end == that[j].end):
  166. i += 1
  167. j += 1
  168. outside = True
  169. else:
  170. # Whichever ends first needs to be adjacent to the next
  171. if (this[i].end < that[j].end):
  172. if (not is_adjacent(this[i],this[i+1])):
  173. return False
  174. i += 1
  175. else:
  176. if (not is_adjacent(that[j],that[j+1])):
  177. return False
  178. j += 1
  179. except IndexError:
  180. return False
  181. # Use __richcmp__ instead of __eq__, __ne__ for Cython.
  182. def __richcmp__(self, other, int op):
  183. if op == 2: # ==
  184. return self.__match__(other)
  185. elif op == 3: # !=
  186. return not self.__match__(other)
  187. return False
  188. #def __eq__(self, other):
  189. # return self.__match__(other)
  190. #
  191. #def __ne__(self, other):
  192. # return not self.__match__(other)
  193. def __iadd__(self, object other not None):
  194. """Inplace add -- modifies self
  195. This throws an exception if the regions being added intersect."""
  196. if isinstance(other, Interval):
  197. if self.intersects(other):
  198. raise IntervalError("Tried to add overlapping interval "
  199. "to this set")
  200. self.tree.insert(rbtree.RBNode(other.start, other.end, other))
  201. else:
  202. for x in other:
  203. self.__iadd__(x)
  204. return self
  205. def iadd_nocheck(self, Interval other not None):
  206. """Inplace add -- modifies self.
  207. 'Optimized' version that doesn't check for intersection and
  208. only inserts the new interval into the tree."""
  209. self.tree.insert(rbtree.RBNode(other.start, other.end, other))
  210. def __isub__(self, Interval other not None):
  211. """Inplace subtract -- modifies self
  212. Removes an interval from the set. Must exist exactly
  213. as provided -- cannot remove a subset of an existing interval."""
  214. i = self.tree.find(other.start, other.end)
  215. if i is None:
  216. raise IntervalError("interval " + str(other) + " not in tree")
  217. self.tree.delete(i)
  218. return self
  219. def __add__(self, other not None):
  220. """Add -- returns a new object"""
  221. new = IntervalSet(self)
  222. new += IntervalSet(other)
  223. return new
  224. def __and__(self, other not None):
  225. """
  226. Compute a new IntervalSet from the intersection of this
  227. IntervalSet with one other interval.
  228. Output intervals are built as subsets of the intervals in the
  229. first argument (self).
  230. """
  231. out = IntervalSet()
  232. for i in self.intersection(other):
  233. out.tree.insert(rbtree.RBNode(i.start, i.end, i))
  234. return out
  235. def intersection(self, Interval interval not None, orig = False):
  236. """
  237. Compute a sequence of intervals that correspond to the
  238. intersection between `self` and the provided interval.
  239. Returns a generator that yields each of these intervals
  240. in turn.
  241. Output intervals are built as subsets of the intervals in the
  242. first argument (self).
  243. If orig = True, also return the original interval that was
  244. (potentially) subsetted to make the one that is being
  245. returned.
  246. """
  247. if not isinstance(interval, Interval):
  248. raise TypeError("bad type")
  249. for n in self.tree.intersect(interval.start, interval.end):
  250. i = n.obj
  251. if i:
  252. if i.start >= interval.start and i.end <= interval.end:
  253. if orig:
  254. yield (i, i)
  255. else:
  256. yield i
  257. else:
  258. subset = i.subset(max(i.start, interval.start),
  259. min(i.end, interval.end))
  260. if orig:
  261. yield (subset, i)
  262. else:
  263. yield subset
  264. def set_difference(self, IntervalSet other not None,
  265. Interval bounds = None):
  266. """
  267. Compute the difference (self \\ other) between this
  268. IntervalSet and the given IntervalSet; i.e., the ranges
  269. that are present in 'self' but not 'other'.
  270. If 'bounds' is not None, results are limited to the range
  271. specified by the interval 'bounds'.
  272. Returns a generator that yields each interval in turn.
  273. Output intervals are built as subsets of the intervals in the
  274. first argument (self).
  275. """
  276. # Iterate through all starts and ends in sorted order. Add a
  277. # tag to the iterator so that we can figure out which one they
  278. # were, after sorting.
  279. def decorate(it, key_start, key_end):
  280. for i in it:
  281. yield i.start, key_start, i
  282. yield i.end, key_end, i
  283. if bounds is None:
  284. bounds = Interval(-1e12, 1e12)
  285. self_iter = decorate(self.intersection(bounds), 0, 2)
  286. other_iter = decorate(other.intersection(bounds), 1, 3)
  287. # Now iterate over the timestamps of each start and end.
  288. # At each point, evaluate which type of end it is, to determine
  289. # how to build up the output intervals.
  290. self_interval = None
  291. other_interval = None
  292. out_start = None
  293. for (ts, k, i) in imerge(self_iter, other_iter):
  294. if k == 0:
  295. # start self interval
  296. self_interval = i
  297. if other_interval is None:
  298. out_start = ts
  299. elif k == 1:
  300. # start other interval
  301. other_interval = i
  302. if out_start is not None and out_start != ts:
  303. yield self_interval.subset(out_start, ts)
  304. out_start = None
  305. elif k == 2:
  306. # end self interval
  307. if out_start is not None and out_start != ts:
  308. yield self_interval.subset(out_start, ts)
  309. out_start = None
  310. self_interval = None
  311. elif k == 3:
  312. # end other interval
  313. other_interval = None
  314. if self_interval:
  315. out_start = ts
  316. cpdef intersects(self, Interval other):
  317. """Return True if this IntervalSet intersects another interval"""
  318. for n in self.tree.intersect(other.start, other.end):
  319. if n.obj.intersects(other):
  320. return True
  321. return False
  322. def find_end(self, double t):
  323. """
  324. Return an Interval from this tree that ends at time t, or
  325. None if it doesn't exist.
  326. """
  327. n = self.tree.find_left_end(t)
  328. if n and n.obj.end == t:
  329. return n.obj
  330. return None