You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

330 lines
11 KiB

  1. """Interval, IntervalSet
  2. Represents an interval of time, and a set of such intervals.
  3. Intervals are half-open, ie. they include data points with timestamps
  4. [start, end)
  5. """
  6. # First implementation kept a sorted list of intervals and used
  7. # biesct() to optimize some operations, but this was too slow.
  8. # Second version was based on the quicksect implementation from
  9. # python-bx, modified slightly to handle floating point intervals.
  10. # This didn't support deletion.
  11. # Third version is more similar to the first version, using a rb-tree
  12. # instead of a simple sorted list to maintain O(log n) operations.
  13. # Fourth version is an optimized rb-tree that stores interval starts
  14. # and ends directly in the tree, like bxinterval did.
  15. cimport rbtree
  16. cdef extern from "stdint.h":
  17. ctypedef unsigned long long uint64_t
  18. class IntervalError(Exception):
  19. """Error due to interval overlap, etc"""
  20. pass
  21. cdef class Interval:
  22. """Represents an interval of time."""
  23. cdef public double start, end
  24. def __init__(self, double start, double end):
  25. """
  26. 'start' and 'end' are arbitrary floats that represent time
  27. """
  28. if start > end:
  29. # Explicitly disallow zero-width intervals (since they're half-open)
  30. raise IntervalError("start %s must precede end %s" % (start, end))
  31. self.start = float(start)
  32. self.end = float(end)
  33. def __repr__(self):
  34. s = repr(self.start) + ", " + repr(self.end)
  35. return self.__class__.__name__ + "(" + s + ")"
  36. def __str__(self):
  37. return "[" + repr(self.start) + " -> " + repr(self.end) + ")"
  38. def __cmp__(self, Interval other):
  39. """Compare two intervals. If non-equal, order by start then end"""
  40. if not isinstance(other, Interval):
  41. raise TypeError("bad type")
  42. if self.start == other.start:
  43. if self.end < other.end:
  44. return -1
  45. if self.end > other.end:
  46. return 1
  47. return 0
  48. if self.start < other.start:
  49. return -1
  50. return 1
  51. cpdef intersects(self, Interval other):
  52. """Return True if two Interval objects intersect"""
  53. if (self.end <= other.start or self.start >= other.end):
  54. return False
  55. return True
  56. cpdef subset(self, double start, double end):
  57. """Return a new Interval that is a subset of this one"""
  58. # A subclass that tracks additional data might override this.
  59. if start < self.start or end > self.end:
  60. raise IntervalError("not a subset")
  61. return Interval(start, end)
  62. cdef class DBInterval(Interval):
  63. """
  64. Like Interval, but also tracks corresponding start/end times and
  65. positions within the database. These are not currently modified
  66. when subsets are taken, but can be used later to help zero in on
  67. database positions.
  68. The actual 'start' and 'end' will always fall within the database
  69. start and end, e.g.:
  70. db_start = 100, db_startpos = 10000
  71. start = 123
  72. end = 150
  73. db_end = 200, db_endpos = 20000
  74. """
  75. cpdef public double db_start, db_end
  76. cpdef public uint64_t db_startpos, db_endpos
  77. def __init__(self, start, end,
  78. db_start, db_end,
  79. db_startpos, db_endpos):
  80. """
  81. 'db_start' and 'db_end' are arbitrary floats that represent
  82. time. They must be a strict superset of the time interval
  83. covered by 'start' and 'end'. The 'db_startpos' and
  84. 'db_endpos' are arbitrary database position indicators that
  85. correspond to those points.
  86. """
  87. Interval.__init__(self, start, end)
  88. self.db_start = db_start
  89. self.db_end = db_end
  90. self.db_startpos = db_startpos
  91. self.db_endpos = db_endpos
  92. if db_start > start or db_end < end:
  93. raise IntervalError("database times must span the interval times")
  94. def __repr__(self):
  95. s = repr(self.start) + ", " + repr(self.end)
  96. s += ", " + repr(self.db_start) + ", " + repr(self.db_end)
  97. s += ", " + repr(self.db_startpos) + ", " + repr(self.db_endpos)
  98. return self.__class__.__name__ + "(" + s + ")"
  99. cpdef subset(self, double start, double end):
  100. """
  101. Return a new DBInterval that is a subset of this one
  102. """
  103. if start < self.start or end > self.end:
  104. raise IntervalError("not a subset")
  105. return DBInterval(start, end,
  106. self.db_start, self.db_end,
  107. self.db_startpos, self.db_endpos)
  108. cdef class IntervalSet:
  109. """
  110. A non-intersecting set of intervals.
  111. """
  112. cdef public rbtree.RBTree tree
  113. def __init__(self, source=None):
  114. """
  115. 'source' is an Interval or IntervalSet to add.
  116. """
  117. self.tree = rbtree.RBTree()
  118. if source is not None:
  119. self += source
  120. def __iter__(self):
  121. for node in self.tree:
  122. if node.obj:
  123. yield node.obj
  124. def __len__(self):
  125. return sum(1 for x in self)
  126. def __repr__(self):
  127. descs = [ repr(x) for x in self ]
  128. return self.__class__.__name__ + "([" + ", ".join(descs) + "])"
  129. def __str__(self):
  130. descs = [ str(x) for x in self ]
  131. return "[" + ", ".join(descs) + "]"
  132. def __match__(self, other):
  133. # This isn't particularly efficient, but it shouldn't get used in the
  134. # general case.
  135. """Test equality of two IntervalSets.
  136. Treats adjacent Intervals as equivalent to one long interval,
  137. so this function really tests whether the IntervalSets cover
  138. the same spans of time."""
  139. i = 0
  140. j = 0
  141. outside = True
  142. def is_adjacent(a, b):
  143. """Return True if two Intervals are adjacent (same end or start)"""
  144. if a.end == b.start or b.end == a.start:
  145. return True
  146. else:
  147. return False
  148. this = list(self)
  149. that = list(other)
  150. try:
  151. while True:
  152. if (outside):
  153. # To match, we need to be finished both sets
  154. if (i >= len(this) and j >= len(that)):
  155. return True
  156. # Or the starts need to match
  157. if (this[i].start != that[j].start):
  158. return False
  159. outside = False
  160. else:
  161. # We can move on if the two interval ends match
  162. if (this[i].end == that[j].end):
  163. i += 1
  164. j += 1
  165. outside = True
  166. else:
  167. # Whichever ends first needs to be adjacent to the next
  168. if (this[i].end < that[j].end):
  169. if (not is_adjacent(this[i],this[i+1])):
  170. return False
  171. i += 1
  172. else:
  173. if (not is_adjacent(that[j],that[j+1])):
  174. return False
  175. j += 1
  176. except IndexError:
  177. return False
  178. # Use __richcmp__ instead of __eq__, __ne__ for Cython.
  179. def __richcmp__(self, other, int op):
  180. if op == 2: # ==
  181. return self.__match__(other)
  182. elif op == 3: # !=
  183. return not self.__match__(other)
  184. return False
  185. #def __eq__(self, other):
  186. # return self.__match__(other)
  187. #
  188. #def __ne__(self, other):
  189. # return not self.__match__(other)
  190. def __iadd__(self, object other not None):
  191. """Inplace add -- modifies self
  192. This throws an exception if the regions being added intersect."""
  193. if isinstance(other, Interval):
  194. if self.intersects(other):
  195. raise IntervalError("Tried to add overlapping interval "
  196. "to this set")
  197. self.tree.insert(rbtree.RBNode(other.start, other.end, other))
  198. else:
  199. for x in other:
  200. self.__iadd__(x)
  201. return self
  202. def iadd_nocheck(self, Interval other not None):
  203. """Inplace add -- modifies self.
  204. 'Optimized' version that doesn't check for intersection and
  205. only inserts the new interval into the tree."""
  206. self.tree.insert(rbtree.RBNode(other.start, other.end, other))
  207. def __isub__(self, Interval other not None):
  208. """Inplace subtract -- modifies self
  209. Removes an interval from the set. Must exist exactly
  210. as provided -- cannot remove a subset of an existing interval."""
  211. i = self.tree.find(other.start, other.end)
  212. if i is None:
  213. raise IntervalError("interval " + str(other) + " not in tree")
  214. self.tree.delete(i)
  215. return self
  216. def __add__(self, other not None):
  217. """Add -- returns a new object"""
  218. new = IntervalSet(self)
  219. new += IntervalSet(other)
  220. return new
  221. def __and__(self, other not None):
  222. """
  223. Compute a new IntervalSet from the intersection of two others
  224. Output intervals are built as subsets of the intervals in the
  225. first argument (self).
  226. """
  227. out = IntervalSet()
  228. if not isinstance(other, IntervalSet):
  229. for i in self.intersection(other):
  230. out.tree.insert(rbtree.RBNode(i.start, i.end, i))
  231. else:
  232. for x in other:
  233. for i in self.intersection(x):
  234. out.tree.insert(rbtree.RBNode(i.start, i.end, i))
  235. return out
  236. def intersection(self, Interval interval not None, orig = False):
  237. """
  238. Compute a sequence of intervals that correspond to the
  239. intersection between `self` and the provided interval.
  240. Returns a generator that yields each of these intervals
  241. in turn.
  242. Output intervals are built as subsets of the intervals in the
  243. first argument (self).
  244. If orig = True, also return the original interval that was
  245. (potentially) subsetted to make the one that is being
  246. returned.
  247. """
  248. if not isinstance(interval, Interval):
  249. raise TypeError("bad type")
  250. for n in self.tree.intersect(interval.start, interval.end):
  251. i = n.obj
  252. if i:
  253. if i.start >= interval.start and i.end <= interval.end:
  254. if orig:
  255. yield (i, i)
  256. else:
  257. yield i
  258. else:
  259. subset = i.subset(max(i.start, interval.start),
  260. min(i.end, interval.end))
  261. if orig:
  262. yield (subset, i)
  263. else:
  264. yield subset
  265. cpdef intersects(self, Interval other):
  266. """Return True if this IntervalSet intersects another interval"""
  267. for n in self.tree.intersect(other.start, other.end):
  268. if n.obj.intersects(other):
  269. return True
  270. return False
  271. def find_end(self, double t):
  272. """
  273. Return an Interval from this tree that ends at time t, or
  274. None if it doesn't exist.
  275. """
  276. n = self.tree.find_left_end(t)
  277. if n and n.obj.end == t:
  278. return n.obj
  279. return None