# Copyright (C) 2015, 2016 VIB/BEG/UGent - Tim Diels <timdiels.m@gmail.com>
#
# This file is part of Chicken Turtle Util.
#
# Chicken Turtle Util is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Chicken Turtle Util is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Chicken Turtle Util. If not, see <http://www.gnu.org/licenses/>.
'''
Set utilities. Contains only `merge_by_overlap`, merges overlapping sets in place.
'''
def _locate_bin(bins, n):
"""
Find the bin where list n has ended up: Follow bin references until
we find a bin that has not moved.
"""
while bins[n] != n:
n = bins[n]
return n
# XXX test performance compared to original alg
# Original implementation by http://stackoverflow.com/users/699305/alexis
# at http://stackoverflow.com/a/9453249/1031434
# Modified slightly
[docs]def merge_by_overlap(sets):
'''
Of a list of sets, merge those that overlap, in place.
The result isn't necessarily a subsequence of the original `sets`.
Parameters
----------
sets : [{any}]
Sets of which to merge those that overlap. Empty sets are ignored.
Notes
-----
Implementation is based on `this StackOverflow answer`_. It outperforms all
other algorithms in the thread (visited at dec 2015) on python3.4 using a
wide range of inputs.
.. _this StackOverflow answer: http://stackoverflow.com/a/9453249/1031434
Examples
--------
>>> merge_by_overlap([{1,2}, set(), {2,3}, {4,5,6}, {6,7}])
[{1,2,3}, {4,5,6,7}]
'''
data = sets
bins = list(range(len(data))) # Initialize each bin[n] == n
nums = dict()
for r, row in enumerate(data):
if not row:
data[r] = None
else:
for num in row:
if num not in nums:
# New number: tag it with a pointer to this row's bin
nums[num] = r
continue
else:
dest = _locate_bin(bins, nums[num])
if dest == r:
continue # already in the same bin
if dest > r:
dest, r = r, dest # always merge into the smallest bin
data[dest].update(data[r])
data[r] = None
# Update our indices to reflect the move
bins[r] = dest
r = dest
# Remove empty bins
for i in reversed(range(len(data))):
if not data[i]:
del data[i]