Source code for set_operations

# labtools, Copyright (C) 2017 Jerry Fowler and Paul Scheet.
# This program comes with ABSOLUTELY NO WARRANTY. It is licensed under
# GNU GPL Version 3. License and warranty may be viewed in the manual.
'''
Some set operations that are particularly useful for visualizing
set differences
'''

import os
import sys
from collections import OrderedDict, defaultdict
import itertools as iter

from labtools import misc
from labtools import const
from labtools import genomics

[docs]def combinations(keys, issorted=True): ''' Return a list of comma-spliced combinations of the given keys ''' if issorted: keys = sorted(keys) return [const.COMMA.join(c) for r in range(len(keys)+1, 0, -1) for c in iter.combinations(keys, r) ]
def intersect_from_dict(keylist, d): isect = d[keylist[0]] for k in keylist[1:]: isect = isect & d[k] return isect
[docs]def set_combinations(setdict): ''' Build and return a dict of sets keyed by all combinations of the dictionary's keys ''' keys = list(setdict.keys()) intersections = OrderedDict() for r in range(1, len(keys)+1): for c in iter.combinations(keys, r): combokey = const.COMMA.join(c) intersections[combokey] = intersect_from_dict(c, setdict) return intersections
[docs]def obsolete_sort_setsize_and_keylength(x, y): ''' Specific to the output of set_combinations or a dict of corresponding set sizes. Sort pairs by the numeric (second element) followed by number of comma-separated elements in the key (first element) Return descending order of size and keylength, ascending alphetic key names ''' if isinstance(x[1], int): x1 = x[1] y1 = y[1] else: x1 = len(x[1]) y1 = len(y[1]) return (cmp(y1, x1) or cmp(len(y[0].split(const.COMMA)), len(x[0].split(const.COMMA))) or cmp(x[0], y[0]))
[docs]def sort_by_setsize_and_keylength(rlist): ''' Specific to the output of set_combinations or a dict of corresponding set sizes. Sort pairs by the numeric (second element) followed by number of 1 comma-separated elements in the key (first element) Return descending order of size and keylength, ascending alphetic key names ''' for func, dir in [(lambda x: x[0], False), (lambda x: len(x[0].split(const.COMMA)), True), (lambda x: x[1] if isinstance(x[1], int) else len(x[1]), True),]: rlist = sorted(rlist, key=func, reverse=dir) return rlist
[docs]def maximal_subsets(intersections): ''' takes output of set_combinations and eliminates keys whose elements are strict subsets of other keys. returns the set of maximal sets and their lengths. ''' res = list(zip(list(intersections.keys()), [len(intersections[k]) for k in list(intersections.keys())])) print(res) ressort = sort_by_setsize_and_keylength(res) print(ressort) maximal = [] oldcount = -1 # bugfix: a set could contain 0, so that was a bad idea for r in ressort: if oldcount != r[1]: oldcount = r[1] matches = [set(r[0].split(const.COMMA))] maximal.append(r) else: testset = set(r[0].split(const.COMMA)) if not max([testset < match for match in matches]): # this set is not a subset of any of the maximals, keep it. maximal.append(r) matches.append(testset) return maximal
[docs]def select_keyset_of_largest_intersection(setdict, intersected_combinations=None, howmany=3): ''' Expects a dict of sets, and computes the intersections of all combinations of those sets, and returns a reduced dict of the howmany keys in setdict whose sets have the largest intersection. intersected_combinations can be provided to facilitate re-use of a one-time calculation, but no effort is made to determine that it was calculated from setdict. howmany is the size of the subset of setdict.keys() that should be measured. Default of 3 facilitates 3-way Venn diagrams. ''' if intersected_combinations is None: intersected_combinations = set_combinations(setdict) pairs = sort_by_setsize_and_keylength([[combokey, intersected_combinations[combokey]] for combokey in list(intersected_combinations.keys()) if len(combokey.split(const.COMMA)) == howmany]) keys = pairs[0][0].split(const.COMMA) if len(pairs) else [] return OrderedDict([[key, setdict[key]] for key in keys])
MEET = 'meet' JOIN = 'join' MEET_ONLY = [MEET] FULL_GRAPH = [MEET, JOIN] def _meet_subst(key, meet, join): return meet if key == MEET else join if key == JOIN else key
[docs]def write_distance_matrix(sets, dist_matrix, matrix=sys.stdout, map=None, phylip=True, rooting=[]): ''' Given a dictionary of named sets, and a jaccard distance between them, write a distance matrix suitable for input to phylip or phylip-like tree-ing packages. matrix (the output) defaults to stdout. ''' keys = list(sets.keys()) if MEET in rooting: keys.append(MEET) if JOIN in rooting: keys.append(JOIN) printkeys = [map[key] if map and key in map else key for key in keys] if phylip: printkeys = [key[:10] for key in printkeys] format = '%-10s' else: format = '%%-%ds' % (max([len(k) for k in printkeys])) print('%5d' % (len(keys)), file=matrix) for idx in range(len(keys)): key = keys[idx] row = [format % (printkeys[idx])] row.extend(['%6.4f' % (tuple[1]) for tuple in dist_matrix if tuple[0][1] == key]) row.append( '%6.4f' % (0.0)) row.extend(['%6.4f' % (tuple[1]) for tuple in dist_matrix if tuple[0][0] == key]) print(const.SPACE.join(row), file=matrix)