Source code for scipy_set_operations

#!/usr/bin/env python
'''
Extend set_operations with some scipy-dependent stuff, to
confine the dependency here.
'''

from __future__ import print_function

import os
import sys

VERSION = 1.1

from collections import OrderedDict, defaultdict
import copy
import itertools as iter


try:
    import numpy
    from scipy.spatial import distance
except ImportError as ie:
    raise LabtoolsWarning('{} uses numpy/scipy. Please install or fix PYTHONPATH'.format(sys.argv[0]),
                          file=sys.stderr)


import misc
import const
import genomics
from labexceptions import LabtoolsWarning

MEET = 'meet'
JOIN = 'join'

all_are_true = min
one_is_true = max

[docs]def meet_join_sort(x, y): ''' Makes the assumption that the sorted pairs were built from an array of keys where MEET was the first and JOIN the last if they exist. ''' x0 = x[0] y0 = y[0] if not one_is_true([mj in x0 for mj in [MEET, JOIN]]): if not one_is_true([mj in y0 for mj in [MEET, JOIN]]): return cmp(x0, y0) # mj in y0 return -1 if JOIN in y0 else 1 # mj in x0, possibly in y if not one_is_true([mj in y0 for mj in [MEET, JOIN]]): return 1 if JOIN in x0 else -1 # mj in both # note, the element meet,join is unique, so we don't need to # worry about comparing it to itself. if all_are_true([MEET in a for a in [x0, y0]]): if one_is_true([JOIN in a for a in [x0, y0]]): return -1 if JOIN in y0 else 1 # MEET in both, no JOIN in either return cmp(x0, y0) # possible join and possible 1 meet if all_are_true([JOIN in a for a in [x0, y0]]): if one_is_true([MEET in a for a in [x0, y0]]): return 1 if MEET in y0 else -1 # JOIN in both, no MEET in either return cmp(x0, y0) # each has at most one or the other if one_is_true([MEET in a for a in [x0, y0]]): return -1 if MEET in x0 else 1 # gotta be a single join return 1 if JOIN in x0 else -1
def find_distance_function(df): import types try: dfx = eval('distance.%s' % (df)) except AttributeError: raise LabtoolsWarning('No function named scipy.distance.%s' % df) if not isinstance(dfx, types.FunctionType): raise LabtoolsWarning('No function named scipy.distance.%s' % df) return dfx def nan_distance(array1, array2, function=distance.jaccard, nan_value=0): j = function(array1, array2) return nan_value if numpy.isnan(j) else j
[docs]def compute_distance(setdict, distance_function=None, nan_value=0, rooting=[]): ''' Computes the distances of all pairs of sets in setdict according to specified distance_function, jaccard by default. returns an ordered map of set pairnames to their distances. rooting can be MEET, JOIN, both, or (default) neither. MEET implies add a lower bound for the set of distances (all zeroes) JOIN implies add an upper bound for the set of distances (all ones) nan_value is substituted for distance computations that produce numpy.NaN, default zero. ''' if distance_function is None: distance_function = distance.jaccard elif isinstance(distance_function, str): distance_function = find_distance_function(distance_function) if rooting: all_terms = [term for sublist in setdict.values() for term in sublist] setdict = copy.copy(setdict) # don't alter the original if one_is_true([r not in [MEET, JOIN] for r in rooting]): raise LabtoolsWarning('Bad rooting option %r. Good ones are % and %s only' % (rooting, MEET, JOIN)) if MEET in rooting: setmin = [] for j in range(len(setdict.values()[0])): setmin.append(min([v[j] for v in setdict.values()])) #setmin = [min(v[j]) for j in range(len(setdict.values()[0])) for v in setdict.values()] setdict[MEET] = setmin if JOIN in rooting: setmax = [] for j in range(len(setdict.values()[0])): setmax.append(max([v[j] for v in setdict.values()])) #setmax = [max(v[j]) for v in setdict.values() for j in range(len(setdict.values()[0]))] setdict[JOIN] = setmax computed_distance = [[pair, nan_distance(setdict[pair[0]], setdict[pair[1]], distance_function, nan_value)] for pair in iter.combinations(setdict.keys(), 2)] return computed_distance