Source code for scipy_set_operations
#!/usr/bin/env python
'''
Extend set_operations with some scipy-dependent stuff, to
confine the dependency here.
'''
from __future__ import print_function
import os
import sys
VERSION = 1.1
from collections import OrderedDict, defaultdict
import copy
import itertools as iter
try:
import numpy
from scipy.spatial import distance
except ImportError as ie:
raise LabtoolsWarning('{} uses numpy/scipy. Please install or fix PYTHONPATH'.format(sys.argv[0]),
file=sys.stderr)
import misc
import const
import genomics
from labexceptions import LabtoolsWarning
MEET = 'meet'
JOIN = 'join'
all_are_true = min
one_is_true = max
[docs]def meet_join_sort(x, y):
'''
Makes the assumption that the sorted pairs were built from an array
of keys where MEET was the first and JOIN the last if they exist.
'''
x0 = x[0]
y0 = y[0]
if not one_is_true([mj in x0 for mj in [MEET, JOIN]]):
if not one_is_true([mj in y0 for mj in [MEET, JOIN]]):
return cmp(x0, y0)
# mj in y0
return -1 if JOIN in y0 else 1
# mj in x0, possibly in y
if not one_is_true([mj in y0 for mj in [MEET, JOIN]]):
return 1 if JOIN in x0 else -1
# mj in both
# note, the element meet,join is unique, so we don't need to
# worry about comparing it to itself.
if all_are_true([MEET in a for a in [x0, y0]]):
if one_is_true([JOIN in a for a in [x0, y0]]):
return -1 if JOIN in y0 else 1
# MEET in both, no JOIN in either
return cmp(x0, y0)
# possible join and possible 1 meet
if all_are_true([JOIN in a for a in [x0, y0]]):
if one_is_true([MEET in a for a in [x0, y0]]):
return 1 if MEET in y0 else -1
# JOIN in both, no MEET in either
return cmp(x0, y0)
# each has at most one or the other
if one_is_true([MEET in a for a in [x0, y0]]):
return -1 if MEET in x0 else 1
# gotta be a single join
return 1 if JOIN in x0 else -1
def find_distance_function(df):
import types
try:
dfx = eval('distance.%s' % (df))
except AttributeError:
raise LabtoolsWarning('No function named scipy.distance.%s' % df)
if not isinstance(dfx, types.FunctionType):
raise LabtoolsWarning('No function named scipy.distance.%s' % df)
return dfx
def nan_distance(array1, array2, function=distance.jaccard, nan_value=0):
j = function(array1, array2)
return nan_value if numpy.isnan(j) else j
[docs]def compute_distance(setdict, distance_function=None, nan_value=0, rooting=[]):
'''
Computes the distances of all pairs of sets in setdict according to specified distance_function,
jaccard by default.
returns an ordered map of set pairnames to their distances.
rooting can be MEET, JOIN, both, or (default) neither.
MEET implies add a lower bound for the set of distances (all zeroes)
JOIN implies add an upper bound for the set of distances (all ones)
nan_value is substituted for distance computations that produce numpy.NaN, default zero.
'''
if distance_function is None:
distance_function = distance.jaccard
elif isinstance(distance_function, str):
distance_function = find_distance_function(distance_function)
if rooting:
all_terms = [term for sublist in setdict.values() for term in sublist]
setdict = copy.copy(setdict) # don't alter the original
if one_is_true([r not in [MEET, JOIN] for r in rooting]):
raise LabtoolsWarning('Bad rooting option %r. Good ones are % and %s only' %
(rooting, MEET, JOIN))
if MEET in rooting:
setmin = []
for j in range(len(setdict.values()[0])):
setmin.append(min([v[j] for v in setdict.values()]))
#setmin = [min(v[j]) for j in range(len(setdict.values()[0])) for v in setdict.values()]
setdict[MEET] = setmin
if JOIN in rooting:
setmax = []
for j in range(len(setdict.values()[0])):
setmax.append(max([v[j] for v in setdict.values()]))
#setmax = [max(v[j]) for v in setdict.values() for j in range(len(setdict.values()[0]))]
setdict[JOIN] = setmax
computed_distance = [[pair, nan_distance(setdict[pair[0]],
setdict[pair[1]],
distance_function,
nan_value)]
for pair in iter.combinations(setdict.keys(), 2)]
return computed_distance