# Source: orange-bioinformatics/_bioinformatics/obiExperiments.py
# Revision 1636:10d234fdadb9, 4.4 KB, checked in by mitar (diff)

2 | from collections import defaultdict | |

3 | from operator import add | |

4 | import numpy | |

5 | import math | |

6 | ||

def data_type(vals):
    """Infer a converter for the given values.

    Returns ``int`` if every value parses as an integer, ``float`` if every
    value parses as a float, and an identity function otherwise.
    """
    try:
        for v in vals:
            int(v)
    except ValueError:
        pass
    else:
        return int
    try:
        for v in vals:
            float(v)
    except ValueError:
        # Not numeric at all: leave values untouched.
        return lambda x: x
    return float

17 | ||

def separate_by(data, separate, ignore=(), consider=None, add_empty=True):
    """Group data columns by chosen annotation keys.

    data: the data set - annotations are saved in ``at.attributes`` of each
        attribute in ``data.domain.attributes``
    separate: keys of ``at.attributes`` by which to separate
    ignore: ignore values of these annotation keys
    consider: consider only these annotation keys
    add_empty: pad each group with ``None`` placeholders so that all groups
        cover the same multiset of "relevant" annotation values

    Returns ``(ngroups, uniquepos)`` where ``ngroups`` maps each separation
    tuple to a list of column indices (or ``None`` for padding), and
    ``uniquepos`` marks positions whose relevant values are unique within
    the group.
    """
    # NOTE: default changed from a mutable [] to an immutable (); behavior
    # is identical since ignore is only read.
    ignore = set(ignore)

    annotations = [at.attributes for at in data.domain.attributes]

    # All values seen for each annotation key.
    all_values = defaultdict(set)
    for ann in annotations:
        for key, value in ann.items():  # .items() works on Python 2 and 3
            all_values[key].add(value)

    # Best-guess converter (int/float/identity) per annotation key.
    types = {}
    for key, vals in all_values.items():
        types[key] = data_type(vals)

    # Group column indices by their tuple of separation values.
    groups = defaultdict(list)
    for i, ann in enumerate(annotations):
        groups[tuple(ann[k] for k in separate)].append(i)

    # Keys that are either unique per column or constant carry no grouping
    # information.
    different_in_all = set(k for k, vals in all_values.items()
                           if len(vals) == len(annotations) or len(vals) == 1)

    other_relevant = set(all_values) - different_in_all - ignore - set(separate)
    if consider is not None:
        other_relevant &= set(consider)
    other_relevant = sorted(other_relevant)  # TODO how to order them?

    def relevant_vals(annotation):
        if isinstance(annotation, tuple):
            return annotation
        return tuple(types[k](annotation[k]) for k in other_relevant)

    # "Multiset": maximum number of occurrences of each relevant-value
    # tuple within any single group.
    other_relevant_d2 = defaultdict(int)
    for g in groups.values():
        counts = defaultdict(int)
        for i in g:
            counts[relevant_vals(annotations[i])] += 1
        for rv, n in counts.items():
            if n > other_relevant_d2[rv]:
                other_relevant_d2[rv] = n

    if add_empty:  # fill in with "empty" relevant vals
        ngroups = {}
        for g in groups:
            need_to_fill = other_relevant_d2.copy()
            for i in groups[g]:
                need_to_fill[relevant_vals(annotations[i])] -= 1
            padding = []
            for rv, num in need_to_fill.items():
                padding.extend([rv] * num)
            ngroups[g] = groups[g] + padding
        groups = ngroups

    ngroups = {}
    uniquepos = {}  # which positions are unique
    for g in groups:
        elements = list(groups[g])

        rv2 = lambda x: relevant_vals(annotations[x] if isinstance(x, int) else x)

        # Sort once; placeholders (non-int entries) become None.
        ordered = sorted(elements, key=rv2)
        ngroups[g] = [x if isinstance(x, int) else None for x in ordered]

        # Count occurrences of each relevant-value tuple in the group.
        counts = defaultdict(int)
        for e in elements:
            counts[rv2(e)] += 1

        uniquepos[g] = [counts[rv2(x)] <= 1 for x in ordered]

    return ngroups, uniquepos

97 | ||

def float_or_none(value):
    """Return the raw ``.value`` of *value*, or ``None`` if it is unknown ("?")."""
    raw = value.value
    if raw == "?":
        return None
    return raw

100 | ||

def linearize(data, ids):
    """Return a flat list of values from the data subspace.

    For each id in *ids*, appends one value per example in *data*
    (via ``float_or_none``); an id of ``None`` contributes a run of
    ``None`` placeholders instead. Unknown values become ``None``.

    Replaces the Python-2-only ``reduce(add, ...)`` flattening with
    explicit accumulation (``reduce`` is not a builtin on Python 3).
    """
    out = []
    for id1 in ids:
        if id1 is None:
            out.extend([None] * len(data))
        else:
            out.extend(float_or_none(ex[id1]) for ex in data)
    return out

108 | ||

def pearson_lists(l1, l2):
    """Return the Pearson correlation between two lists.

    Pairs where either element is None are ignored.
    """
    pairs = [(a, b) for a, b in zip(l1, l2) if a != None and b != None]
    xs = [p[0] for p in pairs]
    ys = [p[1] for p in pairs]
    return numpy.corrcoef([xs, ys])[0, 1]

114 | ||

def euclidean_lists(l1, l2):
    """Return the Euclidean distance between two lists.

    Pairs where either element is None are ignored.
    (Docstring fixed: it previously claimed to return a Pearson
    correlation, copied from ``pearson_lists``.)
    """
    okvals = [(a, b) for a, b in zip(l1, l2) if a is not None and b is not None]
    return math.sqrt(sum((a - b) * (a - b) for a, b in okvals))

120 | ||

def spearman_lists(l1, l2):
    """Return the Spearman rank correlation between two lists.

    Pairs where either element is None are ignored.
    (Docstring fixed: it previously claimed to return a Pearson
    correlation, copied from ``pearson_lists``.)
    """
    import scipy.stats  # local import keeps scipy optional for the module
    okvals = [(a, b) for a, b in zip(l1, l2) if a is not None and b is not None]
    return scipy.stats.spearmanr([v[0] for v in okvals],
                                 [v[1] for v in okvals])[0]

128 | ||

def dist_spearman(l1, l2):
    """Spearman-correlation distance, normalized to the range [0, 1]."""
    r = spearman_lists(l1, l2)
    return (1. - r) / 2

131 | ||

def dist_pcorr(l1, l2):
    """Pearson-correlation distance, normalized to the range [0, 1]."""
    r = pearson_lists(l1, l2)
    return (1. - r) / 2

135 | ||

def dist_eucl(l1, l2):
    """Euclidean distance between two lists (None pairs ignored)."""
    return euclidean_lists(l1, l2)

**Note:** See TracBrowser for help on using the repository browser.