# source:orange/orange/orngInteractions.py@9351:f6cef38717a7

Revision 9351:f6cef38717a7, 9.1 KB checked in by blaz <blaz.zupan@…>, 2 years ago (diff)

minor (removed the authorship)

Line
1"""
Feature interaction analysis. Implements 3-way attribute interactions as proposed
by Aleks Jakulin in his Ph.D. thesis. Replaces his own Orange module (orngInteract)
for reasons of speed and a simpler interface. Introduces sampling-based p-value
estimation.
6"""
7
8import operator
9import random
10from math import log
11
12import orange
13import orngTest
14import orngStat
15import orngCI
16import itertools
17import warnings
18import copy
19import bisect
20
21from collections import defaultdict
22
23#def entropy_frequency_matrix(m):
24#    n = sum(sum(v) for v in m)
25#    return -sum(sum(plogp(x/n) for x in v) for v in m)
26
def flatten(matrix):
    """Return a flat list of the elements of matrix, taken row by row.

    matrix is any iterable whose items are themselves iterable (for
    example the rows of a contingency table). An empty matrix gives an
    empty list.
    """
    # chain.from_iterable concatenates the rows in one linear pass; the
    # earlier reduce(operator.add, ...) copied the growing list on every
    # step and raised TypeError when matrix had no rows.
    return list(itertools.chain.from_iterable(matrix))
30
def p2f(vector):
    """Normalise a sequence of frequencies into probabilities.

    Returns a new list in which every element of vector is divided by the
    (float) sum of all elements.
    """
    total = float(sum(vector))
    probabilities = []
    for frequency in vector:
        probabilities.append(frequency / total)
    return probabilities
35
37    """print out orange's sym matrix"""
38    print
39    if headers: print "    " + " ".join(["%-7d" % e for e in range(len(sym[0]))])
40    for (i, line) in enumerate(sym):
41        if headers: print "%2d" % i,
42        print " ".join(["%7.4f" % e for e in line])
43
def attribute_pairs(data, n=None):
    """Iterate over every unordered pair of attributes in data's domain.

    The n argument is accepted but not used.
    """
    domain_attributes = data.domain.attributes
    pair_size = 2
    return itertools.combinations(domain_attributes, pair_size)
47
48# information-based scoring
49
def plogp(p):
    """Return p * log2(p), taking the value at p == 0 to be 0."""
    if p == 0.:
        # log(0) is undefined; the term is defined as zero instead.
        return 0.
    return p * log(p, 2.)
52
53def _entropy(ps):
54    """entropy computed from the vector of probabilities"""
55    return -sum(plogp(p) for p in ps)
56
def entropy(x, data):
    """Return the entropy of x estimated from dataset data.

    x is either a single discrete attribute (an orange.EnumVariable), in
    which case its entropy H(x) is returned, or a list/tuple of exactly two
    attributes, in which case their joint entropy H(x[0], x[1]) is returned.

    For any other argument -- including collections of three or more
    attributes, for which joint entropy is not implemented -- None is
    returned.
    """
    if isinstance(x, orange.EnumVariable):
        return _entropy(p2f(orange.Distribution(x, data)))
    if isinstance(x, (list, tuple)) and len(x) == 2:
        # Joint entropy of a pair of attributes. This branch used to pass an
        # undefined name ``y`` to ContingencyAttrAttr and always failed.
        first, second = x
        contingency = orange.ContingencyAttrAttr(first, second, data)
        return _entropy(p2f(flatten(contingency)))
    # Unsupported argument (e.g. a larger set of attributes): keep the
    # historical behaviour of returning None.
    return None
67
def joint_entropy(x, y, data):
    """Joint entropy H(x,y) of attributes x and y, estimated from data."""
    contingency = orange.ContingencyAttrAttr(x, y, data)
    cells = flatten(contingency)
    probabilities = p2f(cells)
    return _entropy(probabilities)
72
def conditional_entropy(x, y, data):
    """Return H(x|y), the conditional entropy of attribute x given y.

    Uses the identity H(x|y) = H(x,y) - H(y).

    NOTE: a function with this same name is defined again further down in
    this module; after import, that later definition is the one bound to
    the name.
    """
    # Subtract the entropy of the conditioning attribute y; subtracting
    # H(x) here would give H(y|x) instead.
    return joint_entropy(x, y, data) - entropy(y, data)
76
class Mutual_information():
    """Callable that scores mutual information between two variables.

    The entropy of every variable in data.domain is computed once, when the
    object is constructed, and reused by each call.
    """

    def __init__(self, data):
        self.data = data
        # Per-variable entropies, keyed by the variable itself.
        entropies = {}
        for variable in data.domain:
            entropies[variable] = entropy(variable, data)
        self.hs = entropies

    def __call__(self, x, y):
        """Return H(x) + H(y) - H(x,y) for variables x and y."""
        joint = joint_entropy(x, y, self.data)
        return self.hs[x] + self.hs[y] - joint
84
def mutual_information_matrix(data):
    """Return a symmetric matrix of mutual information for attribute pairs.

    Element [a, b] (a < b) holds the mutual information between the
    attributes at positions a and b of data.domain.attributes. Diagonal
    elements are not assigned here.
    """
    atts = data.domain.attributes
    # No function named ``mutual_information`` exists in this module, so the
    # original call failed with NameError. The Mutual_information scorer
    # computes the same quantity and caches the per-variable entropies.
    mutual_information = Mutual_information(data)
    mis = orange.SymMatrix(len(atts))
    for a in range(len(atts) - 1):
        for b in range(a + 1, len(atts)):
            mis[a, b] = mutual_information(atts[a], atts[b])
    return mis
93
def conditional_entropy(x, y, data):
    """Conditional entropy H(X|Y) of attribute X given attribute Y.

    Estimated from data as H(X,Y) - H(Y).
    """
    return joint_entropy(x, y, data) - entropy(y, data)
100
class Interaction:
    """
    Two-way attribute interactions (feature synergies).

    The interaction score of an attribute pair is the score that
    orange.MeasureAttribute_info assigns to the pair's Cartesian-product
    feature, minus the scores of the two attributes taken individually.
    Optionally, a reference distribution of scores is built from permuted
    data so that each call also reports an empirical p-value.
    """

    def __init__(self, data, p_values=False, samples=10000, permutations=100, permutation="class"):
        """
        data         -- dataset whose attribute pairs are scored; its domain
                        must have a class variable
        p_values     -- when true, build the permuted score distribution now
                        and make calls return a p-value as well
        samples      -- total number of permuted scores to draw; they are
                        split evenly (integer division) over the permutations
        permutations -- number of times the data is permuted
        permutation  -- "class" to shuffle class values among examples, or
                        "aic" to shuffle each attribute's values among the
                        examples of the same class

        Raises ValueError for any other value of permutation.
        """
        score_dist_fns = {
            "class": self.compute_score_dist,
            "aic": self.compute_score_dist_aic,
        }
        # Validate first: this used to call an undefined function and so
        # surfaced as an unrelated NameError.
        if permutation not in score_dist_fns:
            raise ValueError(
                "unknown permutation type %r (expected 'class' or 'aic')" % (permutation,))

        self.data = data
        self.measure = orange.MeasureAttribute_info
        self.gain = self.gains()
        self.class_entropy = entropy(data.domain.classVar, data)
        self.samples = samples
        self.permutations = permutations
        self.p_values = p_values
        if p_values:
            # Both compute_* methods return None; the attribute is kept for
            # backward compatibility.
            self.score_dist = score_dist_fns[permutation]()

    def gains(self):
        """Return a dict mapping each attribute of self.data to its score."""
        return dict([(a, self.measure(a, self.data)) for a in self.data.domain.attributes])

    def _collect_permuted_scores(self, permute, rand):
        """Fill self.permuted_scores with sorted scores from permuted data.

        permute is a zero-argument callable that re-permutes self.data in
        place; it is called once per permutation, after which the
        univariate gains are recomputed and randomly chosen attribute pairs
        are scored. self.gain is left at the last permuted value; callers
        restore it.
        """
        attributes = self.data.domain.attributes
        samples_in_permutations = self.samples // self.permutations
        self.permuted_scores = []

        for _ in range(self.permutations):
            permute()
            self.gain = self.gains()  # recompute univariate gains for permuted data
            scores = [self.get_score(*rand.sample(attributes, 2))
                      for _sample in range(samples_in_permutations)]
            self.permuted_scores.extend(scores)

        self.permuted_scores_len = float(len(self.permuted_scores))
        self.permuted_scores.sort()

    def compute_score_dist(self, rand=random):
        """Distribution (list) of interaction scores obtained by permuting
        the class values among the examples of self.data.

        The class values and self.gain are put back to their original state
        afterwards, also when scoring raises an exception.
        """
        orig_classvalues = [d.getclass() for d in self.data]
        classvalues = copy.copy(orig_classvalues)
        orig_gain = self.gain

        def permute_class():
            rand.shuffle(classvalues)
            for v, d in itertools.izip(classvalues, self.data):
                d.setclass(v)

        try:
            self._collect_permuted_scores(permute_class, rand)
        finally:
            # restore class values and gains to their original values
            for v, d in itertools.izip(orig_classvalues, self.data):
                d.setclass(v)
            self.gain = orig_gain

    def _shuffle_attribute(self, data, attribute, locations):
        """Move the values of attribute between examples of data, in place.

        locations is a transposition vector: the value of example i is
        transferred to example locations[i].
        """
        attribute = data.domain[attribute]
        moved = [None] * len(data)
        for i in range(len(data)):
            moved[locations[i]] = data[i][attribute]
        for i in range(len(data)):
            data[i][attribute] = moved[i]

    def _permute_attributes_in_class(self, data, rand):
        """Shuffle every attribute of data, in place, among the examples
        that share the same class value.

        A separate random permutation is drawn for each attribute.
        """
        # Indices of examples grouped by class value.
        by_class = defaultdict(list)
        for i, ex in enumerate(data):
            by_class[ex.getclass().value].append(i)
        groups = by_class.values()

        for at in data.domain.attributes:
            # Build a transposition vector that only moves values within a
            # group.
            perm = [None] * len(data)
            for indices in groups:
                shuffled = copy.copy(indices)
                rand.shuffle(shuffled)
                for old, new in zip(indices, shuffled):
                    perm[old] = new
            self._shuffle_attribute(data, at, perm)

    def compute_score_dist_aic(self, rand=random):
        """Distribution (list) of interaction scores obtained by permuting
        attribute values among examples of the same class.

        The permutation is carried out on a copy of the data; self.data and
        self.gain are the same objects after the call as before it, also
        when scoring raises an exception.
        """
        original_data = self.data
        orig_gain = self.gain
        # Work on a private table so the caller's examples are never
        # modified. NOTE(review): relies on ExampleTable(domain, table)
        # producing an independent copy, as the previous code also did.
        working = orange.ExampleTable(original_data.domain, original_data)
        self.data = working  # get_score() and gains() read self.data
        try:
            self._collect_permuted_scores(
                lambda: self._permute_attributes_in_class(working, rand), rand)
        finally:
            self.data = original_data
            self.gain = orig_gain  # restore original gains

    def get_score(self, a1, a2):
        """Return the interaction score of attributes a1 and a2: the score
        of their Cartesian-product feature minus both individual gains."""
        return orngCI.FeatureByCartesianProduct(self.data, (a1, a2), measure=self.measure)[1] - self.gain[a1] - self.gain[a2]

    def __call__(self, a1, a2):
        """Return two-attribute interaction and proportion of explained class entropy.

        When p-values were requested, a third element is appended: the
        fraction of permuted scores that are strictly greater than the
        observed score.
        """
        score = self.get_score(a1, a2)
        if self.p_values:
            return score, score/self.class_entropy, 1.0 - bisect.bisect(self.permuted_scores, score)/self.permuted_scores_len
        else:
            return score, score/self.class_entropy
233
234#a1, a2 = data.domain.attributes[0], data.domain.attributes[1]
235#ab, quality = orngCI.FeatureByCartesianProduct(data, [a1, a2], measure=orange.MeasureAttribute_info)
236#r = mutual_information(a1, a2, data)
237
238# meas = orange.MeasureAttribute_info()
239
def test():
    """Print entropy-based statistics for the second and third attribute.

    NOTE(review): reads a module-level name ``data`` that is not defined in
    the visible part of this module -- presumably it has to be assigned
    before calling; confirm.
    """
    x = data.domain.attributes[1]
    y = data.domain.attributes[2]
    c = data.domain.classVar
    # No function named ``mutual_information`` exists in this module; use
    # the Mutual_information scorer instead.
    mutual_information = Mutual_information(data)
    print("H(%s) = %5.5f" % (x.name, _entropy(p2f(orange.Distribution(x, data)))))
    print("H(%s) = %5.5f" % (y.name, _entropy(p2f(orange.Distribution(y, data)))))
    print("H(%s,%s)= %5.5f" % (x.name, y.name, joint_entropy(x, y, data)))
    print("I(%s;%s)= %5.5f" % (x.name, y.name, mutual_information(x, y)))
    # The label announces a conditional entropy, so print H(x|c) rather
    # than the mutual information that was printed here before.
    print("H(%s|%s)= %5.5f" % (x.name, c.name, conditional_entropy(x, c, data)))
    print("InfoGain = %5.5f" % orange.MeasureAttribute_info(x, data))
Note: See TracBrowser for help on using the repository browser.