source: orange/Orange/multilabel/mlknn.py @ 11459:fc07a5c346be

Revision 11459:fc07a5c346be, 8.8 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Fixed checks for passed dataset table argument in new methods.

Use 'instances is not None' idiom and not a boolean test to guard against cases
where the passed dataset length is 0.

Line 
1"""
2.. index:: ML-kNN Learner
3
4***************************************
5ML-kNN Learner
6***************************************
7
ML-kNN Classification is an adaptation of kNN for multi-label
classification.  In essence, ML-kNN uses the kNN algorithm
10independently for each label :math:`l`.  It finds the k nearest
11examples to the test instance and considers those that are labeled at
12least with :math:`l` as positive and the rest as negative.  What
13mainly differentiates this method from other binary relevance (BR)
14methods is the use of prior probabilities. ML-kNN can also rank labels.
15
16For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy
17learning approach to multi-label learning
18<http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_.  Pattern
19Recogn. 40, 7 (Jul. 2007), 2038-2048.
20
21.. index:: ML-kNN Learner
22.. autoclass:: Orange.multilabel.MLkNNLearner
23   :members:
24   :show-inheritance:
25
26   :param instances: a table of instances.
27   :type instances: :class:`Orange.data.Table`
28 
29
30.. index:: ML-kNN Classifier
31.. autoclass:: Orange.multilabel.MLkNNClassifier
32   :members:
33   :show-inheritance:
34
35Examples
36========
37
38The following example demonstrates a straightforward invocation of
39this algorithm (:download:`mlc-classify.py <code/mlc-classify.py>`):
40
41.. literalinclude:: code/mlc-classify.py
42   :lines: 6, 11-13
43
44"""
45import random
46import Orange
47import multiknn as _multiknn
48
49from lp import transform_to_powerset
50
class MLkNNLearner(_multiknn.MultikNNLearner):
    """
    Class implementing the ML-kNN (Multi-Label k Nearest Neighbours)
    algorithm. The class is based on the pseudo-code made available by
    the authors.

    The pseudo code of ML-kNN:

        :math:`[\\vec y_t,\\vec r_t] = ML-kNN(T,K,t,s)`

        :math:`\\%Computing \\quad the \\quad prior \\quad probabilities \\quad P(H_b^l)`

        :math:`(1) for \\quad l \\in y \\quad do`

        :math:`(2) \\quad  P(H_1^l) = (s+ \\sum_{i=1}^m \\vec y_{x_i}(l))/(s * 2+m); P(H_0^l)=1-P(H_1^l)`

        :math:`\\%Computing \\quad the \\quad posterior \\quad probabilities P(E_j^l|H_b^l)`

        :math:`(3) Identify \\quad N(x_i), i \\in {1,2,...,m}`

        :math:`(4) for \\quad l \\in y \\quad do`

        :math:`(5) \\quad for \\quad j \\in{0,1,...,K} \\quad do`

        :math:`(6) \\quad \\quad c[j]=0; c'[j]=0`

        :math:`(7) \\quad for \\quad i \\in{1,...,m} \\quad do`

        :math:`(8) \\quad \\quad \\delta = \\vec C_{x_i}(l)=\\sum_{a \\in N(x_i)} \\vec y_a(l)`

        :math:`(9) \\quad \\quad if (\\vec y_{x_i}(l)==1) \\quad then \\quad c[\\delta]=c[\\delta]+1`

        :math:`(10)\\quad \\quad \\quad \\quad else \\quad c'[\\delta]=c'[\\delta]+1`

        :math:`(11)\\quad for \\quad j \\in{0,1,...,K} \\quad do`

        :math:`(12)\\quad \\quad P(E_j^l|H_1^l)=(s+c[j])/(s * (K+1)+ \\sum_{p=0}^k c[p])`

        :math:`(13)\\quad \\quad P(E_j^l|H_0^l)=(s+c'[j])/(s *(K+1)+ \\sum_{p=0}^k c'[p])`

        :math:`\\%Computing \\quad \\vec y_t \\quad and \\quad \\vec r_t`

        :math:`(14) Identify \\quad N(t)`

        :math:`(15) for \\quad l \\in y \\quad do`

        :math:`(16)\\quad \\vec C_t(l) = \\sum_{a \\in N(t)} \\vec y_a(l)`

        :math:`(17)\\quad \\vec y_t(l) = argmax_{b \\in {0,1}}P(H_b^l)P(E_{\\vec C_t(l)}^l|H_b^l)`

        :math:`(18)\\quad \\vec r_t(l)=P(H_1^l|E_{\\vec C_t(l)}^l)=P(H_1^l)P(E_{\\vec C_t(l)}|H_1^l)/P(E_{\\vec C_t(l)}^l)=P(H_1^l)P(E_{\\vec C_t(l)}|H_1^l)/(\\sum_{b \\in {0,1}}P(H_b^l)P(E_{\\vec C_t(l)}^l|H_b^l))`


    .. attribute:: k

        Number of neighbors. The default value is 1

    .. attribute:: smooth

        Smoothing parameter controlling the strength of uniform prior (Default value is set to 1 which yields the Laplace smoothing).

    .. attribute:: knn

        :class:`Orange.classification.knn.FindNearest` for nearest neighbor search

    """
    def __new__(cls, instances = None, k=1, smooth = 1.0, weight_id = 0, **argkw):
        """
        Constructor of MLkNNLearner.  When `instances` is given, the
        learner is trained immediately and the fitted
        :class:`MLkNNClassifier` is returned instead of the learner.

        :param instances: a table of instances.
        :type instances: :class:`Orange.data.Table`

        :param k: number of nearest neighbors used in classification
        :type k: int

        :param smooth: Smoothing parameter controlling the strength of uniform prior
        (Default value is set to 1 which yields the Laplace smoothing).
        :type smooth: Float

        :rtype: :class:`MLkNNLearner`
        """

        self = _multiknn.MultikNNLearner.__new__(cls, k, **argkw)
        self.smooth = smooth

        # Use 'is not None' rather than a truth test so an empty (length 0)
        # table still triggers training instead of returning the bare learner.
        if instances is not None:
            self.__init__(**argkw)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __call__(self, instances, weight_id = 0, **kwds):
        """Fit ML-kNN on `instances` and return an :class:`MLkNNClassifier`."""
        if not Orange.multilabel.is_multilabel(instances):
            raise TypeError("The given data set is not a multi-label data set"
                            " with class values 0 and 1.")

        self.__dict__.update(kwds)
        self._build_knn(instances)

        # Computing the prior probabilities P(H_b^l)
        prior_prob = self.compute_prior(instances)

        # Computing the posterior probabilities P(E_j^l|H_b^l)
        cond_prob = list(self.compute_cond(instances))

        return MLkNNClassifier(instances = instances,
                               prior_prob = prior_prob,
                               cond_prob = cond_prob,
                               knn = self.knn,
                               k = self.k)

    def compute_prior(self, instances):
        """Compute the smoothed prior probability P(H_1^l) for each label
        of the training set.

        :rtype: list of float, one entry per label (class variable)
        """
        prior_prob = []
        for lvar in instances.domain.class_vars:
            freq = sum(inst[lvar].value == '1' for inst in instances)
            # Laplace-style smoothing: (s + freq) / (2s + m), per the paper.
            prior_prob.append(float(self.smooth + freq) /
                              (self.smooth * 2 + len(instances)))
        return prior_prob

    def compute_cond(self, instances):
        """Compute posterior probabilities for each label of the training set.

        Returns a pair ``(p0, p1)`` where ``p1[l][j]`` estimates
        P(E_j^l|H_1^l) -- the probability that exactly j of the k
        neighbours carry label l given the instance has label l -- and
        ``p0[l][j]`` is the analogue for instances without label l.
        """
        k = self.k

        def _remove_identical(table, inst):
            # Drop `inst` itself from its own neighbourhood (it is expected
            # to be returned by the k+1 query below).  When no instance with
            # identical classes is found, drop the last (farthest) neighbour
            # instead, so exactly k neighbours always remain.
            try:
                i = [inst1.get_classes() == inst.get_classes()
                     for inst1 in table].index(True)
            except ValueError:
                # Only list.index can fail here; a bare except would also
                # hide genuine programming errors.
                i = -1
            del table[i]
            return table

        # Ask for k+1 neighbours because the query instance itself is
        # expected to be among them and is removed above.
        neighbor_lists = [_remove_identical(self.knn(inst, k + 1), inst)
                          for inst in instances]
        p1 = [[0] * (k + 1) for lvar in instances.domain.class_vars]
        p0 = [[0] * (k + 1) for lvar in instances.domain.class_vars]

        for li, lvar in enumerate(instances.domain.class_vars):
            # c[j]:  count of instances WITH label lvar whose neighbourhood
            #        contains j positive neighbours.
            # cn[j]: the same count for instances WITHOUT the label.
            c  = [0] * (k + 1)
            cn = [0] * (k + 1)

            for inst, neighbors in zip(instances, neighbor_lists):
                delta = sum(n[lvar].value == '1' for n in neighbors)

                (c if inst[lvar].value == '1' else cn)[delta] += 1

            for j in range(k + 1):
                p1[li][j] = float(self.smooth + c[j]) / (self.smooth * (k + 1) + sum(c))
                p0[li][j] = float(self.smooth + cn[j]) / (self.smooth * (k + 1) + sum(cn))

        return p0, p1
202 
class MLkNNClassifier(_multiknn.MultikNNClassifier):
    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue):
        """
        Classify `instance` with the fitted ML-kNN model.

        :rtype: a list of :class:`Orange.data.Value`, a list of
            :class:`Orange.statistics.distribution.Distribution`, or a
            tuple with both
        """
        neighbours = self.knn(instance, self.k)

        values = []
        distributions = []

        for index, label_var in enumerate(self.instances.domain.class_vars):
            # Count how many of the k neighbours carry this label.
            count = sum(n[label_var].value == '1' for n in neighbours)

            # MAP rule: compare P(H_1)P(E|H_1) against P(H_0)P(E|H_0).
            pos = self.prior_prob[index] * self.cond_prob[1][index][count]
            neg = (1 - self.prior_prob[index]) * self.cond_prob[0][index][count]

            predicted = '1' if pos >= neg else '0'
            values.append(Orange.data.Value(label_var, predicted))

            # Normalised confidence that the label is present.
            confidence = pos / (pos + neg)
            dist = Orange.statistics.distribution.Discrete([1 - confidence,
                                                            confidence])
            dist.variable = label_var
            distributions.append(dist)

        if result_type == Orange.classification.Classifier.GetValue:
            return values
        if result_type == Orange.classification.Classifier.GetProbabilities:
            return distributions
        return values, distributions
232       
233#########################################################################################
234# Test the code, run from DOS prompt
235# assume the data file is in proper directory
236
237if __name__ == "__main__":
238    data = Orange.data.Table("emotions.tab")
239
240    classifier = Orange.multilabel.MLkNNLearner(data,5,1.0)
241    for i in range(10):
242        c,p = classifier(data[i],Orange.classification.Classifier.GetBoth)
243        print c,p
Note: See TracBrowser for help on using the repository browser.