source: orange/orange/multilabel/mlknn.py @ 9669:165371b04b4a

1"""
2.. index:: ML-kNN Learner
3
4***************************************
5ML-kNN Learner
6***************************************
7
8ML-kNN Classification is a kind of adaptation method for multi-label classification.
9It is an adaptation of the kNN lazy learning algorithm for multi-label data.
10In essence, ML-kNN uses the kNN algorithm independently for each label :math:`l`.
11It finds the k nearest examples to the test instance and considers those that are
12labeled at least with :math:`l` as positive and the rest as negative.
13Actually this method follows the paradigm of Binary Relevance (BR). What mainly
14differentiates this method from BR is the use of prior probabilities. ML-kNN has also
15the capability of producing a ranking of the labels as an output.
16For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy learning
17approach to multi-label learning <http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_.
18Pattern Recogn. 40, 7 (Jul. 2007), 2038-2048. 
19
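Schematically, the per-label decomposition described above amounts to one
binary decision for each label (a sketch; ``labels``, ``neighbours`` and
``has_label`` are hypothetical names)::

    for l in labels:
        # neighbours labeled with l count as positive, the rest as negative
        positives = sum(has_label(n, l) for n in neighbours)
        # ML-kNN combines this count with the prior probability for l
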
.. index:: ML-kNN Learner
.. autoclass:: Orange.multilabel.MLkNNLearner
   :members:
   :show-inheritance:

   :param instances: a table of instances.
   :type instances: :class:`Orange.data.Table`


.. index:: ML-kNN Classifier
.. autoclass:: Orange.multilabel.MLkNNClassifier
   :members:
   :show-inheritance:

Examples
========

The following example demonstrates a straightforward invocation of this
algorithm (:download:`mlc-classify.py <code/mlc-classify.py>`, which uses
:download:`emotions.tab <code/emotions.tab>`):

.. literalinclude:: code/mlc-classify.py
   :lines: 6, 11-13
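
The same invocation written out inline (a sketch; it assumes
:download:`emotions.tab <code/emotions.tab>` is available, as in the script
above)::

    import Orange

    data = Orange.data.Table("emotions.tab")
    classifier = Orange.multilabel.MLkNNLearner(data, k=5, smooth=1.0)
    labels, probs = classifier(data[0],
                               Orange.classification.Classifier.GetBoth)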

"""
import random
import Orange
import multiknn as _multiknn

from lp import transform_to_powerset

class MLkNNLearner(_multiknn.MultikNNLearner):
    """
    Class implementing the ML-kNN (Multi-Label k Nearest Neighbours)
    algorithm. The class is based on the pseudo-code made available by the
    authors.

    The pseudo-code of ML-kNN:

        :math:`[\\vec y_t, \\vec r_t] = \\text{ML-kNN}(T, K, t, s)`

        :math:`\\% \\text{Computing the prior probabilities } P(H_b^l)`

        :math:`(1)\\ \\text{for } l \\in \\mathcal{Y} \\text{ do}`

        :math:`(2)\\quad P(H_1^l) = (s + \\sum_{i=1}^m \\vec y_{x_i}(l)) / (s \\times 2 + m); \\quad P(H_0^l) = 1 - P(H_1^l)`

        :math:`\\% \\text{Computing the posterior probabilities } P(E_j^l|H_b^l)`

        :math:`(3)\\ \\text{Identify } N(x_i), \\ i \\in \\{1, 2, \\ldots, m\\}`

        :math:`(4)\\ \\text{for } l \\in \\mathcal{Y} \\text{ do}`

        :math:`(5)\\quad \\text{for } j \\in \\{0, 1, \\ldots, K\\} \\text{ do}`

        :math:`(6)\\qquad c[j] = 0; \\ c'[j] = 0`

        :math:`(7)\\quad \\text{for } i \\in \\{1, \\ldots, m\\} \\text{ do}`

        :math:`(8)\\qquad \\delta = \\vec C_{x_i}(l) = \\sum_{a \\in N(x_i)} \\vec y_a(l)`

        :math:`(9)\\qquad \\text{if } \\vec y_{x_i}(l) == 1 \\text{ then } c[\\delta] = c[\\delta] + 1`

        :math:`(10)\\qquad \\text{else } c'[\\delta] = c'[\\delta] + 1`

        :math:`(11)\\quad \\text{for } j \\in \\{0, 1, \\ldots, K\\} \\text{ do}`

        :math:`(12)\\qquad P(E_j^l|H_1^l) = (s + c[j]) / (s \\times (K+1) + \\sum_{p=0}^K c[p])`

        :math:`(13)\\qquad P(E_j^l|H_0^l) = (s + c'[j]) / (s \\times (K+1) + \\sum_{p=0}^K c'[p])`

        :math:`\\% \\text{Computing } \\vec y_t \\text{ and } \\vec r_t`

        :math:`(14)\\ \\text{Identify } N(t)`

        :math:`(15)\\ \\text{for } l \\in \\mathcal{Y} \\text{ do}`

        :math:`(16)\\quad \\vec C_t(l) = \\sum_{a \\in N(t)} \\vec y_a(l)`

        :math:`(17)\\quad \\vec y_t(l) = \\arg\\max_{b \\in \\{0,1\\}} P(H_b^l) P(E_{\\vec C_t(l)}^l|H_b^l)`

        :math:`(18)\\quad \\vec r_t(l) = P(H_1^l|E_{\\vec C_t(l)}^l) = P(H_1^l) P(E_{\\vec C_t(l)}^l|H_1^l) / P(E_{\\vec C_t(l)}^l) = P(H_1^l) P(E_{\\vec C_t(l)}^l|H_1^l) / (\\sum_{b \\in \\{0,1\\}} P(H_b^l) P(E_{\\vec C_t(l)}^l|H_b^l))`

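        As a worked instance of steps (17) and (18), take hypothetical values
        :math:`P(H_1^l) = 0.4`, :math:`P(E_3^l|H_1^l) = 0.30` and
        :math:`P(E_3^l|H_0^l) = 0.10` for a test instance with 3 positive
        neighbors for label :math:`l`::

            p1 = 0.4 * 0.30          # P(H_1^l) * P(E_3^l|H_1^l) = 0.12
            p0 = (1 - 0.4) * 0.10    # P(H_0^l) * P(E_3^l|H_0^l) = 0.06
            y = int(p1 >= p0)        # predicted value for l: 1
            r = p1 / (p0 + p1)       # ranking score: 0.12 / 0.18 = 0.667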

    .. attribute:: k

        Number of neighbors. The default value is 1.

    .. attribute:: smooth

        Smoothing parameter controlling the strength of the uniform prior.
        The default value is 1, which yields Laplace smoothing.

    .. attribute:: knn

        :class:`Orange.classification.knn.FindNearest` for nearest-neighbor
        search.

    """
    def __new__(cls, instances=None, k=1, smooth=1.0, weight_id=0, **argkw):
        """
        Constructor of MLkNNLearner

        :param instances: a table of instances.
        :type instances: :class:`Orange.data.Table`

        :param k: number of nearest neighbors used in classification
        :type k: int

        :param smooth: smoothing parameter controlling the strength of the
            uniform prior (the default value of 1 yields Laplace smoothing).
        :type smooth: float

        :rtype: :class:`MLkNNLearner`
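
        Both invocation styles below are equivalent (a sketch; ``data`` is
        assumed to be a multi-label :class:`Orange.data.Table`)::

            classifier = Orange.multilabel.MLkNNLearner(data, k=5)

            learner = Orange.multilabel.MLkNNLearner(k=5)
            classifier = learner(data)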
        """

        self = _multiknn.MultikNNLearner.__new__(cls, k, **argkw)
        self.smooth = smooth

        if instances:
            self.__init__(**argkw)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __call__(self, instances, weight_id=0, **kwds):
        if not Orange.multilabel.is_multilabel(instances):
            raise TypeError("The given data set is not a multi-label data set.")

        self.__dict__.update(kwds)
        self._build_knn(instances)

        # Computing the prior probabilities P(H_b^l)
        prior_prob = self.compute_prior(instances)

        # Computing the posterior probabilities P(E_j^l|H_b^l)
        cond_prob = list(self.compute_cond(instances))

        return MLkNNClassifier(instances=instances,
                               prior_prob=prior_prob,
                               cond_prob=cond_prob,
                               knn=self.knn,
                               k=self.k)

    def compute_prior(self, instances):
        """Compute the smoothed prior probability of each label on the training set."""
        prior_prob = []
        for lvar in instances.domain.class_vars:
            # Count training instances carrying this label and apply the
            # smoothing term s (step (2) of the pseudo-code).
            freq = sum(inst[lvar].value == '1' for inst in instances)
            prior_prob.append(float(self.smooth + freq) / (self.smooth * 2 + len(instances)))
        return prior_prob

    def compute_cond(self, instances):
        """Compute the smoothed posterior probabilities for each label of the training set."""
        k = self.k

        def _remove_identical(table, inst):
            # Remove inst (matched by identical label assignment) from its
            # own neighborhood; if no match is found, drop the most distant
            # neighbor so that exactly k neighbors remain.
            try:
                i = [inst1.get_classes() == inst.get_classes()
                     for inst1 in table].index(True)
            except ValueError:
                i = -1
            del table[i]
            return table

        # For each training instance, find its k nearest neighbors
        # (query k+1 and remove the instance itself).
        neighbor_lists = [_remove_identical(self.knn(inst, k + 1), inst)
                          for inst in instances]
        p1 = [[0] * (k + 1) for lvar in instances.domain.class_vars]
        p0 = [[0] * (k + 1) for lvar in instances.domain.class_vars]

        for li, lvar in enumerate(instances.domain.class_vars):
            # c[j] counts instances with the label whose neighborhoods
            # contain j positive neighbors; cn[j] counts those without it
            # (steps (5)-(10) of the pseudo-code).
            c = [0] * (k + 1)
            cn = [0] * (k + 1)

            for inst, neighbors in zip(instances, neighbor_lists):
                delta = sum(n[lvar].value == '1' for n in neighbors)
                (c if inst[lvar].value == '1' else cn)[delta] += 1

            # Smoothed estimates of P(E_j^l|H_1^l) and P(E_j^l|H_0^l)
            # (steps (12)-(13) of the pseudo-code).
            for j in range(k + 1):
                p1[li][j] = float(self.smooth + c[j]) / (self.smooth * (k + 1) + sum(c))
                p0[li][j] = float(self.smooth + cn[j]) / (self.smooth * (k + 1) + sum(cn))

        return p0, p1

class MLkNNClassifier(_multiknn.MultikNNClassifier):
    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue):
        """
        :rtype: a list of :class:`Orange.data.Value`, a list of
            :class:`Orange.statistics.distribution.Distribution`, or a tuple
            with both, depending on ``result_type``
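
        For example (a sketch; ``classifier`` is a trained
        :class:`MLkNNClassifier` and ``instance`` a data instance)::

            labels = classifier(instance)
            labels, dists = classifier(
                instance, Orange.classification.Classifier.GetBoth)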
        """
        neighbors = self.knn(instance, self.k)

        labels = []
        dists = []

        for li, lvar in enumerate(self.instances.domain.class_vars):
            # Number of neighbors that carry this label (step (16)).
            delta = sum(n[lvar].value == '1' for n in neighbors)

            # MAP decision over b in {0, 1} (step (17)).
            p1 = self.prior_prob[li] * self.cond_prob[1][li][delta]
            p0 = (1 - self.prior_prob[li]) * self.cond_prob[0][li][delta]
            y = (p1 >= p0)
            labels.append(Orange.data.Value(lvar, str(int(y))))

            # Ranking score r_t(l) = P(H_1^l|E_delta^l) (step (18)).
            r = p1 / (p0 + p1)
            dists.append(Orange.statistics.distribution.Discrete([1 - r, r]))

        for d, lvar in zip(dists, self.instances.domain.class_vars):
            d.variable = lvar

        if result_type == Orange.classification.Classifier.GetValue:
            return labels
        if result_type == Orange.classification.Classifier.GetProbabilities:
            return dists
        return labels, dists

#########################################################################################
# Test the code; run from the command prompt.
# Assumes emotions.tab is in the working directory (or on Orange's data path).

if __name__ == "__main__":
    data = Orange.data.Table("emotions.tab")

    classifier = Orange.multilabel.MLkNNLearner(data, 5, 1.0)
    for i in range(10):
        c, p = classifier(data[i], Orange.classification.Classifier.GetBoth)
        print c, p