#
source:
orange/Orange/multilabel/mlknn.py
@
10502:6b593a8cd5a0

Revision 10502:6b593a8cd5a0, 8.8 KB checked in by Matija Polajnar <matija.polajnar@…>, 2 years ago (diff) |
---|

Rev | Line | |
---|---|---|

"""
.. index:: ML-kNN Learner

***************************************
ML-kNN Learner
***************************************

ML-kNN Classification is an adaptation of kNN for multi-label
classification. In essence, ML-kNN uses the kNN algorithm
independently for each label :math:`l`. It finds the k nearest
examples to the test instance and considers those that are labeled at
least with :math:`l` as positive and the rest as negative. What
mainly differentiates this method from other binary relevance (BR)
methods is the use of prior probabilities. ML-kNN can also rank labels.

For more information, see Zhang, M. and Zhou, Z. 2007. `ML-KNN: A lazy
learning approach to multi-label learning
<http://dx.doi.org/10.1016/j.patcog.2006.12.019>`_. Pattern
Recogn. 40, 7 (Jul. 2007), 2038-2048.

.. index:: ML-kNN Learner
.. autoclass:: Orange.multilabel.MLkNNLearner
   :members:
   :show-inheritance:

   :param instances: a table of instances.
   :type instances: :class:`Orange.data.Table`


.. index:: ML-kNN Classifier
.. autoclass:: Orange.multilabel.MLkNNClassifier
   :members:
   :show-inheritance:

Examples
========

The following example demonstrates a straightforward invocation of
this algorithm (:download:`mlc-classify.py <code/mlc-classify.py>`):

.. literalinclude:: code/mlc-classify.py
   :lines: 6, 11-13

"""

[9463] | 45 | import random |

[9461] | 46 | import Orange |

[9470] | 47 | import multiknn as _multiknn |

[9461] | 48 | |

[9500] | 49 | from lp import transform_to_powerset |

50 | ||

class MLkNNLearner(_multiknn.MultikNNLearner):
    """
    Class implementing the ML-kNN (Multi-Label k Nearest Neighbours)
    algorithm. The class is based on the pseudo-code made available by
    the authors.

    The pseudo code of ML-kNN:

    :math:`[\\vec y_t,\\vec r_t] = ML-kNN(T,K,t,s)`

    :math:`\%Computing \\quad the \\quad prior \\quad probabilities \\quad P(H_b^l)`

    :math:`(1) for \\quad l \\in y \\quad do`

    :math:`(2) \\quad P(H_1^l) = (s+ \\sum_{i=1}^m \\vec y_{x_i}(l))/(s * 2+m); P(H_0^l)=1-P(H_1^l)`

    :math:`\%Computing \\quad the \\quad posterior \\quad probabilities P(E_j^l|H_b^l)`

    :math:`(3) Identify \\quad N(x_i), i \\in {1,2,...,m}`

    :math:`(4) for \\quad l \\in y \\quad do`

    :math:`(5) \\quad for \\quad j \\in{0,1,...,K} \\quad do`

    :math:`(6) \\quad \\quad c[j]=0; c'[j]=0`

    :math:`(7) \\quad for \\quad i \\in{1,...,m} \\quad do`

    :math:`(8) \\quad \\quad \\delta = \\vec C_{x_i}(l)=\\sum_{a \\in N(x_i)} \\vec y_a(l)`

    :math:`(9) \\quad \\quad if (\\vec y_{x_i}(l)==1) \\quad then \\quad c[\\delta]=c[\\delta]+1`

    :math:`(10)\\quad \\quad \\quad \\quad else \\quad c'[\\delta]=c'[\\delta]+1`

    :math:`(11)\\quad for \\quad j \\in{0,1,...,K} \\quad do`

    :math:`(12)\\quad \\quad P(E_j^l|H_1^l)=(s+c[j])/(s * (K+1)+ \\sum_{p=0}^k c[p])`

    :math:`(13)\\quad \\quad P(E_j^l|H_0^l)=(s+c'[j])/(s *(K+1)+ \\sum_{p=0}^k c'[p])`

    :math:`\%Computing \\quad \\vec y_t \\quad and \\quad \\vec r_t`

    :math:`(14) Identify \\quad N(t)`

    :math:`(15) for \\quad l \\in y \\quad do`

    :math:`(16)\\quad \\vec C_t(l) = \\sum_{a \\in N(t)} \\vec y_a(L)`

    :math:`(17)\\quad \\vec y_t(l) = argmax_{b \\in {0,1}}P(H_b^l)P(E_{\\vec C_t(l)}^l|H_b^l)`

    :math:`(18)\\quad \\vec r_t(l)=P(H_1^l|E_{\\vec C_t(l)}^l)=P(H_1^l)P(E_{\\vec C_t(l)}|H_1^l)/P(E_{\\vec C_t(l)}^l)=P(H_1^l)P(E_{\\vec C_t(l)}|H_1^l)/(\\sum_{b \\in {0,1}}P(H_b^l)P(E_{\\vec C_t(l)}^l|H_b^l))`


    .. attribute:: k

        Number of neighbors. The default value is 1

    .. attribute:: smooth

        Smoothing parameter controlling the strength of uniform prior
        (Default value is set to 1 which yields the Laplace smoothing).

    .. attribute:: knn

        :class:`Orange.classification.knn.FindNearest` for nearest neighbor search

    """
    def __new__(cls, instances = None, k=1, smooth = 1.0, weight_id = 0, **argkw):
        """
        Constructor of MLkNNLearner

        :param instances: a table of instances.
        :type instances: :class:`Orange.data.Table`

        :param k: number of nearest neighbors used in classification
        :type k: int

        :param smooth: Smoothing parameter controlling the strength of uniform prior
            (Default value is set to 1 which yields the Laplace smoothing).
        :type smooth: Float

        :rtype: :class:`MLkNNLearner`
        """
        self = _multiknn.MultikNNLearner.__new__(cls, k, **argkw)
        self.smooth = smooth

        # Orange learner convention: constructing a learner with data
        # immediately trains it and returns the classifier instead.
        if instances:
            self.__init__(**argkw)
            return self.__call__(instances, weight_id)
        else:
            return self

    def __call__(self, instances, weight_id = 0, **kwds):
        """Train on instances and return an :class:`MLkNNClassifier`."""
        if not Orange.multilabel.is_multilabel(instances):
            raise TypeError("The given data set is not a multi-label data set"
                            " with class values 0 and 1.")

        self.__dict__.update(kwds)
        self._build_knn(instances)

        # Computing the prior probabilities P(H_b^l)
        prior_prob = self.compute_prior(instances)

        # Computing the posterior probabilities P(E_j^l|H_b^l)
        cond_prob = list(self.compute_cond(instances))

        return MLkNNClassifier(instances = instances,
                               prior_prob = prior_prob,
                               cond_prob = cond_prob,
                               knn = self.knn,
                               k = self.k)

    def compute_prior(self, instances):
        """ Compute prior probability for each label of the training set. """
        prior_prob = []
        for lvar in instances.domain.class_vars:
            # Smoothed fraction of training instances carrying this label.
            freq = sum(inst[lvar].value == '1' for inst in instances)
            prior_prob.append( float(self.smooth + freq) / (self.smooth * 2 + len(instances)) )
        return prior_prob

    def compute_cond(self, instances):
        """ Compute posterior probabilities for each label of the training set. """
        k = self.k

        def _remove_identical(table, inst):
            # Drop one neighbor from table: the first whose label vector
            # equals inst's (presumably the query instance itself).  If no
            # such neighbor exists, i stays -1 and the last (farthest)
            # neighbor is removed instead, so exactly k neighbors remain.
            try:
                i = [inst1.get_classes() == inst.get_classes() for inst1 in table].index(1)
            except ValueError:
                # list.index raises ValueError when no element matches;
                # a bare except here would also swallow unrelated errors.
                i = -1
            del table[i]
            return table

        # Query k+1 neighbors so one can be removed by _remove_identical.
        neighbor_lists = [_remove_identical(self.knn(inst, k+1), inst) for inst in instances]
        p1 = [[0]*(k+1) for lvar in instances.domain.class_vars]
        p0 = [[0]*(k+1) for lvar in instances.domain.class_vars]

        for li, lvar in enumerate(instances.domain.class_vars):
            # c[j]:  number of instances WITH label l having j positive neighbors
            # cn[j]: number of instances WITHOUT label l having j positive neighbors
            c = [0] * (k + 1)
            cn = [0] * (k + 1)

            for inst, neighbors in zip(instances, neighbor_lists):
                delta = sum(n[lvar].value=='1' for n in neighbors)

                (c if inst[lvar].value == '1' else cn)[delta] += 1

            # The normalization sums are loop-invariant; hoist them out of
            # the j-loop instead of recomputing them k+1 times each.
            c_total = sum(c)
            cn_total = sum(cn)
            for j in range(k+1):
                p1[li][j] = float(self.smooth + c[j]) / (self.smooth * (k+1) + c_total)
                p0[li][j] = float(self.smooth + cn[j]) / (self.smooth * (k+1) + cn_total)

        return p0, p1

[9462] | 202 | |

class MLkNNClassifier(_multiknn.MultikNNClassifier):
    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue):
        """
        Predict every label of instance by the MAP rule of ML-kNN, using
        the stored prior and posterior probability tables.

        :rtype: a list of :class:`Orange.data.Value`, a list of :class:`Orange.statistics.distribution.Distribution`, or a tuple with both
        """
        nearest = self.knn(instance, self.k)

        labels = []
        dists = []

        for idx, lvar in enumerate(self.instances.domain.class_vars):
            # How many of the k nearest neighbors carry this label.
            positive = sum(1 for n in nearest if n[lvar].value == '1')

            # Unnormalized posteriors for "has label" / "lacks label".
            evidence_pos = self.prior_prob[idx] * self.cond_prob[1][idx][positive]
            evidence_neg = (1 - self.prior_prob[idx]) * self.cond_prob[0][idx][positive]

            # MAP decision; ties go to the positive class.
            predicted = 1 if evidence_pos >= evidence_neg else 0
            labels.append(Orange.data.Value(lvar, str(predicted)))

            # Normalized confidence that the label is present.
            confidence = evidence_pos / (evidence_neg + evidence_pos)
            dist = Orange.statistics.distribution.Discrete([1 - confidence, confidence])
            dist.variable = lvar
            dists.append(dist)

        if result_type == Orange.classification.Classifier.GetValue:
            return labels
        elif result_type == Orange.classification.Classifier.GetProbabilities:
            return dists
        return labels, dists

[9475] | 232 | |

233 | ######################################################################################### | |

[9477] | 234 | # Test the code, run from DOS prompt |

235 | # assume the data file is in proper directory | |

236 | ||

[9475] | 237 | if __name__ == "__main__": |

238 | data = Orange.data.Table("emotions.tab") | |

239 | ||

240 | classifier = Orange.multilabel.MLkNNLearner(data,5,1.0) | |

241 | for i in range(10): | |

242 | c,p = classifier(data[i],Orange.classification.Classifier.GetBoth) | |

[10417] | 243 | print c,p |

**Note:**See TracBrowser for help on using the repository browser.