#
source:
orange/orange/Orange/distance/instances.py
@
9125:e9b8067e686f

Revision 9125:e9b8067e686f, 15.7 KB checked in by mocnik <mocnik@…>, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | |

3 | ########################### |

4 | Distances between Instances |

5 | ########################### |

6 | |

7 | This page describes a bunch of classes for different metrics for measure |

8 | distances (dissimilarities) between instances. |

9 | |

10 | Typical (although not all) measures of distance between instances require |

11 | some "learning" - adjusting the measure to the data. For instance, when |

12 | the dataset contains continuous features, the distances between continuous |

13 | values should be normalized, e.g. by dividing the distance with the range |

14 | of possible values or with some interquartile distance to ensure that all |

15 | features have, in principle, similar impacts. |

16 | |

17 | Different measures of distance thus appear in pairs - a class that measures |

18 | the distance and a class that constructs it based on the data. The abstract |

19 | classes representing such a pair are `ExamplesDistance` and |

20 | `ExamplesDistanceConstructor`. |

21 | |

22 | Since most measures work on normalized distances between corresponding |

23 | features, there is an abstract intermediate class |

24 | `ExamplesDistance_Normalized` that takes care of normalizing. |

25 | The remaining classes correspond to different ways of defining the distances, |

26 | such as Manhattan or Euclidean distance. |

27 | |

28 | Unknown values are treated correctly only by Euclidean and Relief distance. |

29 | For other measure of distance, a distance between unknown and known or between |

30 | two unknown values is always 0.5. |

31 | |

32 | .. class:: ExamplesDistance |

33 | |

34 | .. method:: __call__(instance1, instance2) |

35 | |

36 | Returns a distance between the given instances as floating point number. |

37 | |

38 | .. class:: ExamplesDistanceConstructor |

39 | |

40 | .. method:: __call__([instances, weightID][, distributions][, basic_var_stat]) |

41 | |

42 | Constructs an instance of ExamplesDistance. |

43 | Not all the data needs to be given. Most measures can be constructed |

44 | from basic_var_stat; if it is not given, they can help themselves |

45 | either by instances or distributions. |

46 | Some (e.g. ExamplesDistance_Hamming) even do not need any arguments. |

47 | |

48 | .. class:: ExamplesDistance_Normalized |

49 | |

50 | This abstract class provides a function which is given two instances |

51 | and returns a list of normalized distances between values of their |

52 | features. Many distance measuring classes need such a function and are |

53 | therefore derived from this class |

54 | |

55 | .. attribute:: normalizers |

56 | |

57 | A precomputed list of normalizing factors for feature values |

58 | |

59 | - If a factor positive, differences in feature's values |

60 | are multiplied by it; for continuous features the factor |

61 | would be 1/(max_value-min_value) and for ordinal features |

62 | the factor is 1/number_of_values. If either (or both) of |

63 | features are unknown, the distance is 0.5 |

64 | - If a factor is -1, the feature is nominal; the distance |

65 | between two values is 0 if they are same (or at least |

66 | one is unknown) and 1 if they are different. |

67 | - If a factor is 0, the feature is ignored. |

68 | |

69 | .. attribute:: bases, averages, variances |

70 | |

71 | The minimal values, averages and variances |

72 | (continuous features only) |

73 | |

74 | .. attribute:: domainVersion |

75 | |

76 | Stores a domain version for which the normalizers were computed. |

77 | The domain version is increased each time a domain description is |

78 | changed (i.e. features are added or removed); this is used for a quick |

79 | check that the user is not attempting to measure distances between |

80 | instances that do not correspond to normalizers. |

81 | Since domains are practicably immutable (especially from Python), |

82 | you don't need to care about this anyway. |

83 | |

84 | .. method:: attributeDistances(instance1, instance2) |

85 | |

86 | Returns a list of floats representing distances between pairs of |

87 | feature values of the two instances. |

88 | |

89 | |

90 | .. class:: Hamming, HammingConstructor |

91 | |

92 | Hamming distance between two instances is defined as the number of |

93 | features in which the two instances differ. Note that this measure |

94 | is not really appropriate for instances that contain continuous features. |

95 | |

96 | |

97 | .. class:: Maximal, MaximalConstructor |

98 | |

99 | The maximal between two instances is defined as the maximal distance |

100 | between two feature values. If dist is the result of |

101 | ExamplesDistance_Normalized.attributeDistances, |

102 | then Maximal returns max(dist). |

103 | |

104 | |

105 | .. class:: Manhattan, ManhattanConstructor |

106 | |

107 | Manhattan distance between two instances is a sum of absolute values |

108 | of distances between pairs of features, e.g. ``apply(add, [abs(x) for x in dist])`` |

109 | where dist is the result of ExamplesDistance_Normalized.attributeDistances. |

110 | |

111 | .. class:: Euclidean, EuclideanConstructor |

112 | |

113 | |

114 | Euclidean distance is a square root of sum of squared per-feature distances, |

115 | i.e. ``sqrt(apply(add, [x*x for x in dist]))``, where dist is the result of |

116 | ExamplesDistance_Normalized.attributeDistances. |

117 | |

118 | .. method:: distributions |

119 | |

120 | An object of type |

121 | :obj:`Orange.statistics.distribution.Distribution` that holds |

122 | the distributions for all discrete features used for |

123 | computation of distances between known and unknown values. |

124 | |

125 | .. method:: bothSpecialDist |

126 | |

127 | A list containing the distance between two unknown values for each |

128 | discrete feature. |

129 | |

130 | This measure of distance deals with unknown values by computing the |

131 | expected square of distance based on the distribution obtained from the |

132 | "training" data. Squared distance between |

133 | |

134 | - A known and unknown continuous attribute equals squared distance |

135 | between the known and the average, plus variance |

136 | - Two unknown continuous attributes equals double variance |

137 | - A known and unknown discrete attribute equals the probabilit |

138 | that the unknown attribute has different value than the known |

139 | (i.e., 1 - probability of the known value) |

140 | - Two unknown discrete attributes equals the probability that two |

141 | random chosen values are equal, which can be computed as |

142 | 1 - sum of squares of probabilities. |

143 | |

144 | Continuous cases can be handled by averages and variances inherited from |

145 | ExamplesDistance_normalized. The data for discrete cases are stored in |

146 | distributions (used for unknown vs. known value) and in bothSpecial |

147 | (the precomputed distance between two unknown values). |

148 | |

149 | .. class:: Relief, ReliefConstructor |

150 | |

151 | Relief is similar to Manhattan distance, but incorporates a more |

152 | correct treatment of undefined values, which is used by ReliefF measure. |

153 | |

154 | This class is derived directly from ExamplesDistance, not from ExamplesDistance_Normalized. |

155 | |

156 | |

157 | .. autoclass:: PearsonR |

158 | :members: |

159 | |

160 | .. autoclass:: SpearmanR |

161 | :members: |

162 | |

163 | .. autoclass:: PearsonRConstructor |

164 | :members: |

165 | |

166 | .. autoclass:: SpearmanRConstructor |

167 | :members: |

168 | |

169 | |

170 | """ |

171 | |

172 | import Orange |

173 | |

174 | from Orange.core import \ |

175 | AlignmentList, \ |

176 | DistanceMap, \ |

177 | DistanceMapConstructor, \ |

178 | ExampleDistConstructor, \ |

179 | ExampleDistBySorting, \ |

180 | ExampleDistVector, \ |

181 | ExamplesDistance, \ |

182 | ExamplesDistance_Normalized, \ |

183 | ExamplesDistanceConstructor |

184 | |

185 | from Orange.core import ExamplesDistance_Hamming as Hamming |

186 | from Orange.core import ExamplesDistance_DTW as DTW |

187 | from Orange.core import ExamplesDistance_Euclidean as Euclidean |

188 | from Orange.core import ExamplesDistance_Manhattan as Manhattan |

189 | from Orange.core import ExamplesDistance_Maximal as Maximal |

190 | from Orange.core import ExamplesDistance_Relief as Relief |

191 | |

192 | from Orange.core import ExamplesDistanceConstructor_DTW as DTWConstructor |

193 | from Orange.core import ExamplesDistanceConstructor_Euclidean as EuclideanConstructor |

194 | from Orange.core import ExamplesDistanceConstructor_Hamming as HammingConstructor |

195 | from Orange.core import ExamplesDistanceConstructor_Manhattan as ManhattanConstructor |

196 | from Orange.core import ExamplesDistanceConstructor_Maximal as MaximalConstructor |

197 | from Orange.core import ExamplesDistanceConstructor_Relief as ReliefConstructor |

198 | |

199 | import statc |

200 | import numpy |

201 | from numpy import linalg |

202 | |

203 | class PearsonRConstructor(ExamplesDistanceConstructor): |

204 | """Constructs an instance of PearsonR. Not all the data needs to be given.""" |

205 | |

206 | def __new__(cls, data=None, **argkw): |

207 | self = ExamplesDistanceConstructor.__new__(cls, **argkw) |

208 | self.__dict__.update(argkw) |

209 | if data: |

210 | return self.__call__(data) |

211 | else: |

212 | return self |

213 | |

214 | def __call__(self, table): |

215 | indxs = [i for i, a in enumerate(table.domain.attributes) \ |

216 | if a.varType==Orange.data.Type.Continuous] |

217 | return PearsonR(domain=table.domain, indxs=indxs) |

218 | |

219 | class PearsonR(ExamplesDistance): |

220 | """ |

221 | `Pearson correlation coefficient |

222 | <http://en.wikipedia.org/wiki/Pearson_product-moment\ |

223 | _correlation_coefficient>`_ |

224 | """ |

225 | |

226 | def __init__(self, **argkw): |

227 | self.__dict__.update(argkw) |

228 | |

229 | def __call__(self, e1, e2): |

230 | """ |

231 | :param e1: data instances. |

232 | :param e2: data instances. |

233 | |

234 | Returns Pearson's disimilarity between e1 and e2, |

235 | i.e. (1-r)/2 where r is Sprearman's rank coefficient. |

236 | """ |

237 | X1 = [] |

238 | X2 = [] |

239 | for i in self.indxs: |

240 | if not(e1[i].isSpecial() or e2[i].isSpecial()): |

241 | X1.append(float(e1[i])) |

242 | X2.append(float(e2[i])) |

243 | if not X1: |

244 | return 1.0 |

245 | try: |

246 | return (1.0 - statc.pearsonr(X1, X2)[0]) / 2. |

247 | except: |

248 | return 1.0 |

249 | |

250 | class SpearmanRConstructor(ExamplesDistanceConstructor): |

251 | """Constructs an instance of SpearmanR. Not all the data needs to be given.""" |

252 | |

253 | def __new__(cls, data=None, **argkw): |

254 | self = ExamplesDistanceConstructor.__new__(cls, **argkw) |

255 | self.__dict__.update(argkw) |

256 | if data: |

257 | return self.__call__(data) |

258 | else: |

259 | return self |

260 | |

261 | def __call__(self, table): |

262 | indxs = [i for i, a in enumerate(table.domain.attributes) \ |

263 | if a.varType==Orange.data.Type.Continuous] |

264 | return SpearmanR(domain=table.domain, indxs=indxs) |

265 | |

266 | class SpearmanR(ExamplesDistance): |

267 | |

268 | """`Spearman's rank correlation coefficient |

269 | <http://en.wikipedia.org/wiki/Spearman%27s_rank_\ |

270 | correlation_coefficient>`_""" |

271 | |

272 | def __init__(self, **argkw): |

273 | self.__dict__.update(argkw) |

274 | |

275 | def __call__(self, e1, e2): |

276 | """ |

277 | :param e1: data instances. |

278 | :param e2: data instances. |

279 | |

280 | Returns Sprearman's disimilarity between e1 and e2, |

281 | i.e. (1-r)/2 where r is Sprearman's rank coefficient. |

282 | """ |

283 | X1 = []; X2 = [] |

284 | for i in self.indxs: |

285 | if not(e1[i].isSpecial() or e2[i].isSpecial()): |

286 | X1.append(float(e1[i])) |

287 | X2.append(float(e2[i])) |

288 | if not X1: |

289 | return 1.0 |

290 | try: |

291 | return (1.0 - statc.spearmanr(X1, X2)[0]) / 2. |

292 | except: |

293 | return 1.0 |

294 | |

295 | class MahalanobisConstructor(ExamplesDistanceConstructor): |

296 | """ Construct instance of Mahalanobis. """ |

297 | |

298 | def __new__(cls, data=None, **argkw): |

299 | self = ExamplesDistanceConstructor.__new__(cls, **argkw) |

300 | self.__dict__.update(argkw) |

301 | if data: |

302 | return self.__call__(data) |

303 | else: |

304 | return self |

305 | |

306 | # Check attributtes a, b, c |

307 | def __call__(self, table, a=None, b=None, c=None, **argkw): |

308 | # Process data |

309 | dc = Orange.core.DomainContinuizer() |

310 | dc.classTreatment = Orange.core.DomainContinuizer.Ignore |

311 | dc.continuousTreatment = Orange.core.DomainContinuizer.NormalizeBySpan |

312 | dc.multinomialTreatment = Orange.core.DomainContinuizer.NValues |

313 | |

314 | newdomain = dc(table) |

315 | newtable = table.translate(newdomain) |

316 | |

317 | data, cls, _ = newtable.to_numpy() |

318 | |

319 | covariance_matrix = numpy.cov(data, rowvar=0, bias=1) |

320 | inverse_covariance_matrix = linalg.pinv(covariance_matrix, rcond=1e-10) |

321 | |

322 | return Mahalanobis(domain=newdomain, icm=inverse_covariance_matrix) |

323 | |

324 | class Mahalanobis(ExamplesDistance): |

325 | """`Mahalanobis distance |

326 | <http://en.wikipedia.org/wiki/Mahalanobis_distance>`_""" |

327 | |

328 | def __init__(self, domain, icm, **argkw): |

329 | self.domain = domain |

330 | self.icm = icm |

331 | self.__dict__.update(argkw) |

332 | |

333 | def __call__(self, e1, e2): |

334 | """ |

335 | :param e1: data instances. |

336 | :param e2: data instances. |

337 | |

338 | Returns Mahalanobis distance between e1 and e2. |

339 | """ |

340 | e1 = Orange.data.Instance(self.domain, e1) |

341 | e2 = Orange.data.Instance(self.domain, e2) |

342 | |

343 | diff = [] |

344 | for i in range(len(self.domain.attributes)): |

345 | diff.append(e1[i].value - e2[i].value) if not(e1[i].isSpecial() or e2[i].isSpecial()) else 0.0 |

346 | diff = numpy.asmatrix(diff) |

347 | res = diff * self.icm * diff.transpose() |

348 | return res[0,0]**0.5 |

349 | |

350 | |

351 | class PearsonRAbsoluteConstructor(PearsonRConstructor): |

352 | """ Construct an instance of PearsonRAbsolute example distance estimator. |

353 | """ |

354 | def __call__(self, data): |

355 | indxs = [i for i, a in enumerate(data.domain.attributes) \ |

356 | if a.varType==Orange.data.Type.Continuous] |

357 | return PearsonRAbsolute(domain=data.domain, indxs=indxs) |

358 | |

359 | |

360 | class PearsonRAbsolute(PearsonR): |

361 | """ An example distance estimator using absolute value of Pearson |

362 | correlation coefficient. |

363 | """ |

364 | def __call__(self, e1, e2): |

365 | """ |

366 | Return absolute Pearson's dissimilarity between e1 and e2, |

367 | i.e. |

368 | |

369 | .. math:: (1 - abs(r))/2 |

370 | |

371 | where r is Pearson's correlation coefficient. |

372 | """ |

373 | X1 = []; X2 = [] |

374 | for i in self.indxs: |

375 | if not(e1[i].isSpecial() or e2[i].isSpecial()): |

376 | X1.append(float(e1[i])) |

377 | X2.append(float(e2[i])) |

378 | if not X1: |

379 | return 1.0 |

380 | try: |

381 | return (1.0 - abs(statc.pearsonr(X1, X2)[0])) |

382 | except: |

383 | return 1.0 |

384 | |

385 | |

386 | class SpearmanRAbsoluteConstructor(SpearmanRConstructor): |

387 | """ Construct an instance of SpearmanRAbsolute example distance estimator. |

388 | """ |

389 | def __call__(self, data): |

390 | indxs = [i for i, a in enumerate(data.domain.attributes) \ |

391 | if a.varType==Orange.data.Type.Continuous] |

392 | return SpearmanRAbsolute(domain=data.domain, indxs=indxs) |

393 | |

394 | |

395 | class SpearmanRAbsolute(SpearmanR): |

396 | def __call__(self, e1, e2): |

397 | """ |

398 | Return absolute Spearman's dissimilarity between e1 and e2, |

399 | i.e. |

400 | |

401 | .. math:: (1 - abs(r))/2 |

402 | |

403 | where r is Spearman's correlation coefficient. |

404 | """ |

405 | X1 = []; X2 = [] |

406 | for i in self.indxs: |

407 | if not(e1[i].isSpecial() or e2[i].isSpecial()): |

408 | X1.append(float(e1[i])) |

409 | X2.append(float(e2[i])) |

410 | if not X1: |

411 | return 1.0 |

412 | try: |

413 | return (1.0 - abs(statc.spearmanr(X1, X2)[0])) |

414 | except: |

415 | return 1.0 |

416 | |

417 | |

418 | def distance_matrix(data, distance_constructor, progress_callback=None): |

419 | """ A helper function that computes an obj:`Orange.core.SymMatrix` of all |

420 | pairwise distances between instances in `data`. |

421 | |

422 | :param data: A data table |

423 | :type data: :obj:`Orange.data.Table` |

424 | |

425 | :param distance_constructor: An ExamplesDistance_Constructor instance. |

426 | :type distance_constructor: :obj:`Orange.distances.ExampleDistConstructor` |

427 | |

428 | """ |

429 | from Orange.misc import progressBarMilestones as progress_milestones |

430 | matrix = Orange.core.SymMatrix(len(data)) |

431 | dist = distance_constructor(data) |

432 | |

433 | msize = len(data)*(len(data) - 1)/2 |

434 | milestones = progress_milestones(msize, 100) |

435 | count = 0 |

436 | for i in range(len(data)): |

437 | for j in range(i + 1, len(data)): |

438 | matrix[i, j] = dist(data[i], data[j]) |

439 | |

440 | if progress_callback and count in milestones: |

441 | progress_callback(100.0 * count / msize) |

442 | count += 1 |

443 | |

444 | return matrix |

**Note:**See TracBrowser for help on using the repository browser.