#
source:
orange/Orange/projection/linear.py
@
11459:fc07a5c346be

Revision 11459:fc07a5c346be, 72.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff) |
---|

Rev | Line | |
---|---|---|

[10475] | 1 | #TODO: eliminate create_pls_projection (transform into a class) |

2 | #TODO: Projector as a preprocessor | |

[8042] | 3 | |

4 | import Orange | |

[9725] | 5 | from Orange import orangeom |

[8042] | 6 | import math |

7 | import random | |

8 | import numpy | |

9 | ||

[10611] | 10 | from Orange import classification, data, feature |

11 | from Orange.classification import knn | |

12 | ||

[10542] | 13 | from Orange.data.preprocess.scaling import ScaleLinProjData |

[9880] | 14 | from Orange.orng import orngVisFuncts as visfuncts |

[10611] | 15 | from Orange.utils import deprecated_keywords, deprecated_members |

[8042] | 16 | |

17 | try: | |

18 | import numpy.ma as MA | |

[10611] | 19 | except ImportError: |

[8042] | 20 | import numpy.core.ma as MA |

21 | ||

[10611] | 22 | class enum(int): |

23 | def set_name(self, name): | |

24 | self.name = name | |

25 | return self | |

26 | ||

27 | def __repr__(self): | |

28 | return getattr(self, "name", str(self)) | |

29 | ||

30 | ||

#implementation
# Which optimizer optimize_separation() dispatches to.
FAST_IMPLEMENTATION = 0
SLOW_IMPLEMENTATION = 1
LDA_IMPLEMENTATION = 2

# Force laws for the FreeViz optimization (see FreeViz.law for semantics).
# LAW_LINEAR carries a symbolic name for readable repr(); the others are
# plain ints — presumably kept that way for backward compatibility.
LAW_LINEAR = enum(0).set_name("LAW_LINEAR")
LAW_SQUARE = 1
LAW_GAUSSIAN = 2
LAW_KNN = 3
LAW_LINEAR_PLUS = 4

# Dimensionality-reduction method selectors (PCA / supervised PCA / PLS).
DR_PCA = 0
DR_SPCA = 1
DR_PLS = 2

45 | ||

def normalize(x):
    """Return *x* scaled to unit Euclidean length.

    Note: a zero vector yields a division by zero, matching numpy's
    usual warning/nan semantics.
    """
    length = numpy.linalg.norm(x)
    return x / length

48 | ||

[10611] | 49 | |

def center(matrix):
    """Standardize all variables: subtract each column's mean and divide
    by its standard deviation (population std, ddof=0)."""
    rows, cols = numpy.shape(matrix)
    # Tile the column means to the full matrix shape, then standardize.
    col_means = numpy.multiply(matrix.mean(axis=0), numpy.ones((rows, cols)))
    col_stds = numpy.std(matrix, axis=0)
    return (matrix - col_means) / col_stds

[8042] | 57 | |

58 | ||

class FreeViz:
    """
    Contains an easy-to-use interface to the core of the method, which is
    written in C++. Differs from other linear projection optimizers in that it itself can store the data
    to make iterative optimization and visualization possible. It can, however, still be used as any other
    projection optimizer by calling (:obj:`~Orange.projection.linear.FreeViz.__call__`) it.
    """

    #: Coefficient for the attractive forces. By increasing or decreasing the ratio
    #: between :obj:`attract_g` and :obj:`repel_g`, you can make one kind of the
    #: forces stronger.
    attract_g = 1.

    #: Coefficient for the repulsive forces. By increasing or decreasing the ratio
    #: between :obj:`attract_g` and :obj:`repel_g`, you can make one kind of the
    #: forces stronger.
    repel_g = 1.

    #: If set, the forces are balanced so that the total sum of
    #: the attractive equals the total of repulsive, before they are multiplied by
    #: the above factors. (By our experience, this gives bad results so you may
    #: want to leave this alone.)
    force_balancing = False

    #: Can be LAW_LINEAR, LAW_SQUARE, LAW_GAUSSIAN, LAW_KNN or LAW_LINEAR_PLUS.
    #: Default is LAW_LINEAR, which means that the attractive forces increase
    #: linearly by the distance and the repulsive forces are inversely
    #: proportional to the distance. LAW_SQUARE would make them rise or fall with
    #: the square of the distance, LAW_GAUSSIAN is based on a kind of
    #: log-likelihood estimation, LAW_KNN tries to directly optimize the
    #: classification accuracy of the kNN classifier in the projection space, and
    #: in LAW_LINEAR_PLUS both forces rise with the square of the distance,
    #: yielding a method that is somewhat similar to PCA. We found the first law
    #: perform the best, with the second to not far behind.
    law = LAW_LINEAR

    #: The sigma to be used in LAW_GAUSSIAN and LAW_KNN.
    force_sigma = 1.

    #: If enabled, it keeps the projection of the second attribute on the upper
    #: side of the graph (the first is always on the right-hand x-axis). This is
    #: useful when comparing whether two projections are the same, but has no
    #: effect on the projection's clarity or its classification accuracy.

    #: There are some more, undescribed, methods of a more internal nature.
    mirror_symmetry = True

    # Optimizer selection — see optimize_separation() for the dispatch.
    implementation = FAST_IMPLEMENTATION
    # 0/False: free anchors; 1: anchors on unit circle; 2: fixed angles
    # (inferred from the restrain-indexed dispatch in random_anchors and
    # optimize_fast_separation — TODO confirm against the C++ optimizers).
    restrain = False
    use_generalized_eigenvectors = True

    # s2n heuristics parameters
    steps_before_update = 10
    s2n_spread = 5
    s2n_place_attributes = 50
    s2n_mix_data = None
    auto_set_parameters = 1
    class_permutation_list = None
    # Candidate attribute-subset sizes, presumably for the s2n heuristic.
    attrs_num = (5, 10, 20, 30, 50, 70, 100, 150, 200, 300, 500, 750, 1000)

    # External stop flag — set from outside (e.g. a Canvas stop button)
    # to abort a running optimization loop.
    cancel_optimization = False

120 | ||

121 | ||

122 | def __init__(self, graph=None): | |

[8042] | 123 | if not graph: |

[9707] | 124 | graph = ScaleLinProjData() |

[8042] | 125 | self.graph = graph |

126 | ||

    def __call__(self, dataset=None):
        """
        Perform FreeViz optimization on the dataset, if given, and return a resulting
        linear :class:`~Orange.projection.linear.Projector`. If no dataset is given,
        the projection currently stored within the FreeViz object is returned as
        a :class:`~Orange.projection.linear.Projector`.

        :param dataset: input data set.
        :type dataset: :class:`Orange.data.Table`

        :rtype: :class:`~Orange.projection.linear.Projector`
        """
        if dataset:
            # Load the data into the graph, start from the radial layout and
            # run the separation optimization before reading the anchors back.
            self.graph.setData(dataset)
            self.show_all_attributes()

            self.radial_anchors()
            self.optimize_separation()

            # Column means / stds of the attribute matrix, used for centering
            # and scaling inside the returned Projector.
            X = dataset.to_numpy_MA("a")[0]
            Xm = numpy.mean(X, axis=0)
            Xd = X - Xm
            stdev = numpy.std(Xd, axis=0)
        else:
            # No data: export the stored projection with a zero center
            # and no scaling.
            Xm = numpy.zeros(len(self.graph.anchor_data))
            stdev = None

        graph = self.graph

        # Projection matrix: one (x, y) column per anchored attribute.
        U = numpy.array([val[:2] for val in self.graph.anchor_data]).T

        domain = graph.data_domain
        # If only a subset of attributes is anchored, narrow the domain
        # to exactly those attributes (anchor_data triples are (x, y, name)).
        if len(domain) > len(self.graph.anchor_data):
            domain = data.Domain([graph.data_domain[a]
                                  for _, _, a in self.graph.anchor_data],
                                 graph.data_domain.class_var)

        return Projector(input_domain=domain,
                         center=Xm,
                         scale=stdev,
                         standardize=False,
                         projection=U)

[10475] | 169 | |

170 | ||

[8042] | 171 | def clear_data(self): |

172 | self.s2n_mix_data = None | |

173 | self.class_permutation_list = None | |

[10611] | 174 | |

[8042] | 175 | clearData = clear_data |

176 | ||

177 | def set_statusbar_text(self, *args): | |

178 | pass | |

[10611] | 179 | |

[8042] | 180 | setStatusBarText = set_statusbar_text |

181 | ||

182 | def show_all_attributes(self): | |

[10611] | 183 | self.graph.anchor_data = [(0, 0, a.name) |

184 | for a in self.graph.data_domain.attributes] | |

[8042] | 185 | self.radial_anchors() |

[10611] | 186 | |

[8042] | 187 | showAllAttributes = show_all_attributes |

188 | ||

189 | def get_shown_attribute_list(self): | |

[10475] | 190 | return [anchor[2] for anchor in self.graph.anchor_data] |

[8042] | 191 | |

192 | getShownAttributeList = get_shown_attribute_list | |

193 | ||

194 | def radial_anchors(self): | |

195 | """ | |

196 | Reset the projection so that the anchors (projections of attributes) | |

197 | are placed evenly around the circle. | |

198 | ||

199 | """ | |

200 | attr_list = self.get_shown_attribute_list() | |

[8765] | 201 | if not attr_list: |

202 | return | |

[10611] | 203 | if "3d" in getattr(self, "parentName", "").lower(): |

[8765] | 204 | self.graph.anchor_data = self.graph.create_anchors(len(attr_list), attr_list) |

205 | return | |

[10611] | 206 | phi = 2 * math.pi / len(attr_list) |

207 | self.graph.anchor_data = [(math.cos(i * phi), math.sin(i * phi), a) | |

208 | for i, a in enumerate(attr_list)] | |

[8042] | 209 | |

210 | radialAnchors = radial_anchors | |

211 | ||

    def random_anchors(self):
        """
        Set the projection to a random one.

        The sampling scheme depends on :obj:`restrain`:
        0 - anchors anywhere in the disc (radius in [0.3, 1.0]);
        1 - anchors restrained to the unit circle;
        otherwise - fixed angles, random radii.
        """
        if not self.graph.have_data:
            return
        attr_list = self.get_shown_attribute_list()
        if not attr_list:
            return
        # 3D variant: sample points in the ball / on the sphere instead.
        if "3d" in getattr(self, "parentName", "").lower():
            if self.restrain == 0:
                # Anywhere inside the ball, radius in [0.3, 1.0).
                def ranch(i, label):
                    r = 0.3 + 0.7 * random.random()
                    phi = 2 * math.pi * random.random()
                    theta = math.pi * random.random()
                    return (r * math.sin(theta) * math.cos(phi),
                            r * math.sin(theta) * math.sin(phi),
                            r * math.cos(theta),
                            label)
            elif self.restrain == 1:
                # On the unit sphere.
                def ranch(i, label):
                    phi = 2 * math.pi * random.random()
                    theta = math.pi * random.random()
                    r = 1.
                    return (r * math.sin(theta) * math.cos(phi),
                            r * math.sin(theta) * math.sin(phi),
                            r * math.cos(theta),
                            label)
            else:
                # Fixed directions from create_anchors, random radii.
                self.graph.anchor_data = self.graph.create_anchors(len(attr_list), attr_list)

                def ranch(i, label):
                    r = 0.3 + 0.7 * random.random()
                    return (r * self.graph.anchor_data[i][0],
                            r * self.graph.anchor_data[i][1],
                            r * self.graph.anchor_data[i][2],
                            label)

            anchors = [ranch(*a) for a in enumerate(attr_list)]

            # Rescale so the farthest anchor touches the unit sphere
            # (already guaranteed when restrained to the sphere).
            if not self.restrain == 1:
                maxdist = math.sqrt(max([x[0] ** 2 + x[1] ** 2 + x[2] ** 2 for x in anchors]))
                anchors = [(x[0] / maxdist, x[1] / maxdist, x[2] / maxdist, x[3]) for x in anchors]

            self.graph.anchor_data = anchors
            return

        # 2D variants, mirroring the three restrain modes above.
        if self.restrain == 0:
            def ranch(i, label):
                r = 0.3 + 0.7 * random.random()
                phi = 2 * math.pi * random.random()
                return r * math.cos(phi), r * math.sin(phi), label

        elif self.restrain == 1:
            def ranch(i, label):
                phi = 2 * math.pi * random.random()
                return math.cos(phi), math.sin(phi), label

        else:
            # Fixed, evenly spaced angles; only the radius is random.
            def ranch(i, label):
                r = 0.3 + 0.7 * random.random()
                phi = 2 * math.pi * i / max(1, len(attr_list))
                return r * math.cos(phi), r * math.sin(phi), label

        anchors = [ranch(*a) for a in enumerate(attr_list)]

        # Rescale so the farthest anchor lies on the unit circle.
        if not self.restrain == 1:
            maxdist = math.sqrt(max([x[0] ** 2 + x[1] ** 2 for x in anchors]))
            anchors = [(x[0] / maxdist, x[1] / maxdist, x[2]) for x in anchors]

        if not self.restrain == 2 and self.mirror_symmetry:
            #TODO: Need to rotate and mirror here
            pass

        self.graph.anchor_data = anchors

    randomAnchors = random_anchors

290 | ||

    @deprecated_keywords({"singleStep": "single_step"})
    def optimize_separation(self, steps=10, single_step=False, distances=None):
        """
        Optimize the class separation. If you did not change any of the settings
        which are not documented above, it will call a fast C++ routine which
        will make :obj:`steps` optimization steps at a time, after which the
        graph (if one is given) is updated. If :obj:`single_step` is True, it
        will do that only once,
        otherwise it calls it on and on, and compares the current positions of
        the anchors with those 50 calls ago. If no anchor moved for more than
        1e-3, it stops. In Orange Canvas the optimization is also stopped if
        someone outside (namely, the stop button) manages to set the FreeViz's
        flag attribute
        :obj:`Orange.projection.linear.FreeViz.cancel_optimization`.
        """
        # check if we have data and a discrete class
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not (self.graph.data_has_class or distances)):
            return
        ai = self.graph.attribute_name_index
        attr_indices = [ai[label] for label in self.get_shown_attribute_list()]
        if not attr_indices: return

        # Dispatch on the selected implementation; the fast C++ path handles
        # its own looping, the slow/LDA Python paths are stepped below.
        if self.implementation == FAST_IMPLEMENTATION and not hasattr(self, '_use_3D'): # TODO
            return self.optimize_fast_separation(steps, single_step, distances)
        elif self.implementation == LDA_IMPLEMENTATION:
            impl = self.optimize_lda_separation
        else:
            impl = self.optimize_slow_separation

        # A subclass implies a GUI context; qApp is needed to keep it responsive.
        if self.__class__ != FreeViz: from PyQt4.QtGui import qApp
        if single_step: steps = 1
        xanchors = None
        yanchors = None
        zanchors = None

        if hasattr(self, '_use_3D'):
            if self.implementation == SLOW_IMPLEMENTATION:
                impl = self.optimize_slow_separation_3D
            elif self.implementation == LDA_IMPLEMENTATION:
                impl = self.optimize_lda_separation_3D
            else:
                print('Unimplemented method!')
                return

            # Up to 50 rounds of `steps` iterations each (1 round if single_step).
            for c in range((single_step and 1) or 50):
                for i in range(steps):
                    if self.__class__ != FreeViz and self.cancel_optimization == 1:
                        return
                    self.graph.anchor_data, (xanchors, yanchors, zanchors) = impl(attr_indices,
                                                                                  self.graph.anchor_data,
                                                                                  xanchors,
                                                                                  yanchors,
                                                                                  zanchors)
                if self.__class__ != FreeViz: qApp.processEvents()
                if hasattr(self.graph, "updateGraph"): self.graph.updateData()
        else:
            for c in range((single_step and 1) or 50):
                for i in range(steps):
                    if self.__class__ != FreeViz and self.cancel_optimization == 1:
                        return
                    self.graph.anchor_data, (xanchors, yanchors) = impl(attr_indices,
                                                                        self.graph.anchor_data,
                                                                        xanchors,
                                                                        yanchors)
                if self.__class__ != FreeViz: qApp.processEvents()
                if hasattr(self.graph, "updateGraph"): self.graph.updateData()

    optimizeSeparation = optimize_separation

360 | ||

    @deprecated_keywords({"singleStep": "single_step"})
    def optimize_fast_separation(self, steps=10, single_step=False, distances=None):
        """Run the C++ anchor optimizer until convergence (or one round when
        *single_step*). Returns the number of steps taken, or 0 when no valid
        examples exist. Convergence: anchors moved less than 1e-3 (squared sum)
        over the last 50 rounds."""
        # Pick the optimizer variant matching the restrain mode.
        optimizer = [orangeom.optimizeAnchors, orangeom.optimizeAnchorsRadial,
                     orangeom.optimizeAnchorsR][self.restrain]
        ai = self.graph.attribute_name_index
        attr_indices = [ai[label] for label in self.get_shown_attribute_list()]
        if not attr_indices: return

        # repeat until less than 1% energy decrease in 5 consecutive iterations*steps steps
        positions = [numpy.array([x[:2] for x in self.graph.anchor_data])]
        needed_steps = 0

        valid_data = self.graph.get_valid_list(attr_indices)
        n_valid = sum(valid_data)
        if not n_valid:
            return 0

        # Keep only rows valid for every shown attribute.
        dataset = numpy.compress(valid_data, self.graph.no_jittering_scaled_data,
                                 axis=1)
        dataset = numpy.transpose(dataset).tolist()
        if self.__class__ != FreeViz: from PyQt4.QtGui import qApp

        if distances:
            if n_valid != len(valid_data):
                # Compact the distance matrix to valid rows/columns only.
                # NOTE(review): the inner loop reuses the name `vr`, shadowing
                # the outer loop variable — harmless here (the outer loop
                # rebinds it each iteration) but fragile.
                classes = Orange.misc.SymMatrix(n_valid)
                r = 0
                for ro, vr in enumerate(valid_data):
                    if not vr:
                        continue
                    c = 0
                    for co, vr in enumerate(valid_data):
                        if vr:
                            classes[r, c] = distances[ro, co]
                            c += 1
                    r += 1
            else:
                classes = distances
        else:
            classes = numpy.compress(valid_data,
                                     self.graph.original_data[self.graph.data_class_index]).tolist()
        while 1:
            self.graph.anchor_data = optimizer(dataset, classes,
                                               self.graph.anchor_data,
                                               attr_indices,
                                               attractG=self.attract_g,
                                               repelG=self.repel_g,
                                               law=self.law,
                                               sigma2=self.force_sigma,
                                               dynamicBalancing=self.force_balancing,
                                               steps=steps,
                                               normalizeExamples=self.graph.normalize_examples,
                                               contClass=2 if distances
                                               else self.graph.data_has_continuous_class,
                                               mirrorSymmetry=self.mirror_symmetry)
            needed_steps += steps

            if self.__class__ != FreeViz:
                qApp.processEvents()

            if hasattr(self.graph, "updateData"):
                self.graph.potentials_emp = None
                self.graph.updateData()

            # Sliding window of the last 50 anchor snapshots for the
            # convergence test.
            positions = positions[-49:] + [numpy.array([x[:2] for x
                                                        in self.graph.anchor_data])]
            if len(positions) == 50:
                m = max(numpy.sum((positions[0] - positions[49]) ** 2), 0)
                if m < 1e-3: break
            if single_step or (self.__class__ != FreeViz
                               and self.cancel_optimization):
                break
        return needed_steps

    optimize_FAST_Separation = optimize_fast_separation

435 | ||

    @deprecated_keywords({"attrIndices": "attr_indices",
                          "anchorData": "anchor_data",
                          "XAnchors": "xanchors",
                          "YAnchors": "yanchors"})
    def optimize_lda_separation(self, attr_indices, anchor_data, xanchors=None, yanchors=None):
        """One LDA-style gradient step: push class centroids apart and move
        anchors along the induced gradient. Returns the updated anchor
        triples and the (x, y) anchor arrays; returns the inputs unchanged
        when there is no usable data."""
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not self.graph.data_has_discrete_class):
            return anchor_data, (xanchors, yanchors)
        class_count = len(self.graph.data_domain.classVar.values)
        valid_data = self.graph.get_valid_list(attr_indices)
        selected_data = numpy.compress(valid_data,
                                       numpy.take(self.graph.no_jittering_scaled_data,
                                                  attr_indices, axis=0),
                                       axis=1)

        # Derive anchor coordinate arrays from the triples when not supplied.
        if xanchors is None:
            xanchors = numpy.array([a[0] for a in anchor_data], numpy.float)
        if yanchors is None:
            yanchors = numpy.array([a[1] for a in anchor_data], numpy.float)

        trans_proj_data = self.graph.create_projection_as_numeric_array(attr_indices,
                                                                        validData=valid_data,
                                                                        xanchors=xanchors,
                                                                        yanchors=yanchors,
                                                                        scaleFactor=self.graph.scale_factor,
                                                                        normalize=self.graph.normalize_examples,
                                                                        useAnchorData=1)
        if trans_proj_data is None:
            return anchor_data, (xanchors, yanchors)

        # Rows of trans_proj_data are (x, y, class); transpose to columns.
        proj_data = numpy.transpose(trans_proj_data)
        x_positions, y_positions, classData = (proj_data[0], proj_data[1],
                                               proj_data[2])

        # Per-class centroids of the projected points.
        averages = []
        for i in range(class_count):
            ind = classData == i
            xpos = numpy.compress(ind, x_positions)
            ypos = numpy.compress(ind, y_positions)
            xave = numpy.sum(xpos) / len(xpos)
            yave = numpy.sum(ypos) / len(ypos)
            averages.append((xave, yave))

        # compute the positions of all the points. we will try to move all points so that the center will be in the (0,0)
        x_center_vector = -numpy.sum(x_positions) / len(x_positions)
        y_center_vector = -numpy.sum(y_positions) / len(y_positions)

        mean_destination_vectors = []

        # Repulsion between class centroids, falling off with 1/r^3;
        # coincident centroids are pushed apart along a fixed angle.
        for i in range(class_count):
            xdir, ydir = 0., 0.
            for j in range(class_count):
                if i == j: continue
                r = math.sqrt((averages[i][0] - averages[j][0]) ** 2 +
                              (averages[i][1] - averages[j][1]) ** 2)
                if r == 0.0:
                    xdir += math.cos((i / float(class_count)) * 2 * math.pi)
                    ydir += math.sin((i / float(class_count)) * 2 * math.pi)
                else:
                    xdir += (1 / r ** 3) * ((averages[i][0] - averages[j][0]))
                    ydir += (1 / r ** 3) * ((averages[i][1] - averages[j][1]))

            mean_destination_vectors.append((xdir, ydir))

        maxlength = math.sqrt(max([x ** 2 + y ** 2 for (x, y)
                                   in mean_destination_vectors]))
        mean_destination_vectors = [(x / (2 * maxlength), y / (2 * maxlength))
                                    for (x, y) in mean_destination_vectors] #normalize destination vectors to some normal values
        mean_destination_vectors = [(mean_destination_vectors[i][0] + averages[i][0],
                                     mean_destination_vectors[i][1] + averages[i][1])
                                    for i in range(len(mean_destination_vectors))] # add destination vectors to the class averages
        mean_destination_vectors = [(x + x_center_vector, y + y_center_vector)
                                    for (x, y) in mean_destination_vectors] # center mean values

        fxs = numpy.zeros(len(x_positions), numpy.float) # forces
        fys = numpy.zeros(len(x_positions), numpy.float)

        # Each point is pulled toward its class's destination point.
        for c in range(class_count):
            ind = (classData == c)
            numpy.putmask(fxs, ind, mean_destination_vectors[c][0] - x_positions)
            numpy.putmask(fys, ind, mean_destination_vectors[c][1] - y_positions)

        # compute gradient for all anchors
        gxs = numpy.array([sum(fxs * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)
        gys = numpy.array([sum(fys * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)

        # Damp the step so the largest component moves by 1/20.
        m = max(max(abs(gxs)), max(abs(gys)))
        gxs /= (20 * m)
        gys /= (20 * m)

        newxanchors = xanchors + gxs
        newyanchors = yanchors + gys

        # normalize so that the anchor most far away will lie on the circle
        m = math.sqrt(max(newxanchors ** 2 + newyanchors ** 2))
        newxanchors /= m
        newyanchors /= m

        return [(newxanchors[i], newyanchors[i], anchor_data[i][2])
                for i in range(len(anchor_data))], (newxanchors, newyanchors)

    optimize_LDA_Separation = optimize_lda_separation

540 | ||

    @deprecated_keywords({"attrIndices": "attr_indices",
                          "anchorData": "anchor_data",
                          "XAnchors": "xanchors",
                          "YAnchors": "yanchors"})
    def optimize_slow_separation(self, attr_indices, anchor_data, xanchors=None, yanchors=None):
        """One pairwise-force gradient step (pure Python fallback): same-class
        point pairs attract (force ~ r^2), different-class pairs repel
        (force ~ 1/r^2). Returns the updated anchor triples and the (x, y)
        anchor arrays; returns the inputs unchanged when there is no usable
        data."""
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not self.graph.data_has_discrete_class):
            return anchor_data, (xanchors, yanchors)
        valid_data = self.graph.get_valid_list(attr_indices)
        selected_data = numpy.compress(valid_data, numpy.take(self.graph.no_jittering_scaled_data,
                                                              attr_indices,
                                                              axis=0),
                                       axis=1)

        # Derive anchor coordinate arrays from the triples when not supplied.
        if xanchors is None:
            xanchors = numpy.array([a[0] for a in anchor_data], numpy.float)
        if yanchors is None:
            yanchors = numpy.array([a[1] for a in anchor_data], numpy.float)

        trans_proj_data = self.graph.create_projection_as_numeric_array(attr_indices,
                                                                        validData=valid_data,
                                                                        xanchors=xanchors,
                                                                        yanchors=yanchors,
                                                                        scaleFactor=self.graph.scale_factor,
                                                                        normalize=self.graph.normalize_examples,
                                                                        useAnchorData=1)
        if trans_proj_data is None:
            return anchor_data, (xanchors, yanchors)

        proj_data = numpy.transpose(trans_proj_data)
        # The *2 copies are rotated one slot per iteration so that every
        # ordered pair of points is visited exactly once.
        x_positions = proj_data[0]
        x_positions2 = numpy.array(x_positions)
        y_positions = proj_data[1]
        y_positions2 = numpy.array(y_positions)
        class_data = proj_data[2]
        class_data2 = numpy.array(class_data)

        fxs = numpy.zeros(len(x_positions), numpy.float) # forces
        fys = numpy.zeros(len(x_positions), numpy.float)

        # NOTE(review): `range(...)[1:] + [0]` concatenates a list — this is
        # Python 2 code; under Python 3 `range` is not sliceable/concatenable.
        rotate_array = range(len(x_positions))
        rotate_array = rotate_array[1:] + [0]
        for i in range(len(x_positions) - 1):
            x_positions2 = numpy.take(x_positions2, rotate_array)
            y_positions2 = numpy.take(y_positions2, rotate_array)
            class_data2 = numpy.take(class_data2, rotate_array)
            dx = x_positions2 - x_positions
            dy = y_positions2 - y_positions
            rs2 = dx ** 2 + dy ** 2
            rs2 += numpy.where(rs2 == 0.0, 0.0001, 0.0) # replace zeros to avoid divisions by zero
            rs = numpy.sqrt(rs2)

            # Attract same-class pairs (~r^2), repel different-class (~1/r^2).
            F = numpy.zeros(len(x_positions), numpy.float)
            classDiff = numpy.where(class_data == class_data2, 1, 0)
            numpy.putmask(F, classDiff, 150 * self.attract_g * rs2)
            numpy.putmask(F, 1 - classDiff, -self.repel_g / rs2)
            fxs += F * dx / rs
            fys += F * dy / rs

        # compute gradient for all anchors
        gxs = numpy.array([sum(fxs * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)
        gys = numpy.array([sum(fys * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)

        # Damp the step so the largest component moves by 1/20.
        m = max(max(abs(gxs)), max(abs(gys)))
        gxs /= (20 * m)
        gys /= (20 * m)

        newxanchors = xanchors + gxs
        newyanchors = yanchors + gys

        # normalize so that the anchor most far away will lie on the circle
        m = math.sqrt(max(newxanchors ** 2 + newyanchors ** 2))
        newxanchors /= m
        newyanchors /= m
        return [(newxanchors[i], newyanchors[i], anchor_data[i][2])
                for i in range(len(anchor_data))], (newxanchors, newyanchors)

    optimize_SLOW_Separation = optimize_slow_separation

621 | ||

[8761] | 622 | |

    @deprecated_keywords({"attrIndices": "attr_indices",
                          "anchorData": "anchor_data",
                          "XAnchors": "xanchors",
                          "YAnchors": "yanchors"})
    def optimize_lda_separation_3D(self, attr_indices, anchor_data, xanchors=None, yanchors=None, zanchors=None):
        """3D variant of the LDA-style gradient step.

        NOTE(review): despite taking and projecting z coordinates, the force,
        gradient and return computations below only cover x and y — z anchors
        are never updated, and the method returns a 2-tuple of anchor arrays
        while the 3D branch of optimize_separation unpacks three. Looks
        unfinished — verify before relying on the 3D LDA path.
        """
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not self.graph.data_has_discrete_class):
            return anchor_data, (xanchors, yanchors, zanchors)
        class_count = len(self.graph.data_domain.classVar.values)
        valid_data = self.graph.get_valid_list(attr_indices)
        selected_data = numpy.compress(valid_data,
                                       numpy.take(self.graph.no_jittering_scaled_data,
                                                  attr_indices, axis=0),
                                       axis=1)

        # Derive anchor coordinate arrays from the triples when not supplied.
        if xanchors is None:
            xanchors = numpy.array([a[0] for a in anchor_data], numpy.float)
        if yanchors is None:
            yanchors = numpy.array([a[1] for a in anchor_data], numpy.float)
        if zanchors is None:
            zanchors = numpy.array([a[2] for a in anchor_data], numpy.float)

        trans_proj_data = self.graph.create_projection_as_numeric_array(attr_indices,
                                                                        validData=valid_data,
                                                                        xanchors=xanchors,
                                                                        yanchors=yanchors,
                                                                        zanchors=zanchors,
                                                                        scaleFactor=self.graph.scale_factor,
                                                                        normalize=self.graph.normalize_examples,
                                                                        useAnchorData=1)
        if trans_proj_data is None:
            return anchor_data, (xanchors, yanchors, zanchors)

        # Rows are (x, y, z, class); transpose to columns.
        proj_data = numpy.transpose(trans_proj_data)
        x_positions, y_positions, z_positions, classData = (proj_data[0],
                                                            proj_data[1],
                                                            proj_data[2],
                                                            proj_data[3])

        # Per-class centroids (z is computed but unused below — see NOTE).
        averages = []
        for i in range(class_count):
            ind = classData == i
            xpos = numpy.compress(ind, x_positions)
            ypos = numpy.compress(ind, y_positions)
            zpos = numpy.compress(ind, z_positions)
            xave = numpy.sum(xpos) / len(xpos)
            yave = numpy.sum(ypos) / len(ypos)
            zave = numpy.sum(zpos) / len(zpos)
            averages.append((xave, yave, zave))

        # compute the positions of all the points. we will try to move all points so that the center will be in the (0,0)
        x_center_vector = -numpy.sum(x_positions) / len(x_positions)
        y_center_vector = -numpy.sum(y_positions) / len(y_positions)

        mean_destination_vectors = []

        # Repulsion between class centroids in the x-y plane only,
        # falling off with 1/r^3; coincident centroids get a fixed angle.
        for i in range(class_count):
            xdir, ydir = 0., 0.
            for j in range(class_count):
                if i == j: continue
                r = math.sqrt((averages[i][0] - averages[j][0]) ** 2 +
                              (averages[i][1] - averages[j][1]) ** 2)
                if r == 0.0:
                    xdir += math.cos((i / float(class_count)) * 2 * math.pi)
                    ydir += math.sin((i / float(class_count)) * 2 * math.pi)
                else:
                    xdir += (1 / r ** 3) * ((averages[i][0] - averages[j][0]))
                    ydir += (1 / r ** 3) * ((averages[i][1] - averages[j][1]))

            mean_destination_vectors.append((xdir, ydir))

        maxlength = math.sqrt(max([x ** 2 + y ** 2 for (x, y)
                                   in mean_destination_vectors]))
        mean_destination_vectors = [(x / (2 * maxlength), y / (2 * maxlength)) for (x, y)
                                    in
                                    mean_destination_vectors] # normalize destination vectors to some normal values
        mean_destination_vectors = [(mean_destination_vectors[i][0] + averages[i][0],
                                     mean_destination_vectors[i][1] + averages[i][1])
                                    for i in range(len(mean_destination_vectors))] # add destination vectors to the class averages
        mean_destination_vectors = [(x + x_center_vector, y + y_center_vector)
                                    for (x, y) in mean_destination_vectors] # center mean values

        fxs = numpy.zeros(len(x_positions), numpy.float) # forces
        fys = numpy.zeros(len(x_positions), numpy.float)

        # Each point is pulled toward its class's destination point.
        for c in range(class_count):
            ind = (classData == c)
            numpy.putmask(fxs, ind, mean_destination_vectors[c][0] - x_positions)
            numpy.putmask(fys, ind, mean_destination_vectors[c][1] - y_positions)

        # compute gradient for all anchors
        gxs = numpy.array([sum(fxs * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)
        gys = numpy.array([sum(fys * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)

        # Damp the step so the largest component moves by 1/20.
        m = max(max(abs(gxs)), max(abs(gys)))
        gxs /= (20 * m)
        gys /= (20 * m)

        newxanchors = xanchors + gxs
        newyanchors = yanchors + gys

        # normalize so that the anchor most far away will lie on the circle
        m = math.sqrt(max(newxanchors ** 2 + newyanchors ** 2))
        newxanchors /= m
        newyanchors /= m

        return [(newxanchors[i], newyanchors[i], anchor_data[i][2])
                for i in range(len(anchor_data))], (newxanchors, newyanchors)

    optimize_LDA_Separation_3D = optimize_lda_separation_3D

735 | ||

    @deprecated_keywords({"attrIndices": "attr_indices",
                          "anchorData": "anchor_data",
                          "XAnchors": "xanchors",
                          "YAnchors": "yanchors"})
    def optimize_slow_separation_3D(self, attr_indices, anchor_data, xanchors=None, yanchors=None, zanchors=None):
        """
        Perform one gradient step of the force-based ("slow") FreeViz anchor
        optimization in three dimensions.

        Pairs of projected points with the same class attract each other with
        a force proportional to their squared distance (scaled by
        ``self.attract_g``); pairs with different classes repel with an
        inverse-square force (scaled by ``self.repel_g``).  The accumulated
        per-point forces are projected back onto the anchors as a gradient,
        and the anchors are rescaled so the farthest one lies on the unit
        sphere.

        :param attr_indices: indices of the attributes used in the projection.
        :param anchor_data: list of (x, y, z, label) anchor tuples.
        :param xanchors: optional anchor x coordinates; extracted from
            anchor_data when None (likewise yanchors and zanchors).
        :return: (updated anchor_data list, (xanchors, yanchors, zanchors));
            the input anchors are returned unchanged when there is no usable
            data or the projection cannot be computed.
        """
        # bail out when there is no data or the class is not discrete
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not self.graph.data_has_discrete_class):
            return anchor_data, (xanchors, yanchors, zanchors)
        valid_data = self.graph.get_valid_list(attr_indices)
        selected_data = numpy.compress(valid_data, numpy.take(self.graph.no_jittering_scaled_data,
                                                              attr_indices,
                                                              axis=0),
                                       axis=1)

        # derive missing anchor coordinate arrays from anchor_data
        # NOTE(review): numpy.float was removed in NumPy 1.24; this module
        # targets the legacy NumPy shipped with Orange 2.
        if xanchors is None:
            xanchors = numpy.array([a[0] for a in anchor_data], numpy.float)
        if yanchors is None:
            yanchors = numpy.array([a[1] for a in anchor_data], numpy.float)
        if zanchors is None:
            zanchors = numpy.array([a[2] for a in anchor_data], numpy.float)

        trans_proj_data = self.graph.create_projection_as_numeric_array(attr_indices,
                                                                        validData=valid_data,
                                                                        XAnchors=xanchors,
                                                                        YAnchors=yanchors,
                                                                        ZAnchors=zanchors,
                                                                        scaleFactor=self.graph.scale_factor,
                                                                        normalize=self.graph.normalize_examples,
                                                                        useAnchorData=1)
        if trans_proj_data is None:
            return anchor_data, (xanchors, yanchors, zanchors)

        proj_data = numpy.transpose(trans_proj_data)
        x_positions = proj_data[0]
        x_positions2 = numpy.array(x_positions)
        y_positions = proj_data[1]
        y_positions2 = numpy.array(y_positions)
        z_positions = proj_data[2]
        z_positions2 = numpy.array(z_positions)
        class_data = proj_data[3]
        class_data2 = numpy.array(class_data)

        fxs = numpy.zeros(len(x_positions), numpy.float)        # forces
        fys = numpy.zeros(len(x_positions), numpy.float)
        fzs = numpy.zeros(len(x_positions), numpy.float)

        # compare every point with every other point by repeatedly rotating
        # a copy of the position arrays by one element (len-1 rotations cover
        # all ordered pairs)
        rotate_array = range(len(x_positions))
        rotate_array = rotate_array[1:] + [0]
        for i in range(len(x_positions) - 1):
            x_positions2 = numpy.take(x_positions2, rotate_array)
            y_positions2 = numpy.take(y_positions2, rotate_array)
            z_positions2 = numpy.take(z_positions2, rotate_array)
            class_data2 = numpy.take(class_data2, rotate_array)
            dx = x_positions2 - x_positions
            dy = y_positions2 - y_positions
            dz = z_positions2 - z_positions
            rs2 = dx ** 2 + dy ** 2 + dz ** 2
            rs2 += numpy.where(rs2 == 0.0, 0.0001, 0.0)    # replace zeros to avoid divisions by zero
            rs = numpy.sqrt(rs2)

            # attractive force for same-class pairs, repulsive otherwise
            F = numpy.zeros(len(x_positions), numpy.float)
            classDiff = numpy.where(class_data == class_data2, 1, 0)
            numpy.putmask(F, classDiff, 150 * self.attract_g * rs2)
            numpy.putmask(F, 1 - classDiff, -self.repel_g / rs2)
            fxs += F * dx / rs
            fys += F * dy / rs
            fzs += F * dz / rs

        # compute gradient for all anchors
        gxs = numpy.array([sum(fxs * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)
        gys = numpy.array([sum(fys * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)
        gzs = numpy.array([sum(fzs * selected_data[i])
                           for i in range(len(anchor_data))], numpy.float)

        # damp the step size relative to the largest gradient component
        m = max(max(abs(gxs)), max(abs(gys)), max(abs(gzs)))
        gxs /= (20 * m)
        gys /= (20 * m)
        gzs /= (20 * m)

        newxanchors = xanchors + gxs
        newyanchors = yanchors + gys
        newzanchors = zanchors + gzs

        # normalize so that the anchor most far away will lie on the circle
        m = math.sqrt(max(newxanchors ** 2 + newyanchors ** 2 + newzanchors ** 2))
        newxanchors /= m
        newyanchors /= m
        newzanchors /= m
        return [(newxanchors[i], newyanchors[i], newzanchors[i], anchor_data[i][3])
                for i in range(len(anchor_data))], (newxanchors, newyanchors, newzanchors)

    optimize_SLOW_Separation_3D = optimize_slow_separation_3D

830 | ||

831 | ||

832 | ||

[8042] | 833 | # ############################################################### |

834 | # S2N HEURISTIC FUNCTIONS | |

835 | # ############################################################### | |

836 | ||

837 | ||

838 | ||

    # place a subset of attributes around the circle. this subset must contain "good" attributes for each of the class values
    @deprecated_keywords({"setAttributeListInRadviz":
                          "set_attribute_list_in_radviz"})
    def s2n_mix_anchors(self, set_attribute_list_in_radviz=1):
        """
        Place the highest-ranked attributes (signal-to-noise heuristic via
        ``visfuncts.S2NMeasureMix``) around the projection circle, grouping
        together the attributes that are informative for the same class value.

        The ranking is computed once and cached in ``self.s2n_mix_data``.

        :param set_attribute_list_in_radviz: when run inside a widget
            (subclass of FreeViz), also push the chosen attribute list to it.
        :return: 1 on success, 0 when no attributes could be ranked, None
            when the data has no discrete class.

        NOTE(review): Python 2 integer division is relied on throughout
        (e.g. ``self.s2n_place_attributes / class_count``) — do not port
        to ``/`` under Python 3 without switching to ``//``.
        """
        # check if we have data and a discrete class
        if (not self.graph.have_data or len(self.graph.raw_data) == 0
            or not self.graph.data_has_discrete_class):
            self.set_statusbar_text("S2N only works on data with a discrete class value")
            return

        # compute the quality of attributes only once
        if self.s2n_mix_data is None:
            ranked_attrs, ranked_attrs_by_class = visfuncts.findAttributeGroupsForRadviz(self.graph.raw_data,
                                                                                         visfuncts.S2NMeasureMix())
            self.s2n_mix_data = (ranked_attrs, ranked_attrs_by_class)
            class_count = len(ranked_attrs_by_class)
            attrs = ranked_attrs[:(self.s2n_place_attributes / class_count) *
                                 class_count]  # select appropriate number of attributes
        else:
            class_count = len(self.s2n_mix_data[1])
            attrs = self.s2n_mix_data[0][:(self.s2n_place_attributes / class_count) *
                                         class_count]

        if not len(attrs):
            self.set_statusbar_text("No discrete attributes found")
            return 0

        # offsets alternate around 0: [0, 1, -1, 2, -2, ...] so each class
        # group spreads symmetrically around its starting angle
        arr = [0]  # array that will tell where to put the next attribute
        for i in range(1, len(attrs) / 2): arr += [i, -i]

        phi = (2 * math.pi * self.s2n_spread) / (len(attrs) * 10.0)
        anchor_data = [];
        start = []
        arr2 = arr[:(len(attrs) / class_count) + 1]
        for cls in range(class_count):
            # each class value gets its own sector of the circle
            start_pos = (2 * math.pi * cls) / class_count
            if self.class_permutation_list: cls = self.class_permutation_list[cls]
            attrs_cls = attrs[cls::class_count]
            temp_data = [(arr2[i], math.cos(start_pos + arr2[i] * phi),
                          math.sin(start_pos + arr2[i] * phi),
                          attrs_cls[i]) for i in
                         range(min(len(arr2), len(attrs_cls)))]
            start.append(len(anchor_data) + len(arr2) / 2)  # starting indices for each class value
            temp_data.sort()
            anchor_data += [(x, y, name) for (i, x, y, name) in temp_data]

        # rotate the list so sectors are centered rather than starting at 0
        anchor_data = anchor_data[(len(attrs) / (2 * class_count)):] + anchor_data[:(len(attrs) / (2 * class_count))]
        self.graph.anchor_data = anchor_data
        attrNames = [anchor[2] for anchor in anchor_data]

        # when running inside a widget subclass, refresh its display too
        if self.__class__ != FreeViz:
            if set_attribute_list_in_radviz:
                self.parentWidget.setShownAttributeList(attrNames)
            self.graph.updateData(attrNames)
            self.graph.repaint()
        return 1

    s2nMixAnchors = s2n_mix_anchors

897 | ||

    # find interesting linear projection using PCA, SPCA, or PLS
    @deprecated_keywords({"attrIndices": "attr_indices",
                          "setAnchors": "set_anchors",
                          "percentDataUsed": "percent_data_used"})
    def find_projection(self, method, attr_indices=None, set_anchors=0, percent_data_used=100):
        """
        Compute anchor positions for a linear projection of the selected
        attributes using PCA, supervised PCA or PLS.

        :param method: one of DR_PCA, DR_SPCA, DR_PLS (SPCA and PLS require
            the data to have a class).
        :param attr_indices: attribute indices to project; defaults to the
            currently shown attributes.
        :param set_anchors: if true, push the computed anchors into the graph
            and repaint it.
        :param percent_data_used: percentage of instances to use (the rest is
            subsampled away).
        :return: ``(xanchors, yanchors, (attributes, attr_indices))`` — or a
            four-tuple including ``zanchors`` when a 3D subclass sets
            ``_use_3D`` — or None when no valid projection can be computed.
        """
        if not self.graph.have_data: return
        ai = self.graph.attribute_name_index
        if attr_indices is None:
            attributes = self.get_shown_attribute_list()
            attr_indices = [ai[label] for label in attributes]
        if not len(attr_indices): return None

        valid_data = self.graph.get_valid_list(attr_indices)
        if sum(valid_data) == 0: return None

        # rows = attributes, columns = valid instances
        data_matrix = numpy.compress(valid_data, numpy.take(self.graph.no_jittering_scaled_data,
                                                            attr_indices,
                                                            axis=0),
                                     axis=1)
        if self.graph.data_has_class:
            class_array = numpy.compress(valid_data,
                                         self.graph.no_jittering_scaled_data[self.graph.data_class_index])

        # optionally subsample the instances
        if percent_data_used != 100:
            indices = data.sample.SubsetIndices2(self.graph.raw_data,
                                                 1.0 - (float(percent_data_used) / 100.0))
            try:
                data_matrix = numpy.compress(indices, data_matrix, axis=1)
            except ValueError:
                pass
            if self.graph.data_has_class:
                class_array = numpy.compress(indices, class_array)

        # 3D subclasses mark themselves with a _use_3D attribute
        ncomps = 3 if hasattr(self, '_use_3D') else 2
        vectors = None
        if method == DR_PCA:
            pca = Pca(standardize=False, max_components=ncomps,
                      use_generalized_eigenvectors=False, ddof=0)
            domain = data.Domain([feature.Continuous("g%d" % i) for i
                                  in xrange(len(data_matrix))], False)
            pca = pca(data.Table(domain, data_matrix.T))
            vals, vectors = pca.variances, pca.projection
        elif method == DR_SPCA and self.graph.data_has_class:
            pca = Spca(standardize=False, max_components=ncomps,
                       use_generalized_eigenvectors=self.use_generalized_eigenvectors, ddof=0)
            domain = data.Domain([feature.Continuous("g%d" % i) for i
                                  in xrange(len(data_matrix))], feature.Continuous("c"))
            pca = pca(data.Table(domain,
                                 numpy.hstack([data_matrix.T, numpy.array(class_array, ndmin=2).T])))
            vals, vectors = pca.variances, pca.projection
        elif method == DR_PLS and self.graph.data_has_class:
            data_matrix = data_matrix.transpose()
            class_matrix = numpy.transpose(numpy.matrix(class_array))
            vectors = create_pls_projection(data_matrix, class_matrix, ncomps)
            vectors = vectors.T

        # test if all values are 0, if there is an invalid number in the array and if there are complex numbers in the array
        if (vectors is None or not vectors.any() or
            False in numpy.isfinite(vectors) or False in numpy.isreal(vectors)):
            self.set_statusbar_text("Unable to compute anchor positions for the selected attributes")
            return None

        xanchors = vectors[0]
        yanchors = vectors[1]

        # rescale so the farthest anchor lies on the unit circle/sphere
        if ncomps == 3:
            zanchors = vectors[2]
            m = math.sqrt(max(xanchors ** 2 + yanchors ** 2 + zanchors ** 2))
            zanchors /= m
        else:
            m = math.sqrt(max(xanchors ** 2 + yanchors ** 2))

        xanchors /= m
        yanchors /= m
        names = self.graph.attribute_names
        attributes = [names[attr_indices[i]] for i in range(len(attr_indices))]

        if set_anchors:
            if ncomps == 3:
                self.graph.set_anchors(list(xanchors), list(yanchors), list(zanchors), attributes)
            else:
                self.graph.set_anchors(list(xanchors), list(yanchors), attributes)
            if hasattr(self.graph, "updateData"):
                self.graph.updateData()
            if hasattr(self.graph, "repaint"):
                self.graph.repaint()

        if ncomps == 3:
            return xanchors, yanchors, zanchors, (attributes, attr_indices)
        else:
            return xanchors, yanchors, (attributes, attr_indices)

    findProjection = find_projection

991 | ||

992 | ||

# attach backward-compatible camelCase aliases for the renamed FreeViz members
FreeViz = deprecated_members({"attractG": "attract_g",
                              "repelG": "repel_g",
                              "forceBalancing": "force_balancing",
                              "forceSigma": "force_sigma",
                              "mirrorSymmetry": "mirror_symmetry",
                              "useGeneralizedEigenvectors": "use_generalized_eigenvectors",
                              "stepsBeforeUpdate": "steps_before_update",
                              "s2nSpread": "s2n_spread",
                              "s2nPlaceAttributes": "s2n_place_attributes",
                              "s2nMixData": "s2n_mix_data",
                              "autoSetParameters": "auto_set_parameters",
                              "classPermutationList": "class_permutation_list",
                              "attrsNum": "attrs_num",
                              "cancelOptimization": "cancel_optimization"})(FreeViz)

1007 | ||

1008 | ||

@deprecated_keywords({"X": "x", "Y": "y", "Ncomp": "ncomp"})
def create_pls_projection(x, y, ncomp=2):
    """Predict y from x using first ncomp principal components.

    Implements the NIPALS algorithm for partial least squares: for each
    component, the x-score vector ``t`` and y-score vector ``u`` are
    iterated to convergence and both residual matrices are then deflated.

    :param x: predictor matrix, one row per instance.
    :param y: response matrix, one row per instance.
    :param ncomp: number of PLS components to extract.
    :return: matrix of x-weights with one column per component.

    BUGFIX: the previous implementation reused the per-component vector
    names (t, u, c, w, p, b) for the accumulator matrices, so every
    iteration clobbered the accumulators and the code crashed with a
    TypeError at ``b[i][i] = b``.  The accumulators are now kept in
    separate upper-case arrays.
    """
    # data dimensions
    n, mx = numpy.shape(x)
    my = numpy.shape(y)[1]

    # center both matrices via the module-level center() helper
    x, y = center(x), center(y)

    P = numpy.empty((mx, ncomp))     # x loadings
    W = numpy.empty((mx, ncomp))     # x weights (returned)
    C = numpy.empty((my, ncomp))     # y weights
    T = numpy.empty((n, ncomp))      # x scores
    U = numpy.empty((n, ncomp))      # y scores
    B = numpy.zeros((ncomp, ncomp))  # diagonal regression coefficients

    e, f = x, y  # residual (deflated) matrices

    # main algorithm
    for i in range(ncomp):
        # random initialization of the y-score vector
        u = numpy.random.random_sample((n, 1))
        w = normalize(numpy.dot(e.T, u))
        t = normalize(numpy.dot(e, w))
        c = normalize(numpy.dot(f.T, t))

        dif = t
        # iterate until the x-score vector t converges
        while numpy.linalg.norm(dif) > 10e-16:
            c = normalize(numpy.dot(f.T, t))
            u = numpy.dot(f, c)
            w = normalize(numpy.dot(e.T, u))
            t0 = normalize(numpy.dot(e, w))
            dif = t - t0
            t = t0

        # store the converged component (ravel: inputs may be numpy.matrix)
        T[:, i] = numpy.ravel(t)
        U[:, i] = numpy.ravel(u)
        C[:, i] = numpy.ravel(c)
        W[:, i] = numpy.ravel(w)

        b = numpy.dot(t.T, u)[0, 0]
        B[i, i] = b
        p = numpy.dot(e.T, t)
        P[:, i] = numpy.ravel(p)
        # deflate both residual matrices
        e = e - numpy.dot(t, p.T)
        f = f - b * numpy.dot(t, c.T)

    return W

createPLSProjection = create_pls_projection

1063 | ||

1064 | # ############################################################################# | |

1065 | # class that represents freeviz classifier | |

class FreeVizClassifier(classification.Classifier):
    """
    A kNN classifier on the 2D projection of the data, optimized by FreeViz.

    Usually the learner
    (:class:`Orange.projection.linear.FreeVizLearner`) is used to construct the
    classifier.

    When constructing the classifier manually, the following parameters can
    be passed:

    :param dataset: table of data instances to project to a 2D plane and use for
        classification.
    :type dataset: :class:`Orange.data.Table`

    :param freeviz: the FreeViz algorithm instance to use to optimize the 2D
        projection.
    :type freeviz: :class:`Orange.projection.linear.FreeViz`

    """

    def __init__(self, dataset, freeviz):
        self.freeviz = freeviz

        # hand the data either to a bare FreeViz object or, for widget
        # subclasses, to the enclosing widget
        if self.freeviz.__class__ != FreeViz:
            self.freeviz.parentWidget.setData(dataset)
            self.freeviz.parentWidget.showAllAttributes = 1
        else:
            self.freeviz.graph.set_data(dataset)
            self.freeviz.show_all_attributes()

        # optimize anchor placement starting from the radial layout
        self.freeviz.radial_anchors()
        self.freeviz.optimize_separation()

        graph = self.freeviz.graph
        ai = graph.attribute_name_index
        labels = [a[2] for a in graph.anchor_data]  # attribute names used by the projection
        indices = [ai[label] for label in labels]

        valid_data = graph.get_valid_list(indices)
        domain = data.Domain([graph.data_domain[i].name for i in indices] +
                             [graph.data_domain.classVar.name],
                             graph.data_domain)
        # per-attribute offsets and (min, max) pairs used to rescale new instances
        offsets = [graph.attr_values[graph.attribute_names[i]][0]
                   for i in indices]
        normalizers = [graph.get_min_max_val(i) for i in indices]
        selected_data = numpy.take(graph.original_data, indices, axis=0)
        averages = numpy.average(numpy.compress(valid_data, selected_data,
                                                axis=1), 1)
        class_data = numpy.compress(valid_data,
                                    graph.original_data[graph.data_class_index])

        # NOTE(review): called for its side effect — presumably it fills
        # graph.unscaled_x_positions / unscaled_y_positions read below; verify
        graph.create_projection_as_numeric_array(indices, use_anchor_data=1,
                                                 remove_missing_data=0,
                                                 valid_data=valid_data,
                                                 jitter_size=-1)
        self.classifier = knn.P2NN(domain,
                                   numpy.transpose(numpy.array([numpy.compress(valid_data,
                                                                               graph.unscaled_x_positions),
                                                                numpy.compress(valid_data,
                                                                               graph.unscaled_y_positions),
                                                                class_data])),
                                   graph.anchor_data, offsets, normalizers,
                                   averages, graph.normalize_examples, law=1)

    # classify a single instance by delegating to the underlying P2NN classifier
    @deprecated_keywords({"example": "instance", "returnType": "return_type"})
    def __call__(self, instance, return_type=classification.Classifier.GetValue):
        return self.classifier(instance, return_type)

FreeVizClassifier = deprecated_members({"FreeViz": "freeviz"})(FreeVizClassifier)

[8042] | 1137 | |

class FreeVizLearner(classification.Learner):
    """
    Learner that wraps :class:`FreeViz`: calling it on a data table
    produces a :class:`FreeVizClassifier` built on that table.

    A :class:`FreeViz` instance may be supplied through the keyword
    argument :obj:`freeviz`; otherwise a default one is created.

    If data instances are handed straight to the constructor, learning
    runs immediately and the trained classifier is returned instead of
    the learner object.
    """

    def __new__(cls, freeviz=None, instances=None, weight_id=0, **argkw):
        learner = classification.Learner.__new__(cls, **argkw)
        if instances is None:
            # no data yet: behave like a plain constructor
            return learner
        # data supplied: initialize and train right away
        learner.__init__(freeviz, **argkw)
        return learner.__call__(instances, weight_id)

    def __init__(self, freeviz=None, **kwd):
        self.freeviz = freeviz if freeviz else FreeViz()
        self.name = "freeviz Learner"

    @deprecated_keywords({"examples": "instances", "weightID": "weight_id"})
    def __call__(self, instances, weight_id=0):
        """Build and return a :class:`FreeVizClassifier` for *instances*."""
        return FreeVizClassifier(instances, self.freeviz)

FreeVizLearner = deprecated_members({"FreeViz": "freeviz"})(FreeVizLearner)

[8042] | 1168 | |

1169 | ||

class S2NHeuristicLearner(classification.Learner):
    """
    Learner that builds an :class:`S2NHeuristicClassifier` on given data,
    using the supplied (or a default) :class:`FreeViz` instance.

    If data instances are handed straight to the constructor, learning
    runs immediately and the resulting classifier is returned instead of
    the learner object.
    """

    def __new__(cls, freeviz=None, instances=None, weight_id=0, **argkw):
        learner = classification.Learner.__new__(cls, **argkw)
        if instances is None:
            # no data yet: behave like a plain constructor
            return learner
        # data supplied: initialize and train right away
        learner.__init__(freeviz, **argkw)
        return learner.__call__(instances, weight_id)

    def __init__(self, freeviz=None, **kwd):
        self.freeviz = freeviz if freeviz else FreeViz()
        self.name = "S2N Feature Selection Learner"

    @deprecated_keywords({"examples": "instances", "weightID": "weight_id"})
    def __call__(self, instances, weight_id=0):
        """Build and return an :class:`S2NHeuristicClassifier` for *instances*."""
        return S2NHeuristicClassifier(instances, self.freeviz)

S2NHeuristicLearner = deprecated_members({"FreeViz":
                                          "freeviz"})(S2NHeuristicLearner)

[10475] | 1196 | |

class Projector(object):
    """
    Stores a linear projection of data and uses it to transform any given data with matching input domain.
    """
    #: Domain of the data set that was used to construct principal component subspace.
    input_domain = None

    #: Domain used in returned data sets. This domain has a continuous
    #: variable for each axis in the projected space,
    #: plus the class variable(s) copied over from :obj:`input_domain`.
    output_domain = None

    #: Array containing means of each variable in the data set that was used
    #: to construct the projection.
    center = numpy.array(())

    #: An array containing standard deviations of each variable in the data
    #: set that was used to construct the projection.
    scale = numpy.array(())

    #: True, if standardization was used when constructing the projection. If
    #: set, instances will be standardized before being projected.
    standardize = True

    #: Array containing projection (vectors that describe the
    #: transformation from input to output domain).
    projection = numpy.array(()).reshape(0, 0)

    def __init__(self, **kwds):
        # all configuration (input_domain, projection, center, scale, ...)
        # arrives via keyword arguments and overrides the class-level defaults
        self.__dict__.update(kwds)

        # one continuous output feature per projection row; its value is
        # computed lazily through _ProjectSingleComponent
        features = []
        for i in range(len(self.projection)):
            f = feature.Continuous("Comp.%d" % (i + 1))
            f.get_value_from = _ProjectSingleComponent(self, f, i)
            features.append(f)

        self.output_domain = Orange.data.Domain(features,
                                                self.input_domain.class_var,
                                                class_vars=self.input_domain.class_vars)

    def __call__(self, dataset):
        """
        Project data.

        :param dataset: input data set
        :type dataset: :class:`Orange.data.Table`

        :rtype: :class:`Orange.data.Table`
        """
        # accept a single instance as well as a table
        if not isinstance(dataset, data.Table):
            dataset = data.Table([dataset])
        # convert into the training domain when the feature counts differ
        if len(self.projection.T) != len(dataset.domain.features):
            dataset = data.Table(self.input_domain, dataset)

        X, = dataset.to_numpy_MA("a")
        Xm, U = self.center, self.projection
        n, m = X.shape

        if m != len(self.projection.T):
            raise ValueError, "Invalid number of features"

        Xd = X - Xm

        if self.standardize:
            Xd /= self.scale

        # NOTE(review): the projected matrix is kept on the instance (self.A);
        # looks like a leftover debugging aid, preserved for compatibility
        self.A = numpy.dot(Xd, U.T)

        class_, classes = dataset.to_numpy_MA("c")[0], dataset.to_numpy_MA("m")[0]
        return data.Table(self.output_domain, numpy.hstack((self.A, class_, classes)))

1268 | ||

class _ProjectSingleComponent():
    """Callable used as ``get_value_from`` of one output feature: maps a
    single example onto one row (component) of the projection matrix."""

    def __init__(self, projector, feature, idx):
        self.projector = projector
        self.feature = feature
        self.idx = idx

    def __call__(self, example, return_what):
        proj = self.projector
        # convert the example into the projector's input domain when the
        # feature counts do not match
        if len(proj.projection.T) == len(example.domain.features):
            table = Orange.data.Table([example])
        else:
            table = Orange.data.Table(proj.input_domain, [example])
        row = table.to_numpy_MA("a")[0]
        row -= proj.center
        if proj.standardize:
            row /= proj.scale
        return self.feature(numpy.dot(proj.projection[self.idx, :], row.T)[0])

1285 | ||

[10475] | 1286 | |

# color table for biplot: matplotlib-style "<color><marker>" format strings
# (e.g. 'bo' = blue circles), presumably cycled over class values when plotting
Colors = ['bo', 'go', 'yo', 'co', 'mo']

[10475] | 1289 | |

[10612] | 1290 | class PCA(object): |

[10475] | 1291 | """ |

1292 | Orthogonal transformation of data into a set of uncorrelated variables called | |

1293 | principal components. This transformation is defined in such a way that the | |

1294 | first variable has as high variance as possible. | |

1295 | ||

1296 | :param standardize: perform standardization of the data set. | |

1297 | :type standardize: boolean | |

1298 | :param max_components: maximum number of retained components. | |

1299 | :type max_components: int | |

1300 | :param variance_covered: percent of the variance to cover with components. | |

1301 | :type variance_covered: float | |

1302 | :param use_generalized_eigenvectors: use generalized eigenvectors (ie. | |

1303 | multiply data matrix with inverse of its covariance matrix). | |

1304 | :type use_generalized_eigenvectors: boolean | |

1305 | """ | |

1306 | ||

[10645] | 1307 | #: Delta degrees of freedom used for numpy operations. |

1308 | #: 1 means normalization with (N-1) in cov and std operations | |

1309 | ddof = 1 | |

1310 | ||

[10490] | 1311 | def __new__(cls, dataset=None, **kwds): |

[10475] | 1312 | optimizer = object.__new__(cls) |

1313 | ||

[11459] | 1314 | if dataset is not None: |

[10612] | 1315 | optimizer.__init__(**kwds) |

[10475] | 1316 | return optimizer(dataset) |

1317 | else: | |

1318 | return optimizer | |

1319 | ||

[10490] | 1320 | def __init__(self, standardize=True, max_components=0, variance_covered=1, |

[10647] | 1321 | use_generalized_eigenvectors=0, ddof=1): |

[10475] | 1322 | self.standardize = standardize |

1323 | self.max_components = max_components | |

[10490] | 1324 | self.variance_covered = min(1, variance_covered) |

[10475] | 1325 | self.use_generalized_eigenvectors = use_generalized_eigenvectors |

[10647] | 1326 | self.ddof=ddof |

[10475] | 1327 | |

1328 | def __call__(self, dataset): | |

1329 | """ | |

1330 | Perform a PCA analysis on a data set and return a linear projector | |

1331 | that maps data into principal component subspace. | |

1332 | ||

1333 | :param dataset: input data set. | |

1334 | :type dataset: :class:`Orange.data.Table` | |

1335 | ||

1336 | :rtype: :class:`~Orange.projection.linear.PcaProjector` | |

1337 | """ | |

[10612] | 1338 | Xd, stdev, mean, relevant_features = self._normalize_data(dataset) |

[10475] | 1339 | |

1340 | #use generalized eigenvectors | |

1341 | if self.use_generalized_eigenvectors: | |

1342 | inv_covar = numpy.linalg.inv(numpy.dot(Xd.T, Xd)) | |

1343 | Xg = numpy.dot(inv_covar, Xd.T) | |

1344 | else: | |

1345 | Xg = Xd.T | |

1346 | ||

[10612] | 1347 | components, variances = self._perform_pca(dataset, Xd, Xg) |

1348 | components = self._insert_zeros_for_constant_features(len(dataset.domain.features), | |

1349 | components, | |

1350 | relevant_features) | |

1351 | ||

1352 | variances, components, variance_sum = self._select_components(variances, components) | |

1353 | ||

1354 | n, m = components.shape | |

1355 | ||

1356 | return PcaProjector(input_domain=dataset.domain, | |

[10645] | 1357 | center=mean, |

1358 | scale=stdev, | |

[10612] | 1359 | standardize=self.standardize, |

1360 | projection=components, | |

[10645] | 1361 | variances=variances, |

[10612] | 1362 | variance_sum=variance_sum) |

1363 | ||

1364 | def _normalize_data(self, dataset): | |

1365 | if not len(dataset) or not len(dataset.domain.features): | |

1366 | raise ValueError("Empty dataset") | |

[10644] | 1367 | X = dataset.to_numpy_MA("a")[0] |

[10612] | 1368 | |

1369 | Xm = numpy.mean(X, axis=0) | |

1370 | Xd = X - Xm | |

1371 | ||

1372 | if not Xd.any(): | |

1373 | raise ValueError("All features are constant") | |

1374 | ||

1375 | #take care of the constant features | |

[10645] | 1376 | stdev = numpy.std(Xd, axis=0, ddof=self.ddof) |

[10612] | 1377 | stdev[stdev == 0] = 1. # to prevent division by zero |

1378 | relevant_features = stdev != 0 | |

1379 | Xd = Xd[:, relevant_features] | |

1380 | if self.standardize: | |

1381 | Xd /= stdev[relevant_features] | |

1382 | return Xd, stdev, Xm, relevant_features | |

1383 | ||

1384 | def _perform_pca(self, dataset, Xd, Xg): | |

[10490] | 1385 | n, m = Xd.shape |

[10612] | 1386 | if n < m: |

[10715] | 1387 | _, D, U = numpy.linalg.svd(Xd, full_matrices=0) |

1388 | D *= D / (n - self.ddof) | |

[10612] | 1389 | else: |

[10645] | 1390 | C = numpy.dot(Xg, Xd) / (n - self.ddof) |

[10612] | 1391 | U, D, T = numpy.linalg.svd(C) |

1392 | U = U.T # eigenvectors are now in rows | |

1393 | return U, D | |

[10475] | 1394 | |

[10612] | 1395 | def _select_components(self, D, U): |

[10475] | 1396 | variance_sum = D.sum() |

1397 | #select eigen vectors | |

1398 | if self.variance_covered != 1: | |

[10490] | 1399 | nfeatures = numpy.searchsorted(numpy.cumsum(D) / variance_sum, |

1400 | self.variance_covered) + 1 | |

[10475] | 1401 | U = U[:nfeatures, :] |

1402 | D = D[:nfeatures] | |

1403 | if self.max_components > 0: | |

1404 | U = U[:self.max_components, :] | |

1405 | D = D[:self.max_components] | |

[10612] | 1406 | return D, U, variance_sum |

[10475] | 1407 | |

[10612] | 1408 | def _insert_zeros_for_constant_features(self, original_dimension, U, relevant_features): |

[10475] | 1409 | n, m = U.shape |

[10612] | 1410 | if m != original_dimension: |

1411 | U_ = numpy.zeros((n, original_dimension)) | |

1412 | U_[:, relevant_features] = U | |

1413 | U = U_ | |

1414 | return U | |

[10475] | 1415 | |

# Backwards-compatible alias for :class:`PCA`.
Pca = PCA

[10475] | 1417 | |

1418 | ||

class Spca(PCA):
    """Supervised PCA: identical to :class:`PCA` except that the scatter
    matrix is weighted by a class-based Laplacian, favouring directions
    that separate instances of different classes."""

    def _perform_pca(self, dataset, Xd, Xg):
        # Laplacian of the graph whose edges connect every pair of
        # instances belonging to *different* classes.
        classes = dataset.to_numpy("c")[0]
        laplacian = -numpy.array(
            numpy.hstack([(classes != value) for value in classes]),
            dtype='f')
        laplacian -= numpy.diagflat(numpy.sum(laplacian, axis=0))

        # Weight the transposed data by the Laplacian, then delegate to the
        # ordinary PCA computation (``Pca`` is an alias of ``PCA``).
        return PCA._perform_pca(self, dataset, Xd, numpy.dot(Xg, laplacian))

[10475] | 1429 | |

[10612] | 1430 | |

@deprecated_members({"pc_domain": "output_domain"})
class PcaProjector(Projector):
    #: Array containing variances of principal components.
    variances = numpy.array(())

    #: Sum of all variances in the data set that was used to construct the PCA space.
    variance_sum = 0.

    def __str__(self):
        """Return an R-style textual summary of the principal components
        (standard deviation, proportion of variance and cumulative
        variance per component).  When there are more than ten components,
        the middle ones are elided with "..."."""
        # Maximum number of components printed in full before eliding.
        ncomponents = 10
        s = self.variance_sum
        cs = numpy.cumsum(self.variances) / s
        stdev = numpy.sqrt(self.variances)
        return "\n".join([
            "PCA SUMMARY",
            "",
            "Std. deviation of components:",
            " ".join([" "] +
                     ["%10s" % a.name for a in self.output_domain.attributes]),
            " ".join(["Std. deviation"] +
                     ["%10.3f" % a for a in stdev]),
            " ".join(["Proportion Var"] +
                     ["%10.3f" % a for a in self.variances / s * 100]),
            " ".join(["Cumulative Var"] +
                     ["%10.3f" % a for a in cs * 100]),
            "",
        ]) if len(self.output_domain) <= ncomponents else\
        "\n".join([
            "PCA SUMMARY",
            "",
            "Std. deviation of components:",
            " ".join([" "] +
                     ["%10s" % a.name for a in self.output_domain.attributes[:ncomponents]] +
                     ["%10s" % "..."] +
                     ["%10s" % self.output_domain.attributes[-1].name]),
            " ".join(["Std. deviation"] +
                     ["%10.3f" % a for a in stdev[:ncomponents]] +
                     ["%10s" % ""] +
                     ["%10.3f" % stdev[-1]]),
            " ".join(["Proportion Var"] +
                     ["%10.3f" % a for a in self.variances[:ncomponents] / s * 100] +
                     ["%10s" % ""] +
                     ["%10.3f" % (self.variances[-1] / s * 100)]),
            " ".join(["Cumulative Var"] +
                     ["%10.3f" % a for a in cs[:ncomponents] * 100] +
                     ["%10s" % ""] +
                     ["%10.3f" % (cs[-1] * 100)]),
            "",
        ])


    def scree_plot(self, filename=None, title='Scree Plot'):
        """
        Draw a scree plot of principal components

        :param filename: Name of the file to which the plot will be saved.
                         If None, plot will be displayed instead.
        :type filename: str
        :param title: Plot title
        :type title: str
        """
        import pylab as plt

        s = self.variance_sum
        # Per-component variance share and its cumulative sum.
        vc = self.variances / s
        cs = numpy.cumsum(self.variances) / s

        fig = plt.figure()
        ax = fig.add_subplot(111)

        x_axis = range(len(self.variances))
        plt.grid(True)

        ax.set_xlabel('Principal Component Number')
        ax.set_ylabel('Proportion of Variance')
        ax.set_title(title + "\n")
        ax.plot(x_axis, vc, color="red")
        ax.scatter(x_axis, vc, color="red", label="Variance")

        ax.plot(x_axis, cs, color="orange")
        ax.scatter(x_axis, cs, color="orange", label="Cumulative Variance")
        ax.legend(loc=0)

        # Both curves are proportions, so clamp the y axis to [0, 1].
        ax.axis([-0.5, len(self.variances) - 0.5, 0, 1])

        if filename:
            plt.savefig(filename)
        else:
            plt.show()

    def biplot(self, filename=None, components=(0, 1), title='Biplot'):
        """
        Draw biplot for PCA. Actual projection must be performed via pca(data)
        before biplot can be used.

        :param filename: Name of the file to which the plot will be saved.
                         If None, plot will be displayed instead.
        :type filename: str
        :param components: List of two components to plot.
        :type components: list
        :param title: Plot title
        :type title: str
        """
        import pylab as plt

        if len(components) < 2:
            raise ValueError, 'Two components are needed for biplot'

        if not (0 <= min(components) <= max(components) < len(self.variances)):
            raise ValueError, 'Invalid components'

        # NOTE(review): self.A is presumably the matrix of projected
        # instances set when the projector was applied to data -- it is not
        # assigned anywhere in this class; confirm against the caller.
        X = self.A[:, components[0]]
        Y = self.A[:, components[1]]

        # NOTE(review): ``variances`` defaults to a 1-D array (see class
        # attribute above), so 2-D indexing here would raise IndexError;
        # this looks like it should index the loadings/projection matrix
        # instead -- confirm before relying on biplot().
        vectorsX = self.variances[:, components[0]]
        vectorsY = self.variances[:, components[1]]

        max_load_value = self.variances.max() * 1.5

        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.set_title(title + "\n")
        ax1.set_xlabel("PC%s (%d%%)" % (components[0], self.variances[components[0]] / self.variance_sum * 100))
        ax1.set_ylabel("PC%s (%d%%)" % (components[1], self.variances[components[1]] / self.variance_sum * 100))
        ax1.xaxis.set_label_position('bottom')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_label_position('left')
        ax1.yaxis.set_ticks_position('left')

        # NOTE(review): ``Colors`` is expected to be defined elsewhere in
        # this module -- it is not visible here.
        ax1.plot(X, Y, Colors[0])


        #eliminate double axis on right
        ax0 = ax1.twinx()
        ax0.yaxis.set_visible(False)

        # Secondary axes (top/right, in red) carry the loading vectors.
        ax2 = ax0.twiny()
        ax2.xaxis.set_label_position('top')
        ax2.xaxis.set_ticks_position('top')
        ax2.yaxis.set_label_position('right')
        ax2.yaxis.set_ticks_position('right')
        for tl in ax2.get_xticklabels():
            tl.set_color('r')
        for tl in ax2.get_yticklabels():
            tl.set_color('r')

        arrowprops = dict(facecolor='red', edgecolor='red', width=1, headwidth=4)

        for (x, y, a) in zip(vectorsX, vectorsY, self.input_domain.attributes):
            # Skip attributes whose loadings are too small to draw legibly.
            if max(x, y) < 0.1:
                continue
            # NOTE(review): leftover debug output? (Python 2 print statement)
            print x, y, a
            ax2.annotate('', (x, y), (0, 0), arrowprops=arrowprops)
            ax2.text(x * 1.1, y * 1.2, a.name, color='red')

        ax2.set_xlim(-max_load_value, max_load_value)
        ax2.set_ylim(-max_load_value, max_load_value)

        if filename:
            plt.savefig(filename)
        else:
            plt.show()

1593 | ||

1594 | ||

class Fda(object):
    """
    Construct a linear projection of data using FDA. When using this projection optimization method, data is always
    standardized prior to being projected.

    If data instances are provided to the constructor,
    the optimization algorithm is called and the resulting projector
    (:class:`~Orange.projection.linear.FdaProjector`) is
    returned instead of the optimizer (instance of this class).

    :rtype: :class:`~Orange.projection.linear.Fda` or
            :class:`~Orange.projection.linear.FdaProjector`
    """

    def __new__(cls, dataset=None):
        self = object.__new__(cls)
        if dataset is not None:
            # Convenience shortcut: Fda(data) returns the fitted projector
            # directly instead of the optimizer instance.
            self.__init__()
            return self.__call__(dataset)
        else:
            return self

    def __call__(self, dataset):
        """
        Perform a FDA analysis on a data set and return a linear projector
        that maps data into another vector space.

        :param dataset: input data set.
        :type dataset: :class:`Orange.data.Table`

        :rtype: :class:`~Orange.projection.linear.FdaProjector`
        """
        X, Y = dataset.to_numpy_MA("a/c")

        Xm = numpy.mean(X, axis=0)
        X -= Xm

        #take care of the constant features
        stdev = numpy.std(X, axis=0)
        # Number of features before the constant ones are filtered out;
        # needed to pad the eigenvector matrix back to full width below.
        original_dimension = len(stdev)
        relevant_features = stdev != 0
        stdev[stdev == 0] = 1.
        X /= stdev
        X = X[:, relevant_features]

        instances, features = X.shape
        class_count = len(set(Y))
        # special case when we have two classes
        if class_count == 2:
            data1 = MA.take(X, numpy.argwhere(Y == 0).flatten(), axis=0)
            data2 = MA.take(X, numpy.argwhere(Y != 0).flatten(), axis=0)
            # NOTE(review): axis=1 averages over features per instance; the
            # classic FDA mean difference would use axis=0 -- confirm.
            miDiff = MA.average(data1, axis=1) - MA.average(data2, axis=1)
            covMatrix = (MA.dot(data1.T, data1) + MA.dot(data2.T, data2)) / instances
            # NOTE(review): '*' is element-wise here; classic FDA computes
            # the matrix product inv(cov) . miDiff -- confirm intent.
            U = numpy.linalg.inv(covMatrix) * miDiff
            D = numpy.array([1])
        else:
            # compute means and average covariances of examples in each class group
            Sw = MA.zeros([features, features])
            for v in set(Y):
                d = MA.take(X, numpy.argwhere(Y == v).flatten(), axis=0)
                d -= numpy.mean(d, axis=0)
                Sw += MA.dot(d.T, d)
            Sw /= instances
            total = MA.dot(X.T, X) / float(instances)
            Sb = total - Sw

            # NOTE(review): element-wise product again; generalized FDA
            # solves inv(Sw) . Sb -- confirm intent.
            matrix = numpy.linalg.inv(Sw) * Sb
            D, U = numpy.linalg.eigh(matrix)

        # Order eigenvectors (columns of U) by decreasing eigenvalue.
        sorted_indices = [i for _, i in sorted([(ev, i)
                                                for i, ev in enumerate(D)], reverse=True)]
        U = numpy.take(U, sorted_indices, axis=1)
        D = numpy.take(D, sorted_indices)

        #insert zeros for constant features
        n, m = U.shape
        # Fix: this comparison previously used the undefined name ``M``,
        # raising a NameError whenever constant features had been removed.
        if m != original_dimension:
            U_ = numpy.zeros((n, original_dimension))
            U_[:, relevant_features] = U
            U = U_

        out_domain = data.Domain([feature.Continuous("Comp.%d" %
                                                     (i + 1)) for
                                  i in range(len(D))], False)

        return FdaProjector(input_domain=dataset.domain,
                            output_domain=out_domain,
                            center=Xm,
                            scale=stdev,
                            standardize=True,
                            eigen_vectors=U,
                            projection=U,
                            eigen_values=D)

1687 | ||

[10475] | 1688 | |

class FdaProjector(Projector):
    """
    Linear projector constructed by :class:`~Orange.projection.linear.Fda`.

    .. attribute:: eigen_vectors

        Synonymous for :obj:`~Orange.projection.linear.Projector.projection`.

    .. attribute:: eigen_values

        Array containing eigenvalues corresponding to eigenvectors.

    """

1700 | ||

1701 | ||

@deprecated_keywords({"dataMatrix": "data_matrix",
                      "classArray": "class_array",
                      "NComps": "ncomps",
                      "useGeneralizedEigenvectors": "use_generalized_eigenvectors"})
def create_pca_projection(data_matrix, class_array=None, ncomps=-1, use_generalized_eigenvectors=True):
    """
    Deprecated: compute a (supervised) PCA projection of ``data_matrix``.

    :param data_matrix: data matrix with one *column* per instance (it is
        transposed internally and centered in place).
    :param class_array: optional array of class values; when given, the
        projection is weighted by a class-based Laplacian (supervised PCA).
    :param ncomps: number of components to return; -1 means all.
    :param use_generalized_eigenvectors: whether to multiply by the inverse
        of the covariance matrix (generalized eigenproblem).

    :return: tuple (eigenvalues, eigenvectors-in-rows), or (None, None)
        when the covariance matrix is singular or the eigenvalues come
        out complex.
    """
    import warnings

    warnings.warn("Deprecated in favour of Orange"
                  ".projection.linear.Pca.",
                  DeprecationWarning)
    if isinstance(data_matrix, numpy.ma.core.MaskedArray):
        data_matrix = numpy.array(data_matrix)
    if isinstance(class_array, numpy.ma.core.MaskedArray):
        class_array = numpy.array(class_array)

    data_matrix = numpy.transpose(data_matrix)

    s = numpy.sum(data_matrix, axis=0) / float(len(data_matrix))
    data_matrix -= s  # subtract average value to get zero mean

    if class_array is not None and use_generalized_eigenvectors:
        covarMatrix = numpy.dot(numpy.transpose(data_matrix), data_matrix)
        try:
            matrix = numpy.linalg.inv(covarMatrix)
        except numpy.linalg.LinAlgError:
            # singular covariance matrix
            return None, None
        matrix = numpy.dot(matrix, numpy.transpose(data_matrix))
    else:
        matrix = numpy.transpose(data_matrix)

    # compute dataMatrixT * L * dataMatrix
    if class_array is not None:
        # Define the Laplacian matrix: -1 for every pair of instances with
        # different classes, 0 otherwise.  (Vectorized replacement for the
        # previous element-by-element Python double loop; same values.)
        class_array = numpy.asarray(class_array)
        l = -numpy.array(class_array[:, numpy.newaxis] !=
                         class_array[numpy.newaxis, :], dtype=float)
        # The diagonal of l is zero here, so each diagonal entry becomes
        # minus its column sum; the matrix is symmetrical, so the axis
        # does not matter.
        l -= numpy.diagflat(numpy.sum(l, axis=0))

        matrix = numpy.dot(matrix, l)

    matrix = numpy.dot(matrix, data_matrix)

    vals, vectors = numpy.linalg.eig(matrix)
    if vals.dtype.kind == "c":  # if eigenvalues are complex numbers then do nothing
        return None, None
    vals = list(vals)

    if ncomps == -1:
        ncomps = len(vals)
    ncomps = min(ncomps, len(vals))

    # Greedily pick the ncomps largest eigenvalues and their indices.
    ret_vals = []
    ret_indices = []
    for i in range(ncomps):
        ret_vals.append(max(vals))
        bestind = vals.index(max(vals))
        ret_indices.append(bestind)
        vals[bestind] = -1

    return ret_vals, numpy.take(vectors.T, ret_indices,
                                axis=0)  # i-th eigenvector is the i-th column in vectors so we have to transpose the array

createPCAProjection = create_pca_projection

**Note:** See TracBrowser for help on using the repository browser.