#
source:
orange/Orange/orng/orngPCA.py
@
11590:a14e130116b1

Revision 11590:a14e130116b1, 17.3 KB checked in by astaric, 11 months ago (diff) |
---|

Rev | Line | |
---|---|---|

[8042] | 1 | ''' |

2 | Created on Aug 16, 2009 | |

3 | ||

4 | @author: Nejc Skofic | |

5 | @contact: nejc.skofic@gmail.com | |

6 | ''' | |

7 | ||

8 | import orange, numpy | |

9 | from numpy import sqrt, abs, dot, transpose | |

10 | from numpy.linalg import eig, inv | |

11 | ||

# Color table for the biplot: matplotlib format strings
# (blue/green/yellow/cyan/magenta circle markers), cycled per class value.
Colors = ['bo', 'go', 'yo', 'co', 'mo']

[8042] | 14 | |

def defaultImputer(dataset):
    """Construct the default imputer (average-value imputation) for *dataset*."""
    constructor = orange.ImputerConstructor_average
    return constructor(dataset)

18 | ||

def defaultContinuizer(dataset):
    """Build and apply the default domain continuizer:

    - multinomial -> as normalized ordinal
    - class -> ignore
    - continuous -> leave

    """
    dc = orange.DomainContinuizer()
    dc.multinomialTreatment = dc.AsNormalizedOrdinal
    dc.classTreatment = dc.Ignore
    dc.continuousTreatment = dc.Leave
    return dc(dataset)

32 | ||

def centerData(dataMatrix):
    """Center each column of *dataMatrix* on its mean.

    Returns the tuple ``(center, centered)`` where ``center`` is the
    per-column mean (a 1-D array) and ``centered`` is the matrix with
    that mean subtracted from every row.
    """
    # numpy.mean is the idiomatic (and numerically equivalent) form of
    # numpy.sum(...) / float(len(...)) used previously.
    center = numpy.mean(dataMatrix, axis=0)
    return center, (dataMatrix - center)

37 | ||

def standardizeData(dataMatrix):
    """Standardize each column of *dataMatrix* by its standard deviation.

    Returns the tuple ``(scale, standardized)`` where ``scale`` is the
    per-column population standard deviation (ddof=0).

    Raises orange.KernelException if any column is constant (zero
    deviation), since such a column cannot be standardized.
    """
    scale = numpy.std(dataMatrix, axis=0)
    # Explicit zero test instead of float membership (`0. in scale`);
    # call-style raise is valid in both Python 2 and 3.
    if numpy.any(scale == 0.):
        raise orange.KernelException("Constant variable, cannot standardize!")
    return scale, dataMatrix * 1. / scale

[8042] | 45 | |

class PCA(object):
    """Class that creates a PCA projection.

    Constructor parameters
    ----------------------
    dataset : orange.ExampleTable object.
        If not present (default), constructor will just set all parameters
        needed for projection. Actual projection can then be performed with
        *pca(dataset)*
    attributes : list-like object containing names of attributes retained in
        PCA. If None (default) all attributes are retained
    rows : list-like object containing 0s and 1s for which rows should be
        retained. If None (default) all rows are retained
    standardize : True or False (default). If True performs standardization of
        dataset
    imputer : orange.Imputer object. Defines how data is imputed if values are
        missing. Must NOT be trained. Default is average imputation
    continuizer : orange.Continuizer object. Defines how data is continuized if
        there are discrete attributes. Default:

        - multinomial -> as normalized ordinal
        - class -> ignore
        - continuous -> leave

    maxNumberOfComponents : How many components should be retained. Default is
        10; if -1 all components will be retained
    varianceCovered : How much variance of data should be covered. Default is 0.95
    useGeneralizedVectors : True or False (default). If True, generalized
        eigenvectors are used

    Returns
    -------
    PCA object if dataset was None

    PCAClassifier object if projection was successful or None if projection has failed
    """

    def __new__(cls, dataset=None, attributes=None, rows=None, standardize=0,
                imputer=defaultImputer, continuizer=defaultContinuizer,
                maxNumberOfComponents=10, varianceCovered=0.95,
                useGeneralizedVectors=0):
        # Learner/classifier convention: with a dataset the instance is
        # trained immediately and the resulting PCAClassifier (or None on
        # failure) is returned instead of the PCA object itself.
        learner = object.__new__(cls)

        if dataset:
            learner.__init__(attributes, rows, standardize, imputer, continuizer,
                             maxNumberOfComponents, varianceCovered, useGeneralizedVectors)
            return learner(dataset)
        else:
            return learner

    # decide what should be immutable and what not
    # immutable for sure: imputer and continuizer
    def __init__(self, attributes=None, rows=None, standardize=0,
                 imputer=defaultImputer, continuizer=defaultContinuizer,
                 maxNumberOfComponents=10, varianceCovered=0.95,
                 useGeneralizedVectors=0):

        self.attributes = attributes
        self.rows = rows
        self.standardize = standardize
        self.imputer = imputer
        self.continuizer = continuizer
        self.maxNumberOfComponents = maxNumberOfComponents
        self.varianceCovered = varianceCovered
        self.useGeneralizedVectors = useGeneralizedVectors

    def __call__(self, dataset):
        """Train on *dataset* and return a PCAClassifier, or None on failure."""
        # Modify dataset
        dataset = self._datasetPreprocessing(dataset)
        dataMatrix, classArray, center, deviation = self._dataMatrixPreprocessing(dataset)

        # Perform projection
        evalues, evectors = self._createPCAProjection(dataMatrix, classArray)

        # The projection signals failure by returning (None, None); both are
        # None together, so testing one avoids comparing numpy arrays with ==
        # (which is ambiguous / raises on modern numpy).
        if evalues is None:
            print("Principal component could not be performed (complex eigenvalues or singular matrix if generalized eigenvectors were used)")
            return None

        # return PCAClassifier
        return PCAClassifier(domain=self.attributes,
                             imputer=self.imputer,
                             continuizer=self.continuizer,
                             center=center,
                             deviation=deviation,
                             evalues=evalues,
                             loadings=evectors)

    def _datasetPreprocessing(self, dataset):
        """
        First remove unwanted attributes, save domain (so that PCA remembers on
        what kind of dataset it was trained), remove unwanted rows and impute
        values and continuize.
        """

        if self.attributes:
            dataset = dataset.select(self.attributes)

        # we need to retain only selected attributes without class attribute
        # NOTE: this overwrites self.attributes with the attribute names of
        # the training domain so the classifier can later re-select them.
        self.attributes = [att.name for att in dataset.domain.attributes]

        # imputer must be trained before rows are filtered out
        imputer = self.imputer(dataset)

        if self.rows:
            dataset = dataset.select(self.rows)

        dataset = imputer(dataset)
        domain = self.continuizer(dataset)
        dataset = dataset.translate(domain)

        return dataset

    def _dataMatrixPreprocessing(self, dataset):
        """
        Creates numpy arrays, centers dataMatrix and standardizes it if that
        option was selected, and returns dataMatrix, classArray, center and
        deviation (deviation is None when standardization is off).
        """

        dataMatrix, classArray, x = dataset.toNumpy()

        center, dataMatrix = centerData(dataMatrix)

        deviation = None
        if self.standardize:
            deviation, dataMatrix = standardizeData(dataMatrix)

        return dataMatrix, classArray, center, deviation

    def _createPCAProjection(self, dataMatrix, classArray):
        """
        L -> Laplacian weight matrix constructed from classArray or identity if
        classArray is None
        M -> dataMatrix

        Normal method: t(M) * L * M * x = lambda * x
        Snapshot method: M * t(M) * L * M * x = lambda * M * x
        Generalized vectors: (t(M) * M)^(-1) * t(M) * L * M * x = lambda * x
        Snapshot with generalized vectors: M * (t(M) * M)^(-1) * t(M) * L * M * x = lambda * M * x

        Returns (evalues, evectors), or (None, None) when eigenvalues are
        complex or the generalized-vector matrix is singular.
        """

        n, d = numpy.shape(dataMatrix)

        # identity check: classArray is a numpy array or None; `!= None`
        # would compare element-wise.
        if classArray is not None:
            # Build the graph-Laplacian weight matrix: -1 between examples of
            # different classes, row sums on the diagonal.
            L = numpy.zeros((len(dataMatrix), len(dataMatrix)))
            for i in range(len(dataMatrix)):
                for j in range(i + 1, len(dataMatrix)):
                    L[i, j] = -int(classArray[i] != classArray[j])
                    L[j, i] = -int(classArray[i] != classArray[j])

            s = numpy.sum(L, axis=0) # doesn't matter which axis since the matrix L is symmetrical
            for i in range(len(dataMatrix)):
                L[i, i] = -s[i]

            matrix = dot(transpose(dataMatrix), L)
        else:
            matrix = transpose(dataMatrix)

        if self.useGeneralizedVectors:
            temp_matrix = dot(transpose(dataMatrix), dataMatrix)
            try:
                temp_matrix = inv(temp_matrix)
            except numpy.linalg.LinAlgError:
                # singular t(M) * M -> generalized vectors not computable
                return None, None
            matrix = dot(temp_matrix, matrix)

        if n < d:
            # snapshot method: eigendecompose the smaller n x n matrix and
            # map the eigenvectors back to attribute space afterwards
            covMatrix = dot(dataMatrix, matrix)
            trace = numpy.trace(covMatrix)
            scale = n / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]
            evectors = transpose(1. / sqrt(evalues)) * transpose(dot(evectors, dataMatrix))
        else:
            covMatrix = dot(matrix, dataMatrix)
            trace = numpy.trace(covMatrix)
            scale = d / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]

        # sort by decreasing eigenvalue and trim to the requested number of
        # components / covered variance
        order = (numpy.argsort(evalues)[::-1])
        N = len(evalues)
        maxComp = self.maxNumberOfComponents
        variance = self.varianceCovered

        if maxComp == -1:
            maxComp = N
        maxComp = min(maxComp, N)

        order = order[:maxComp]
        evalues = numpy.take(evalues, order)

        # evalues are scaled -> value > 1 : explains more than previous attribute
        #                    -> value < 1 : explains less than previous attribute
        for i in range(len(order)):
            variance -= evalues[i] / trace
            if variance < 0:
                return evalues[: i + 1] * scale, numpy.take(evectors, order[: i + 1], 1)

        return evalues * scale, numpy.take(evectors, order, 1)

[8042] | 258 | |

class PCAClassifier(object):
    """Applies a trained PCA projection to new data.

    Instances are produced by PCA(); calling one with an ExampleTable
    returns a new table of principal-component scores (plus the class
    column, if present). The last projected data is cached for biplot().
    """

    def __init__(self, domain, imputer, continuizer, center, deviation, evalues, loadings):
        # data checking and modifying
        self.domain = domain          # attribute names used at training time
        self.imputer = imputer
        self.continuizer = continuizer
        # PCA properties
        self.center = center          # per-column mean subtracted before projection
        self.deviation = deviation    # per-column std, or None if not standardized
        self.evalues = evalues
        self.loadings = loadings

        # last prediction performed -> used for biplot
        self._dataMatrix = None
        self._classArray = None

    def __call__(self, dataset):
        """Project *dataset* onto the principal components.

        Returns a new ExampleTable with attributes PC1..PCk (and the
        original class column appended when the dataset has one).
        Raises orange.KernelException when training attributes are missing.
        """
        try:
            # retain class attribute
            attrDataset = dataset.select(self.domain)
            imputer = self.imputer(attrDataset)
            attrDataset = imputer(attrDataset)
            domain = self.continuizer(attrDataset)
            attrDataset = attrDataset.translate(domain)
        except TypeError:
            raise orange.KernelException("One or more attributes from training set are missing!")

        dataMatrix, classArray, x = attrDataset.toNumpy()

        dataMatrix -= self.center
        # identity check: deviation is a numpy array or None; `!= None`
        # would compare element-wise.
        if self.deviation is not None:
            dataMatrix *= 1. / self.deviation

        # save transformed data
        self._dataMatrix = numpy.dot(dataMatrix, self.loadings)

        attributes = [orange.FloatVariable("PC%d" % (i + 1,)) for i in range(len(self.evalues))]
        new_domain = orange.Domain(attributes)
        new_table = orange.ExampleTable(new_domain, self._dataMatrix)

        if dataset.domain.classVar:
            # suboptimal
            classTable = dataset.select([dataset.domain.classVar.name])
            self._classArray = numpy.array([row.getclass() for row in classTable])
            new_table = orange.ExampleTable([new_table, classTable])

        return new_table

    def __str__(self):
        """Return a textual summary: center, deviation and component importance."""
        n, d = numpy.shape(self.loadings)
        comulative = 0.0

        summary = "PCA SUMMARY\n\nCenter:\n\n"
        summary += " %15s " * len(self.domain) % tuple(self.domain) + "\n"
        summary += " %15.4f " * len(self.center) % tuple(self.center) + "\n\n"

        if self.deviation is not None:
            summary += "Deviation:\n\n"
            summary += " %15s " * len(self.domain) % tuple(self.domain) + "\n"
            summary += " %15.4f " * len(self.deviation) % tuple(self.deviation) + "\n\n"

        summary += "Importance of components:\n\n %12s %12s %12s\n" % ("eigenvalues", "proportion", "cumulative")
        for evalue in self.evalues:
            comulative += evalue / n
            summary += " %12.4f %12.4f %12.4f\n" % (evalue, evalue / n, comulative)

        return summary

    ################ Plotting functions ###################

    def plot(self, title='Scree plot', filename='scree_plot.png'):
        """
        Draws a scree plot. Matplotlib is needed.

        Parameters
        ----------
        title : Title of the plot
        filename : File name under which plot will be saved (default: scree_plot.png)
            If None, plot will be shown
        """
        import pylab as plt
        fig = plt.figure()
        ax = fig.add_subplot(111)

        x_axis = range(len(self.evalues))
        x_labels = ["PC%d" % (i + 1,) for i in x_axis]

        ax.set_xticks(x_axis)
        ax.set_xticklabels(x_labels)
        ax.set_xlabel('Principal components')
        ax.set_ylabel('Variance')
        ax.set_title(title + "\n")
        ax.bar(left=x_axis, height=self.evalues, align='center')
        ax.axis([-0.5, len(self.evalues) - 0.5, 0, self.evalues[0] * 1.05])
        if filename:
            plt.savefig(filename)
        else:
            plt.show()

    def biplot(self, choices=[1, 2], scale=1., title='Biplot',
               filename='biplot.png'):
        """
        Draws biplot for PCA. Matplotlib is needed. Actual projection must be
        performed via pca(data) before biplot can be used.

        Parameters
        ----------
        choices : length 2 list-like object for choosing which 2 components
            should be used in biplot. Default is first and second
        scale : scale factor (default is 1.). Should be inside [0, 1]
        title : title of biplot
        filename : File name under which plot will be saved (default: biplot.png)
            If None, plot will be shown
        """
        import pylab as plt

        # identity checks: cached arrays are numpy arrays or None
        if self._dataMatrix is None:
            raise orange.KernelException("No data available for biplot!")

        if len(choices) != 2:
            raise orange.KernelException('You have to choose exactly two components')

        if max(choices[0], choices[1]) > len(self.evalues) or min(choices[0], choices[1]) < 1:
            raise orange.KernelException('Invalid choices')

        # boolean mask selecting the two requested (1-based) components
        choice = numpy.array([i == choices[0] - 1 or i == choices[1] - 1 for i in range(len(self.evalues))])

        dataMatrix = self._dataMatrix[:, choice]
        loadings = self.loadings[:, choice]
        lam = (numpy.array(self.evalues)[choice])
        lam *= sqrt(len(self._dataMatrix))

        if scale < 0. or scale > 1.:
            print("Warning: 'scale' is outside [0, 1]")
        lam = lam ** scale

        # TO DO -> pc.biplot (maybe)
        # scores are shrunk and loadings stretched by lam (R biplot convention)
        trDataMatrix = dataMatrix / lam
        trLoadings = loadings * lam

        max_data_value = numpy.max(abs(trDataMatrix)) * 1.05
        max_load_value = numpy.max(abs(trLoadings)) * 1.5

        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.set_title(title + "\n")
        ax1.set_xlabel("PC%s" % (choices[0]))
        ax1.set_ylabel("PC%s" % (choices[1]))
        ax1.xaxis.set_label_position('bottom')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_label_position('left')
        ax1.yaxis.set_ticks_position('left')

        if self._classArray is None:
            trDataMatrix = transpose(trDataMatrix)
            ax1.plot(trDataMatrix[0], trDataMatrix[1], Colors[0])
        else:
            # suboptimal: one scatter per distinct class value, colors cycled
            classValues = []
            for classValue in self._classArray:
                if classValue not in classValues:
                    classValues.append(classValue)
            for i in range(len(classValues)):
                choice = numpy.array([classValues[i] == cv for cv in self._classArray])
                partialDataMatrix = transpose(trDataMatrix[choice])
                ax1.plot(partialDataMatrix[0], partialDataMatrix[1],
                         Colors[i % len(Colors)], label=str(classValues[i]))
            ax1.legend()

        ax1.set_xlim(-max_data_value, max_data_value)
        ax1.set_ylim(-max_data_value, max_data_value)

        # eliminate double axis on right
        ax0 = ax1.twinx()
        ax0.yaxis.set_visible(False)

        ax2 = ax0.twiny()
        ax2.xaxis.set_label_position('top')
        ax2.xaxis.set_ticks_position('top')
        ax2.yaxis.set_label_position('right')
        ax2.yaxis.set_ticks_position('right')
        for tl in ax2.get_xticklabels():
            tl.set_color('r')
        for tl in ax2.get_yticklabels():
            tl.set_color('r')

        arrowprops = dict(facecolor='red', edgecolor='red', width=1, headwidth=4)
        # using annotations instead of arrows because there is a strange implementation
        # of arrows in matplotlib version 0.99
        for i in range(len(trLoadings)):
            x, y = trLoadings[i]
            ax2.annotate('', (x, y), (0, 0), arrowprops=arrowprops)
            ax2.text(x * 1.1, y * 1.2, self.domain[i], color='red')

        ax2.set_xlim(-max_load_value, max_load_value)
        ax2.set_ylim(-max_load_value, max_load_value)

        if filename:
            plt.savefig(filename)
        else:
            plt.show()

**Note:**See TracBrowser for help on using the repository browser.