Changeset 9674:77bc4cbb4407 in orange for orange/orngPCA.py
 Timestamp:
 02/06/12 10:44:03 (2 years ago)
 Branch:
 default
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

orange/orngPCA.py
r9282 r9674 17 17 warnings.warn("Importing of matplotlib has failed.\nPlotting functions will be unavailable.") 18 18 mathlib_import = False 19 19 20 20 #color table for biplot 21 Colors = ['bo', 'go','yo','co','mo']21 Colors = ['bo', 'go', 'yo', 'co', 'mo'] 22 22 23 23 def defaultImputer(dataset): … … 41 41 def centerData(dataMatrix): 42 42 """Perfomrs centering od data along rows, returns center and centered data.""" 43 center = numpy.sum(dataMatrix, axis = 0)/float(len(dataMatrix))43 center = numpy.sum(dataMatrix, axis=0) / float(len(dataMatrix)) 44 44 return center, (dataMatrix  center) 45 45 … … 47 47 """Performs standardization of data along rows. Throws error if constant 48 48 variable is present.""" 49 scale = numpy.std(dataMatrix, axis =0)49 scale = numpy.std(dataMatrix, axis=0) 50 50 if 0. in scale: 51 51 raise orange.KernelException, "Constant variable, cannot standardize!" 52 return scale, dataMatrix * 1. /scale52 return scale, dataMatrix * 1. / scale 53 53 54 54 class PCA(object): … … 88 88 PCAClassifier object if projection was successful or None if projection has failed 89 89 """ 90 91 def __new__(cls, dataset = None, attributes = None, rows = None, standardize =0,92 imputer = defaultImputer, continuizer =defaultContinuizer,93 maxNumberOfComponents = 10, varianceCovered =0.95,94 useGeneralizedVectors =0):95 90 91 def __new__(cls, dataset=None, attributes=None, rows=None, standardize=0, 92 imputer=defaultImputer, continuizer=defaultContinuizer, 93 maxNumberOfComponents=10, varianceCovered=0.95, 94 useGeneralizedVectors=0): 95 96 96 learner = object.__new__(cls) 97 97 98 98 if dataset: 99 99 learner.__init__(attributes, rows, standardize, imputer, continuizer, … … 102 102 else: 103 103 return learner 104 104 105 105 #decide what should be inmutable and what not 106 106 #imutable for sure: imputer and continuizer 107 def __init__(self, attributes = None, rows = None, standardize =0,108 imputer = defaultImputer, continuizer =defaultContinuizer,109 maxNumberOfComponents = 10, varianceCovered =0.95,110 useGeneralizedVectors =0):111 107 def __init__(self, attributes=None, rows=None, standardize=0, 108 imputer=defaultImputer, continuizer=defaultContinuizer, 109 maxNumberOfComponents=10, varianceCovered=0.95, 110 useGeneralizedVectors=0): 111 112 112 self.attributes = attributes 113 113 self.rows = rows … … 118 118 self.varianceCovered = varianceCovered 119 119 self.useGeneralizedVectors = useGeneralizedVectors 120 120 121 121 def __call__(self, dataset): 122 122 123 123 #Modify dataset 124 124 dataset = self._datasetPreprocessing(dataset) 125 125 dataMatrix, classArray, center, deviation = self._dataMatrixPreprocessing(dataset) 126 126 127 127 #Perform projection 128 128 evalues, evectors = self._createPCAProjection(dataMatrix, classArray) 129 129 130 130 #check if return status is None, None 131 131 if (evalues, evectors) == (None, None): 132 132 print "Principal component could not be performed (complex eigenvalues or singular matrix if generalized eigenvectors were used)" 133 133 return None 134 134 135 135 #return PCAClassifier 136 return PCAClassifier(domain =self.attributes,137 imputer =self.imputer,138 continuizer =self.continuizer,139 center =center,140 deviation =deviation,141 evalues =evalues,142 loadings =evectors)136 return PCAClassifier(domain=self.attributes, 137 imputer=self.imputer, 138 continuizer=self.continuizer, 139 center=center, 140 deviation=deviation, 141 evalues=evalues, 142 loadings=evectors) 143 143 144 144 def _datasetPreprocessing(self, dataset): … … 151 151 if self.attributes: 152 152 dataset = dataset.select(self.attributes) 153 153 154 154 #we need to retain only selected attributes without class attribute 155 155 self.attributes = [att.name for att in dataset.domain.attributes] 156 156 157 157 imputer = self.imputer(dataset) 158 158 159 159 if self.rows: 160 160 dataset = dataset.select(self.rows) … … 165 165 dataset = dataset.translate(domain) 166 166 167 return dataset 167 return dataset 168 168 169 169 def _dataMatrixPreprocessing(self, dataset): … … 175 175 176 176 dataMatrix, classArray, x = dataset.toNumpy() 177 177 178 178 center, dataMatrix = centerData(dataMatrix) 179 179 180 180 deviation = None 181 181 if self.standardize: … … 183 183 184 184 return dataMatrix, classArray, center, deviation 185 185 186 186 def _createPCAProjection(self, dataMatrix, classArray): 187 187 """ … … 195 195 Snapshot with generalized vectors: M * (t(M) * M)^(1) * t(M) * L * M * x = lambda * M * x 196 196 """ 197 197 198 198 n, d = numpy.shape(dataMatrix) 199 199 200 200 if classArray != None: 201 201 L = numpy.zeros((len(dataMatrix), len(dataMatrix))) 202 202 for i in range(len(dataMatrix)): 203 for j in range(i +1, len(dataMatrix)):204 L[i, j] = int(classArray[i] != classArray[j])205 L[j, i] = int(classArray[i] != classArray[j])203 for j in range(i + 1, len(dataMatrix)): 204 L[i, j] = int(classArray[i] != classArray[j]) 205 L[j, i] = int(classArray[i] != classArray[j]) 206 206 207 207 s = numpy.sum(L, axis=0) # doesn't matter which axis since the matrix L is symmetrical 208 208 for i in range(len(dataMatrix)): 209 L[i, i] = s[i]210 209 L[i, i] = s[i] 210 211 211 matrix = dot(transpose(dataMatrix), L) 212 212 else: 213 213 matrix = transpose(dataMatrix) 214 214 215 215 if self.useGeneralizedVectors: 216 216 temp_matrix = dot(transpose(dataMatrix), dataMatrix) … … 220 220 return None, None 221 221 matrix = dot(temp_matrix, matrix) 222 222 223 223 if n < d: 224 224 #snapshot method … … 232 232 evalues = evalues[positiveArray] 233 233 evectors = evectors[positiveArray] 234 evectors = transpose(1. /sqrt(evalues)) * transpose(dot(evectors, dataMatrix))234 evectors = transpose(1. / sqrt(evalues)) * transpose(dot(evectors, dataMatrix)) 235 235 else: 236 236 covMatrix = dot(matrix, dataMatrix) … … 243 243 evalues = evalues[positiveArray] 244 244 evectors = evectors[positiveArray] 245 245 246 246 order = (numpy.argsort(evalues)[::1]) 247 247 N = len(evalues) 248 248 maxComp = self.maxNumberOfComponents 249 249 variance = self.varianceCovered 250 250 251 251 if maxComp == 1: 252 252 maxComp = N … … 259 259 # > value < 1 : explains less than previous attribute 260 260 for i in range(len(order)): 261 variance = evalues[i] / trace 261 variance = evalues[i] / trace 262 262 if variance < 0: 263 263 return evalues[: i + 1] * scale, numpy.take(evectors, order[: i + 1], 1) 264 265 return evalues * scale, numpy.take(evectors, order, 1) 264 265 return evalues * scale, numpy.take(evectors, order, 1) 266 266 267 267 class PCAClassifier(object): 268 268 269 269 def __init__(self, domain, imputer, continuizer, center, deviation, evalues, loadings): 270 270 #data checking and modifying … … 277 277 self.evalues = evalues 278 278 self.loadings = loadings 279 279 280 280 #last predicition performed > used for biplot 281 281 self._dataMatrix = None 282 282 self._classArray = None 283 283 284 284 def __call__(self, dataset): 285 285 286 286 try: 287 287 #retain class attribute … … 298 298 dataMatrix = self.center 299 299 if self.deviation != None: 300 dataMatrix *= 1. /self.deviation301 300 dataMatrix *= 1. / self.deviation 301 302 302 #save transformed data 303 303 self._dataMatrix = numpy.dot(dataMatrix, self.loadings) 304 304 305 attributes = [orange.FloatVariable("PC%d" % (i + 1, 305 attributes = [orange.FloatVariable("PC%d" % (i + 1,)) for i in range(len(self.evalues))] 306 306 new_domain = orange.Domain(attributes) 307 307 new_table = orange.ExampleTable(new_domain, self._dataMatrix) … … 312 312 self._classArray = numpy.array([row.getclass() for row in classTable]) 313 313 new_table = orange.ExampleTable([new_table, classTable]) 314 314 315 315 return new_table 316 316 317 317 def __str__(self): 318 318 319 319 n, d = numpy.shape(self.loadings) 320 320 comulative = 0.0 321 321 322 322 summary = "PCA SUMMARY\n\nCenter:\n\n" 323 323 summary += " %15s " * len(self.domain) % tuple(self.domain) + "\n" 324 324 summary += " %15.4f " * len(self.center) % tuple(self.center) + "\n\n" 325 325 326 326 if self.deviation != None: 327 327 summary += "Deviation:\n\n" 328 328 summary += " %15s " * len(self.domain) % tuple(self.domain) + "\n" 329 329 summary += " %15.4f " * len(self.deviation) % tuple(self.deviation) + "\n\n" 330 330 331 331 summary += "Importance of components:\n\n %12s %12s %12s\n" % ("eigenvalues", "proportion", "cumulative") 332 332 for evalue in self.evalues: 333 comulative += evalue /n334 summary += " %12.4f %12.4f %12.4f\n" % (evalue, evalue /n, comulative)335 336 summary += "\nLoadings:\n\n"337 summary += " PC%d" * d % tuple(range(1, d + 1)) + "\n"338 for attr_num in range(len(self.domain)):339 summary += " % 8.4f" * d % tuple(self.loadings[attr_num])340 summary += " %30s\n" % self.domain[attr_num]333 comulative += evalue / n 334 summary += " %12.4f %12.4f %12.4f\n" % (evalue, evalue / n, comulative) 335 336 #summary += "\nLoadings:\n\n" 337 #summary += " PC%d" * d % tuple(range(1, d + 1)) + "\n" 338 #for attr_num in range(len(self.domain)): 339 # summary += " % 8.4f" * d % tuple(self.loadings[attr_num]) 340 # summary += " %30s\n" % self.domain[attr_num] 341 341 342 342 return summary 343 343 344 344 ################ Ploting functions ################### 345 346 def plot(self, title = 'Scree plot', filename ='scree_plot.png'):345 346 def plot(self, title='Scree plot', filename='scree_plot.png'): 347 347 """ 348 348 Draws a scree plot. Matplotlib is needed. … … 354 354 If None, plot will be shown 355 355 """ 356 356 357 357 if not mathlib_import: 358 358 raise orange.KernelException, "Matplotlib was not imported!" 359 359 360 360 #plt.clf() > opens two windows 361 361 fig = plt.figure() 362 362 ax = fig.add_subplot(111) 363 363 364 364 x_axis = range(len(self.evalues)) 365 x_labels = ["PC%d" % (i + 1, 366 365 x_labels = ["PC%d" % (i + 1,) for i in x_axis] 366 367 367 ax.set_xticks(x_axis) 368 368 ax.set_xticklabels(x_labels) … … 370 370 ax.set_ylabel('Variance') 371 371 ax.set_title(title + "\n") 372 ax.bar(left = x_axis, height = self.evalues, align ='center')373 ax.axis([0.5, len(self.evalues)  0.5, 0, self.evalues[0] *1.05])372 ax.bar(left=x_axis, height=self.evalues, align='center') 373 ax.axis([0.5, len(self.evalues)  0.5, 0, self.evalues[0] * 1.05]) 374 374 if filename: 375 375 plt.savefig(filename) 376 376 else: 377 377 plt.show() 378 379 def biplot(self, choices = [1,2], scale = 1., title ='Biplot',380 filename ='biplot.png'):378 379 def biplot(self, choices=[1, 2], scale=1., title='Biplot', 380 filename='biplot.png'): 381 381 """ 382 382 Draws biplot for PCA. Matplotlib is needed. Actual projection must be … … 392 392 If None, plot will be shown 393 393 """ 394 394 395 395 if not mathlib_import: 396 396 raise orange.KernelException, "Matplotlib was not imported!" 397 397 398 398 if self._dataMatrix == None: 399 399 raise orange.KernelException, "No data available for biplot!" 400 400 401 401 if len(choices) != 2: 402 402 raise orange.KernelException, 'You have to choose exactly two components' 403 403 404 404 if max(choices[0], choices[1]) > len(self.evalues) or min(choices[0], choices[1]) < 1: 405 405 raise orange.KernelException, 'Invalid choices' 406 406 407 407 choice = numpy.array([i == choices[0]  1 or i == choices[1]  1 for i in range(len(self.evalues))]) 408 409 dataMatrix = self._dataMatrix[:, choice]410 loadings = self.loadings[:, choice]408 409 dataMatrix = self._dataMatrix[:, choice] 410 loadings = self.loadings[:, choice] 411 411 lam = (numpy.array(self.evalues)[choice]) 412 412 lam *= sqrt(len(self._dataMatrix)) 413 413 414 414 if scale < 0. or scale > 1.: 415 415 print "Warning: 'scale' is outside [0, 1]" 416 lam = lam **scale417 416 lam = lam ** scale 417 418 418 #TO DO > pc.biplot (maybe) 419 419 trDataMatrix = dataMatrix / lam 420 420 trLoadings = loadings * lam 421 421 422 422 max_data_value = numpy.max(abs(trDataMatrix)) * 1.05 423 423 max_load_value = numpy.max(abs(trLoadings)) * 1.5 424 424 425 425 #plt.clf() 426 426 fig = plt.figure() … … 432 432 ax1.xaxis.set_ticks_position('bottom') 433 433 ax1.yaxis.set_label_position('left') 434 ax1.yaxis.set_ticks_position('left') 435 434 ax1.yaxis.set_ticks_position('left') 435 436 436 if self._classArray == None: 437 437 trDataMatrix = transpose(trDataMatrix) … … 447 447 partialDataMatrix = transpose(trDataMatrix[choice]) 448 448 ax1.plot(partialDataMatrix[0], partialDataMatrix[1], 449 Colors[i % len(Colors)], label =str(classValues[i]))449 Colors[i % len(Colors)], label=str(classValues[i])) 450 450 ax1.legend() 451 451 452 452 ax1.set_xlim(max_data_value, max_data_value) 453 453 ax1.set_ylim(max_data_value, max_data_value) 454 454 455 455 #eliminate double axis on right 456 456 ax0 = ax1.twinx() 457 457 ax0.yaxis.set_visible(False) 458 458 459 459 ax2 = ax0.twiny() 460 460 ax2.xaxis.set_label_position('top') … … 466 466 for tl in ax2.get_yticklabels(): 467 467 tl.set_color('r') 468 469 arrowprops = dict(facecolor = 'red', edgecolor = 'red', width = 1, headwidth =4)468 469 arrowprops = dict(facecolor='red', edgecolor='red', width=1, headwidth=4) 470 470 #using annotations instead of arrows because there is a strange implementation 471 471 #of arrows in matplotlib version 0.99 472 472 for i in range(len(trLoadings)): 473 473 x, y = trLoadings[i] 474 ax2.annotate('', (x, y), (0, 0), arrowprops =arrowprops)475 ax2.text(x * 1.1, y * 1.2, self.domain[i], color ='red')476 474 ax2.annotate('', (x, y), (0, 0), arrowprops=arrowprops) 475 ax2.text(x * 1.1, y * 1.2, self.domain[i], color='red') 476 477 477 ax2.set_xlim(max_load_value, max_load_value) 478 478 ax2.set_ylim(max_load_value, max_load_value) 479 479 480 480 if filename: 481 481 plt.savefig(filename)
Note: See TracChangeset
for help on using the changeset viewer.