source: orange/orange/orngPCA.py @ 9674:77bc4cbb4407

Revision 9674:77bc4cbb4407, 17.7 KB checked in by Miha Stajdohar <miha.stajdohar@…>, 2 years ago

Removed loadings from print.

'''
Created on Aug 16, 2009

@author: Nejc Skofic
@contact: nejc.skofic@gmail.com
'''

import orange, numpy
from numpy import sqrt, abs, dot, transpose
from numpy.linalg import eig, inv

mathlib_import = True
try:
    import matplotlib.pyplot as plt
except ImportError:
    import warnings
    warnings.warn("Importing matplotlib failed.\nPlotting functions will be unavailable.")
    mathlib_import = False

#color table for biplot
Colors = ['bo', 'go', 'yo', 'co', 'mo']

def defaultImputer(dataset):
    """Default imputer with average data imputation."""
    return orange.ImputerConstructor_average(dataset)

def defaultContinuizer(dataset):
    """Default continuizer with:

    - multinomial -> as normalized ordinal
    - class -> ignore
    - continuous -> leave

    """
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.AsNormalizedOrdinal
    continuizer.classTreatment = continuizer.Ignore
    continuizer.continuousTreatment = continuizer.Leave
    return continuizer(dataset)

def centerData(dataMatrix):
    """Centers the data column-wise (per attribute); returns the center and the centered data."""
    center = numpy.sum(dataMatrix, axis=0) / float(len(dataMatrix))
    return center, (dataMatrix - center)

def standardizeData(dataMatrix):
    """Standardizes the data column-wise (per attribute). Raises an error if a
    constant variable is present."""
    scale = numpy.std(dataMatrix, axis=0)
    if 0. in scale:
        raise orange.KernelException, "Constant variable, cannot standardize!"
    return scale, dataMatrix * 1. / scale

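# A minimal illustration of the two helpers above (values are chosen so they
# are easy to verify by hand):
#
#     m = numpy.array([[1., 2.], [3., 4.]])
#     center, centered = centerData(m)                  # center == [2., 3.], centered == [[-1., -1.], [1., 1.]]
#     scale, standardized = standardizeData(centered)   # scale == [1., 1.], so the data is unchanged here
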
class PCA(object):
    """Class that creates a PCA projection.

    Constructor parameters
    ----------------------
    dataset : orange.ExampleTable object.
            If not present (default), the constructor only sets the parameters
            needed for the projection. The actual projection can then be performed
            with *pca(dataset)*
    attributes : list-like object containing the names of the attributes retained
            in the PCA. If None (default), all attributes are retained
    rows : list-like object containing 0s and 1s indicating which rows should be
            retained. If None (default), all rows are retained
    standardize : True or False (default). If True, performs standardization of
            the dataset
    imputer : orange.Imputer object. Defines how data is imputed if values are
            missing. Must NOT be trained. Default is average imputation
    continuizer : orange.Continuizer object. Defines how data is continuized if
            there are discrete attributes. Default:

                - multinomial -> as normalized ordinal
                - class -> ignore
                - continuous -> leave

    maxNumberOfComponents : how many components should be retained. Default is 10;
            if -1, all components are retained
    varianceCovered : how much of the data variance should be covered. Default is 0.95
    useGeneralizedVectors : True or False (default). If True, generalized
            eigenvectors are used

    Returns
    -------
    PCA object if dataset was None

    PCAClassifier object if the projection was successful, or None if the projection failed
    """
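    # A minimal usage sketch (assumes the "iris" dataset bundled with Orange;
    # any ExampleTable works the same way):
    #
    #     data = orange.ExampleTable("iris")
    #     classifier = PCA(data, standardize=True)   # projection is computed immediately
    #     projected = classifier(data)                # ExampleTable with PC1, PC2, ... columns
    #     print classifier                            # eigenvalues and explained variance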

    def __new__(cls, dataset=None, attributes=None, rows=None, standardize=0,
                 imputer=defaultImputer, continuizer=defaultContinuizer,
                 maxNumberOfComponents=10, varianceCovered=0.95,
                 useGeneralizedVectors=0):

        learner = object.__new__(cls)

        if dataset:
            learner.__init__(attributes, rows, standardize, imputer, continuizer,
                             maxNumberOfComponents, varianceCovered, useGeneralizedVectors)
            return learner(dataset)
        else:
            return learner

    #decide what should be immutable and what not
    #immutable for sure: imputer and continuizer
    def __init__(self, attributes=None, rows=None, standardize=0,
                 imputer=defaultImputer, continuizer=defaultContinuizer,
                 maxNumberOfComponents=10, varianceCovered=0.95,
                 useGeneralizedVectors=0):

        self.attributes = attributes
        self.rows = rows
        self.standardize = standardize
        self.imputer = imputer
        self.continuizer = continuizer
        self.maxNumberOfComponents = maxNumberOfComponents
        self.varianceCovered = varianceCovered
        self.useGeneralizedVectors = useGeneralizedVectors

    def __call__(self, dataset):

        #Modify dataset
        dataset = self._datasetPreprocessing(dataset)
        dataMatrix, classArray, center, deviation = self._dataMatrixPreprocessing(dataset)

        #Perform projection
        evalues, evectors = self._createPCAProjection(dataMatrix, classArray)

        #check whether the projection failed
        if evalues is None or evectors is None:
            print "Principal component analysis could not be performed (complex eigenvalues, or a singular matrix if generalized eigenvectors were used)"
            return None

        #return PCAClassifier
        return PCAClassifier(domain=self.attributes,
                             imputer=self.imputer,
                             continuizer=self.continuizer,
                             center=center,
                             deviation=deviation,
                             evalues=evalues,
                             loadings=evectors)

    def _datasetPreprocessing(self, dataset):
        """
        First removes unwanted attributes and saves the domain (so that PCA
        remembers what kind of dataset it was trained on), then removes unwanted
        rows, imputes missing values and continuizes the data.
        """

        if self.attributes:
            dataset = dataset.select(self.attributes)

        #we need to retain only the selected attributes, without the class attribute
        self.attributes = [att.name for att in dataset.domain.attributes]

        imputer = self.imputer(dataset)

        if self.rows:
            dataset = dataset.select(self.rows)

        dataset = imputer(dataset)
        domain = self.continuizer(dataset)
        dataset = dataset.translate(domain)

        return dataset

    def _dataMatrixPreprocessing(self, dataset):
        """
        Creates numpy arrays, centers dataMatrix, standardizes it if that
        option was selected, and returns dataMatrix, classArray, center and
        deviation.
        """

        dataMatrix, classArray, x = dataset.toNumpy()

        center, dataMatrix = centerData(dataMatrix)

        deviation = None
        if self.standardize:
            deviation, dataMatrix = standardizeData(dataMatrix)

        return dataMatrix, classArray, center, deviation

    def _createPCAProjection(self, dataMatrix, classArray):
        """
        L -> Laplacian weight matrix constructed from classArray or identity if
             classArray is None
        M -> dataMatrix

        Normal method: t(M) * L * M * x = lambda * x
        Snapshot method: M * t(M) * L * M * x = lambda * M * x
        Generalized vectors: (t(M) * M)^(-1) * t(M) * L * M * x = lambda * x
        Snapshot with generalized vectors: M * (t(M) * M)^(-1) * t(M) * L * M * x = lambda * M * x
        """

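        #Implementation notes:
        #- L is the graph Laplacian of the "different class" graph: off-diagonal
        #  entries are -1 for pairs of examples from different classes, and each
        #  diagonal entry is the number of examples from a different class.
        #- When n < d the d x d eigenproblem is rank-deficient, so the snapshot
        #  method solves the equivalent n x n problem and afterwards maps the
        #  eigenvectors back into the d-dimensional attribute space.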
        n, d = numpy.shape(dataMatrix)

        if classArray is not None:
            L = numpy.zeros((len(dataMatrix), len(dataMatrix)))
            for i in range(len(dataMatrix)):
                for j in range(i + 1, len(dataMatrix)):
                    L[i, j] = -int(classArray[i] != classArray[j])
                    L[j, i] = -int(classArray[i] != classArray[j])

            s = numpy.sum(L, axis=0)      # doesn't matter which axis since the matrix L is symmetrical
            for i in range(len(dataMatrix)):
                L[i, i] = -s[i]

            matrix = dot(transpose(dataMatrix), L)
        else:
            matrix = transpose(dataMatrix)

        if self.useGeneralizedVectors:
            temp_matrix = dot(transpose(dataMatrix), dataMatrix)
            try:
                temp_matrix = inv(temp_matrix)
            except numpy.linalg.LinAlgError:
                return None, None
            matrix = dot(temp_matrix, matrix)

        if n < d:
            #snapshot method
            covMatrix = dot(dataMatrix, matrix)
            trace = numpy.trace(covMatrix)
            scale = n / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]
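            #map the snapshot eigenvectors back into the original d-dimensional
            #attribute space and rescale them by 1/sqrt(eigenvalue)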
            evectors = transpose(1. / sqrt(evalues)) * transpose(dot(evectors, dataMatrix))
        else:
            covMatrix = dot(matrix, dataMatrix)
            trace = numpy.trace(covMatrix)
            scale = d / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]

        order = (numpy.argsort(evalues)[::-1])
        N = len(evalues)
        maxComp = self.maxNumberOfComponents
        variance = self.varianceCovered

        if maxComp == -1:
            maxComp = N
        maxComp = min(maxComp, N)

        order = order[:maxComp]
        evalues = numpy.take(evalues, order)

        #evalues are scaled -> value > 1 : the component explains more variance than an average attribute
        #                   -> value < 1 : the component explains less variance than an average attribute
        for i in range(len(order)):
            variance -= evalues[i] / trace
            if variance < 0:
                return evalues[: i + 1] * scale, numpy.take(evectors, order[: i + 1], 1)

        return evalues * scale, numpy.take(evectors, order, 1)

class PCAClassifier(object):
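    """
    Holds a fitted PCA projection (center, deviation, eigenvalues and loadings).
    Calling the object with an ExampleTable projects the data onto the principal
    components; see __call__ below.
    """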

    def __init__(self, domain, imputer, continuizer, center, deviation, evalues, loadings):
        #data checking and modifying
        self.domain = domain
        self.imputer = imputer
        self.continuizer = continuizer
        #PCA properties
        self.center = center
        self.deviation = deviation
        self.evalues = evalues
        self.loadings = loadings

        #last prediction performed -> used for biplot
        self._dataMatrix = None
        self._classArray = None

    def __call__(self, dataset):

        try:
            #retain class attribute
            attrDataset = dataset.select(self.domain)
            imputer = self.imputer(attrDataset)
            attrDataset = imputer(attrDataset)
            domain = self.continuizer(attrDataset)
            attrDataset = attrDataset.translate(domain)
        except TypeError:
            raise orange.KernelException, "One or more attributes from the training set are missing!"

        dataMatrix, classArray, x = attrDataset.toNumpy()

        dataMatrix -= self.center
        if self.deviation is not None:
            dataMatrix *= 1. / self.deviation

        #save transformed data
        self._dataMatrix = numpy.dot(dataMatrix, self.loadings)

        attributes = [orange.FloatVariable("PC%d" % (i + 1,)) for i in range(len(self.evalues))]
        new_domain = orange.Domain(attributes)
        new_table = orange.ExampleTable(new_domain, self._dataMatrix)

        if dataset.domain.classVar:
            #suboptimal
            classTable = dataset.select([dataset.domain.classVar.name])
            self._classArray = numpy.array([row.getclass() for row in classTable])
            new_table = orange.ExampleTable([new_table, classTable])

        return new_table

    def __str__(self):

        n, d = numpy.shape(self.loadings)
        cumulative = 0.0

        summary = "PCA SUMMARY\n\nCenter:\n\n"
        summary += " %15s  " * len(self.domain) % tuple(self.domain) + "\n"
        summary += " %15.4f  " * len(self.center) % tuple(self.center) + "\n\n"

        if self.deviation is not None:
            summary += "Deviation:\n\n"
            summary += " %15s  " * len(self.domain) % tuple(self.domain) + "\n"
            summary += " %15.4f  " * len(self.deviation) % tuple(self.deviation) + "\n\n"

        summary += "Importance of components:\n\n %12s   %12s   %12s\n" % ("eigenvalues", "proportion", "cumulative")
        for evalue in self.evalues:
            cumulative += evalue / n
            summary += " %12.4f   %12.4f   %12.4f\n" % (evalue, evalue / n, cumulative)

        #summary += "\nLoadings:\n\n"
        #summary += "      PC%d" * d % tuple(range(1, d + 1)) + "\n"
        #for attr_num in range(len(self.domain)):
        #    summary += " % 8.4f" * d % tuple(self.loadings[attr_num])
        #    summary += "   %-30s\n" % self.domain[attr_num]

        return summary

    ################ Plotting functions ###################

    def plot(self, title='Scree plot', filename='scree_plot.png'):
        """
        Draws a scree plot. Matplotlib is needed.

        Parameters
        ----------
        title : title of the plot
        filename : file name under which the plot will be saved (default: scree_plot.png).
            If None, the plot is shown instead
        """
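        #Example (assuming 'classifier' is a fitted PCAClassifier):
        #    classifier.plot(filename=None)   # show the scree plot interactively instead of saving it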

        if not mathlib_import:
            raise orange.KernelException, "Matplotlib was not imported!"

        #plt.clf() -> opens two windows
        fig = plt.figure()
        ax = fig.add_subplot(111)

        x_axis = range(len(self.evalues))
        x_labels = ["PC%d" % (i + 1,) for i in x_axis]

        ax.set_xticks(x_axis)
        ax.set_xticklabels(x_labels)
        ax.set_xlabel('Principal components')
        ax.set_ylabel('Variance')
        ax.set_title(title + "\n")
        ax.bar(left=x_axis, height=self.evalues, align='center')
        ax.axis([-0.5, len(self.evalues) - 0.5, 0, self.evalues[0] * 1.05])
        if filename:
            plt.savefig(filename)
        else:
            plt.show()

    def biplot(self, choices=[1, 2], scale=1., title='Biplot',
               filename='biplot.png'):
        """
        Draws a biplot for the PCA. Matplotlib is needed. The actual projection
        must be performed via pca(data) before the biplot can be used.

        Parameters
        ----------
        choices : length-2 list-like object choosing which two components
            should be used in the biplot. Default is the first and second
        scale : scale factor (default is 1.). Should be inside [0, 1]
        title : title of the biplot
        filename : file name under which the plot will be saved (default: biplot.png).
            If None, the plot is shown instead
        """
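        #Example (assuming 'classifier' is a PCAClassifier that has already been
        #called on some data):
        #    classifier.biplot(choices=[1, 2], filename=None)   # show PC1 vs PC2 interactively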

        if not mathlib_import:
            raise orange.KernelException, "Matplotlib was not imported!"

        if self._dataMatrix is None:
            raise orange.KernelException, "No data available for biplot!"

        if len(choices) != 2:
            raise orange.KernelException, 'You have to choose exactly two components'

        if max(choices[0], choices[1]) > len(self.evalues) or min(choices[0], choices[1]) < 1:
            raise orange.KernelException, 'Invalid choices'

        choice = numpy.array([i == choices[0] - 1 or i == choices[1] - 1 for i in range(len(self.evalues))])

        dataMatrix = self._dataMatrix[:, choice]
        loadings = self.loadings[:, choice]
        lam = (numpy.array(self.evalues)[choice])
        lam *= sqrt(len(self._dataMatrix))

        if scale < 0. or scale > 1.:
            print "Warning: 'scale' is outside [0, 1]"
        lam = lam ** scale

        #TODO -> pc.biplot (maybe)
        trDataMatrix = dataMatrix / lam
        trLoadings = loadings * lam

        max_data_value = numpy.max(abs(trDataMatrix)) * 1.05
        max_load_value = numpy.max(abs(trLoadings)) * 1.5

        #plt.clf()
        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.set_title(title + "\n")
        ax1.set_xlabel("PC%s" % (choices[0]))
        ax1.set_ylabel("PC%s" % (choices[1]))
        ax1.xaxis.set_label_position('bottom')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_label_position('left')
        ax1.yaxis.set_ticks_position('left')

        if self._classArray is None:
            trDataMatrix = transpose(trDataMatrix)
            ax1.plot(trDataMatrix[0], trDataMatrix[1], Colors[0])
        else:
            #suboptimal
            classValues = []
            for classValue in self._classArray:
                if classValue not in classValues:
                    classValues.append(classValue)
            for i in range(len(classValues)):
                choice = numpy.array([classValues[i] == cv for cv in self._classArray])
                partialDataMatrix = transpose(trDataMatrix[choice])
                ax1.plot(partialDataMatrix[0], partialDataMatrix[1],
                         Colors[i % len(Colors)], label=str(classValues[i]))
            ax1.legend()

        ax1.set_xlim(-max_data_value, max_data_value)
        ax1.set_ylim(-max_data_value, max_data_value)

        #eliminate double axis on right
        ax0 = ax1.twinx()
        ax0.yaxis.set_visible(False)

        ax2 = ax0.twiny()
        ax2.xaxis.set_label_position('top')
        ax2.xaxis.set_ticks_position('top')
        ax2.yaxis.set_label_position('right')
        ax2.yaxis.set_ticks_position('right')
        for tl in ax2.get_xticklabels():
            tl.set_color('r')
        for tl in ax2.get_yticklabels():
            tl.set_color('r')

        arrowprops = dict(facecolor='red', edgecolor='red', width=1, headwidth=4)
        #using annotations instead of arrows because there is a strange implementation
        #of arrows in matplotlib version 0.99
        for i in range(len(trLoadings)):
            x, y = trLoadings[i]
            ax2.annotate('', (x, y), (0, 0), arrowprops=arrowprops)
            ax2.text(x * 1.1, y * 1.2, self.domain[i], color='red')

        ax2.set_xlim(-max_load_value, max_load_value)
        ax2.set_ylim(-max_load_value, max_load_value)

        if filename:
            plt.savefig(filename)
        else:
            plt.show()
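

if __name__ == "__main__":
    #Small usage sketch: assumes the "iris" dataset bundled with Orange is
    #available on the data path; adjust the file name for other datasets.
    data = orange.ExampleTable("iris")
    classifier = PCA(data, standardize=True)
    print classifier
    projected = classifier(data)
    print "%d examples projected onto %d components" % (len(projected), len(classifier.evalues))
    if mathlib_import:
        classifier.plot(filename="scree_plot.png")
        classifier.biplot(filename="biplot.png")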