source: orange/Orange/orng/orngPCA.py @ 11590:a14e130116b1

Revision 11590:a14e130116b1, 17.3 KB, checked in by astaric, 11 months ago

Only include PCA in drawing methods.

'''
Created on Aug 16, 2009

@author: Nejc Skofic
@contact: nejc.skofic@gmail.com
'''

import orange, numpy
from numpy import sqrt, abs, dot, transpose
from numpy.linalg import eig, inv

#color table for biplot
Colors = ['bo', 'go', 'yo', 'co', 'mo']

def defaultImputer(dataset):
    """Default imputer performing average data imputation."""
    return orange.ImputerConstructor_average(dataset)

def defaultContinuizer(dataset):
    """Default continuizer with:

    - multinomial -> as normalized ordinal
    - class -> ignore
    - continuous -> leave

    """
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.AsNormalizedOrdinal
    continuizer.classTreatment = continuizer.Ignore
    continuizer.continuousTreatment = continuizer.Leave
    return continuizer(dataset)

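# A minimal sketch (added, not part of the original module) of how the two
# defaults above are chained by _datasetPreprocessing below; `data` is assumed
# to be an orange.ExampleTable:
#
#     imputer = defaultImputer(data)                    # trained imputer
#     data = imputer(data)                              # fill missing values with averages
#     data = data.translate(defaultContinuizer(data))   # discrete attributes -> continuous
#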
def centerData(dataMatrix):
    """Centers the data (each column/attribute); returns the center and the
    centered data."""
    center = numpy.sum(dataMatrix, axis=0) / float(len(dataMatrix))
    return center, (dataMatrix - center)

def standardizeData(dataMatrix):
    """Standardizes each column (attribute) of the data. Raises an error if a
    constant variable is present."""
    scale = numpy.std(dataMatrix, axis=0)
    if 0. in scale:
        raise orange.KernelException, "Constant variable, cannot standardize!"
    return scale, dataMatrix * 1. / scale

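# A minimal sketch (added, not in the original module) of the two helpers above
# on a plain 2-D numpy array:
#
#     X = numpy.array([[1., 2.], [3., 4.], [5., 12.]])
#     center, Xc = centerData(X)       # center == array([3., 6.]), Xc has zero column means
#     scale, Xs = standardizeData(Xc)  # scale holds the per-column standard deviations
#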
class PCA(object):
    """Class that creates a PCA projection.

    Constructor parameters
    ----------------------
    dataset : orange.ExampleTable object.
            If not given (default), the constructor only stores the parameters
            needed for the projection. The actual projection can then be
            performed with *pca(dataset)*
    attributes : list-like object containing the names of the attributes retained
            in PCA. If None (default), all attributes are retained
    rows : list-like object containing 0s and 1s indicating which rows should be
            retained. If None (default), all rows are retained
    standardize : True or False (default). If True, the dataset is standardized
    imputer : orange.Imputer object. Defines how missing values are imputed.
            Must NOT be trained. Default is average imputation
    continuizer : orange.Continuizer object. Defines how data is continuized if
            there are discrete attributes. Default:

                - multinomial -> as normalized ordinal
                - class -> ignore
                - continuous -> leave

    maxNumberOfComponents : How many components should be retained. Default is 10;
            if -1, all components are retained
    varianceCovered : How much of the data variance should be covered. Default is 0.95
    useGeneralizedVectors : True or False (default). If True, generalized
            eigenvectors are used

    Returns
    -------
    PCA object if dataset was None

    PCAClassifier object if the projection was successful, or None if the projection failed
    """

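    # Added note: because of __new__ below, PCA behaves like an Orange learner --
    # PCA() returns an untrained learner, while PCA(dataset) trains immediately
    # and returns a PCAClassifier. A minimal usage sketch (assumes an Orange
    # .tab dataset such as "iris" is available on the data path):
    #
    #     data = orange.ExampleTable("iris")
    #     pcaClassifier = PCA(data, standardize=True)   # PCAClassifier
    #     projected = pcaClassifier(data)               # ExampleTable with PC1, PC2, ...
    #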
    def __new__(cls, dataset=None, attributes=None, rows=None, standardize=0,
                 imputer=defaultImputer, continuizer=defaultContinuizer,
                 maxNumberOfComponents=10, varianceCovered=0.95,
                 useGeneralizedVectors=0):

        learner = object.__new__(cls)

        if dataset:
            learner.__init__(attributes, rows, standardize, imputer, continuizer,
                             maxNumberOfComponents, varianceCovered, useGeneralizedVectors)
            return learner(dataset)
        else:
            return learner

    #decide what should be immutable and what not
    #immutable for sure: imputer and continuizer
    def __init__(self, attributes=None, rows=None, standardize=0,
                 imputer=defaultImputer, continuizer=defaultContinuizer,
                 maxNumberOfComponents=10, varianceCovered=0.95,
                 useGeneralizedVectors=0):

        self.attributes = attributes
        self.rows = rows
        self.standardize = standardize
        self.imputer = imputer
        self.continuizer = continuizer
        self.maxNumberOfComponents = maxNumberOfComponents
        self.varianceCovered = varianceCovered
        self.useGeneralizedVectors = useGeneralizedVectors

    def __call__(self, dataset):

        #Modify dataset
        dataset = self._datasetPreprocessing(dataset)
        dataMatrix, classArray, center, deviation = self._dataMatrixPreprocessing(dataset)

        #Perform projection
        evalues, evectors = self._createPCAProjection(dataMatrix, classArray)

        #check if return status is None, None
        if (evalues, evectors) == (None, None):
            print "Principal component analysis could not be performed (complex eigenvalues or singular matrix if generalized eigenvectors were used)"
            return None

        #return PCAClassifier
        return PCAClassifier(domain=self.attributes,
                             imputer=self.imputer,
                             continuizer=self.continuizer,
                             center=center,
                             deviation=deviation,
                             evalues=evalues,
                             loadings=evectors)

    def _datasetPreprocessing(self, dataset):
        """
        First removes unwanted attributes and saves the attribute names (so that
        PCA remembers what kind of dataset it was trained on), then removes
        unwanted rows, imputes missing values and continuizes the data.
        """

        if self.attributes:
            dataset = dataset.select(self.attributes)

        #we need to retain only the selected attributes, without the class attribute
        self.attributes = [att.name for att in dataset.domain.attributes]

        imputer = self.imputer(dataset)

        if self.rows:
            dataset = dataset.select(self.rows)

        dataset = imputer(dataset)
        domain = self.continuizer(dataset)
        dataset = dataset.translate(domain)

        return dataset

    def _dataMatrixPreprocessing(self, dataset):
        """
        Creates numpy arrays, centers the dataMatrix, standardizes it if that
        option was selected, and returns dataMatrix, classArray, center and
        deviation.
        """

        dataMatrix, classArray, x = dataset.toNumpy()

        center, dataMatrix = centerData(dataMatrix)

        deviation = None
        if self.standardize:
            deviation, dataMatrix = standardizeData(dataMatrix)

        return dataMatrix, classArray, center, deviation

    def _createPCAProjection(self, dataMatrix, classArray):
        """
        L -> Laplacian weight matrix constructed from classArray or identity if
             classArray is None
        M -> dataMatrix

        Normal method: t(M) * L * M * x = lambda * x
        Snapshot method: M * t(M) * L * M * x = lambda * M * x
        Generalized vectors: (t(M) * M)^(-1) * t(M) * L * M * x = lambda * x
        Snapshot with generalized vectors: M * (t(M) * M)^(-1) * t(M) * L * M * x = lambda * M * x
        """

        n, d = numpy.shape(dataMatrix)

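        # Added note: when classArray is given, L below is the graph Laplacian of
        # the "different class" graph (off-diagonal L[i, j] = -1 for rows with
        # different classes, diagonal = number of differently-labelled rows), so
        # t(M) * L * M favours directions that spread differently-labelled
        # examples apart; when classArray is None, L reduces to the identity and
        # this is ordinary PCA on the centered (and possibly standardized) data.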
        if classArray is not None:
            L = numpy.zeros((len(dataMatrix), len(dataMatrix)))
            for i in range(len(dataMatrix)):
                for j in range(i + 1, len(dataMatrix)):
                    L[i, j] = -int(classArray[i] != classArray[j])
                    L[j, i] = -int(classArray[i] != classArray[j])

            s = numpy.sum(L, axis=0)      # doesn't matter which axis since the matrix L is symmetrical
            for i in range(len(dataMatrix)):
                L[i, i] = -s[i]

            matrix = dot(transpose(dataMatrix), L)
        else:
            matrix = transpose(dataMatrix)

        if self.useGeneralizedVectors:
            temp_matrix = dot(transpose(dataMatrix), dataMatrix)
            try:
                temp_matrix = inv(temp_matrix)
            except:
                return None, None
            matrix = dot(temp_matrix, matrix)

        if n < d:
            #snapshot method
            covMatrix = dot(dataMatrix, matrix)
            trace = numpy.trace(covMatrix)
            scale = n / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]
            evectors = transpose(1. / sqrt(evalues)) * transpose(dot(evectors, dataMatrix))
        else:
            covMatrix = dot(matrix, dataMatrix)
            trace = numpy.trace(covMatrix)
            scale = d / trace
            evalues, evectors = eig(covMatrix)
            if evalues.dtype.kind == 'c':
                return None, None
            positiveArray = numpy.array([value > 0. for value in evalues])
            evalues = evalues[positiveArray]
            evectors = evectors[positiveArray]

        order = (numpy.argsort(evalues)[::-1])
        N = len(evalues)
        maxComp = self.maxNumberOfComponents
        variance = self.varianceCovered

        if maxComp == -1:
            maxComp = N
        maxComp = min(maxComp, N)

        order = order[:maxComp]
        evalues = numpy.take(evalues, order)

        #evalues are scaled by the average variance per attribute:
        #   value > 1 : the component explains more variance than an average original attribute
        #   value < 1 : the component explains less
        for i in range(len(order)):
            variance -= evalues[i] / trace
            if variance < 0:
                return evalues[: i + 1] * scale, numpy.take(evectors, order[: i + 1], 1)

        return evalues * scale, numpy.take(evectors, order, 1)

class PCAClassifier(object):

    def __init__(self, domain, imputer, continuizer, center, deviation, evalues, loadings):
        #data checking and modifying
        self.domain = domain
        self.imputer = imputer
        self.continuizer = continuizer
        #PCA properties
        self.center = center
        self.deviation = deviation
        self.evalues = evalues
        self.loadings = loadings

        #last prediction performed -> used for biplot
        self._dataMatrix = None
        self._classArray = None

    def __call__(self, dataset):

        try:
            #select the attributes that were used for training (the class is handled below)
            attrDataset = dataset.select(self.domain)
            imputer = self.imputer(attrDataset)
            attrDataset = imputer(attrDataset)
            domain = self.continuizer(attrDataset)
            attrDataset = attrDataset.translate(domain)
        except TypeError, e:
            raise orange.KernelException, "One or more attributes from the training set are missing!"

        dataMatrix, classArray, x = attrDataset.toNumpy()

        dataMatrix -= self.center
        if self.deviation is not None:
            dataMatrix *= 1. / self.deviation

        #save transformed data
        self._dataMatrix = numpy.dot(dataMatrix, self.loadings)

        attributes = [orange.FloatVariable("PC%d" % (i + 1,)) for i in range(len(self.evalues))]
        new_domain = orange.Domain(attributes)
        new_table = orange.ExampleTable(new_domain, self._dataMatrix)

        if dataset.domain.classVar:
            #suboptimal
            classTable = dataset.select([dataset.domain.classVar.name])
            self._classArray = numpy.array([row.getclass() for row in classTable])
            new_table = orange.ExampleTable([new_table, classTable])

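        # Added note: the returned ExampleTable has one attribute per retained
        # component (PC1, PC2, ...) and, when the input data has a class
        # variable, the original class column appended, so the projection can be
        # fed directly to other Orange learners.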
        return new_table

    def __str__(self):

        n, d = numpy.shape(self.loadings)
        cumulative = 0.0

        summary = "PCA SUMMARY\n\nCenter:\n\n"
        summary += " %15s  " * len(self.domain) % tuple(self.domain) + "\n"
        summary += " %15.4f  " * len(self.center) % tuple(self.center) + "\n\n"

        if self.deviation is not None:
            summary += "Deviation:\n\n"
            summary += " %15s  " * len(self.domain) % tuple(self.domain) + "\n"
            summary += " %15.4f  " * len(self.deviation) % tuple(self.deviation) + "\n\n"

        summary += "Importance of components:\n\n %12s   %12s   %12s\n" % ("eigenvalues", "proportion", "cumulative")
        for evalue in self.evalues:
            cumulative += evalue / n
            summary += " %12.4f   %12.4f   %12.4f\n" % (evalue, evalue / n, cumulative)

        #summary += "\nLoadings:\n\n"
        #summary += "      PC%d" * d % tuple(range(1, d + 1)) + "\n"
        #for attr_num in range(len(self.domain)):
        #    summary += " % 8.4f" * d % tuple(self.loadings[attr_num])
        #    summary += "   %-30s\n" % self.domain[attr_num]

        return summary

    ################ Plotting functions ###################

    def plot(self, title='Scree plot', filename='scree_plot.png'):
        """
        Draws a scree plot. Matplotlib is needed.

        Parameters
        ----------
        title : Title of the plot
        filename : File name under which the plot will be saved (default: scree_plot.png).
            If None, the plot is shown instead
        """
        import pylab as plt
        fig = plt.figure()
        ax = fig.add_subplot(111)

        x_axis = range(len(self.evalues))
        x_labels = ["PC%d" % (i + 1,) for i in x_axis]

        ax.set_xticks(x_axis)
        ax.set_xticklabels(x_labels)
        ax.set_xlabel('Principal components')
        ax.set_ylabel('Variance')
        ax.set_title(title + "\n")
        ax.bar(left=x_axis, height=self.evalues, align='center')
        ax.axis([-0.5, len(self.evalues) - 0.5, 0, self.evalues[0] * 1.05])
        if filename:
            plt.savefig(filename)
        else:
            plt.show()

    def biplot(self, choices=[1, 2], scale=1., title='Biplot',
               filename='biplot.png'):
        """
        Draws a biplot for PCA. Matplotlib is needed. The actual projection must
        be performed via pca(data) before the biplot can be used.

        Parameters
        ----------
        choices : length-2 list-like object choosing which two components
            should be used in the biplot. Default is the first and second
        scale : scale factor (default is 1.). Should be inside [0, 1]
        title : title of the biplot
        filename : File name under which the plot will be saved (default: biplot.png).
            If None, the plot is shown instead
        """
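        # Added note: the scaling below follows the usual biplot convention
        # (observations are divided by lam ** scale and loadings are multiplied
        # by it, similar to R's biplot), except that lam is built from the
        # eigenvalues rather than the standard deviations of the components.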
        import pylab as plt

        if self._dataMatrix is None:
            raise orange.KernelException, "No data available for biplot!"

        if len(choices) != 2:
            raise orange.KernelException, 'You have to choose exactly two components'

        if max(choices[0], choices[1]) > len(self.evalues) or min(choices[0], choices[1]) < 1:
            raise orange.KernelException, 'Invalid choices'

        choice = numpy.array([i == choices[0] - 1 or i == choices[1] - 1 for i in range(len(self.evalues))])

        dataMatrix = self._dataMatrix[:, choice]
        loadings = self.loadings[:, choice]
        lam = (numpy.array(self.evalues)[choice])
        lam *= sqrt(len(self._dataMatrix))

        if scale < 0. or scale > 1.:
            print "Warning: 'scale' is outside [0, 1]"
        lam = lam ** scale

        #TO DO -> pc.biplot (maybe)
        trDataMatrix = dataMatrix / lam
        trLoadings = loadings * lam

        max_data_value = numpy.max(abs(trDataMatrix)) * 1.05
        max_load_value = numpy.max(abs(trLoadings)) * 1.5

        #plt.clf()
        fig = plt.figure()
        ax1 = fig.add_subplot(111)
        ax1.set_title(title + "\n")
        ax1.set_xlabel("PC%s" % (choices[0]))
        ax1.set_ylabel("PC%s" % (choices[1]))
        ax1.xaxis.set_label_position('bottom')
        ax1.xaxis.set_ticks_position('bottom')
        ax1.yaxis.set_label_position('left')
        ax1.yaxis.set_ticks_position('left')

        if self._classArray is None:
            trDataMatrix = transpose(trDataMatrix)
            ax1.plot(trDataMatrix[0], trDataMatrix[1], Colors[0])
        else:
            #suboptimal
            classValues = []
            for classValue in self._classArray:
                if classValue not in classValues:
                    classValues.append(classValue)
            for i in range(len(classValues)):
                choice = numpy.array([classValues[i] == cv for cv in self._classArray])
                partialDataMatrix = transpose(trDataMatrix[choice])
                ax1.plot(partialDataMatrix[0], partialDataMatrix[1],
                         Colors[i % len(Colors)], label=str(classValues[i]))
            ax1.legend()

        ax1.set_xlim(-max_data_value, max_data_value)
        ax1.set_ylim(-max_data_value, max_data_value)

        #eliminate double axis on right
        ax0 = ax1.twinx()
        ax0.yaxis.set_visible(False)

        ax2 = ax0.twiny()
        ax2.xaxis.set_label_position('top')
        ax2.xaxis.set_ticks_position('top')
        ax2.yaxis.set_label_position('right')
        ax2.yaxis.set_ticks_position('right')
        for tl in ax2.get_xticklabels():
            tl.set_color('r')
        for tl in ax2.get_yticklabels():
            tl.set_color('r')

        arrowprops = dict(facecolor='red', edgecolor='red', width=1, headwidth=4)
        #using annotations instead of arrows because there is a strange implementation
        #of arrows in matplotlib version 0.99
        for i in range(len(trLoadings)):
            x, y = trLoadings[i]
            ax2.annotate('', (x, y), (0, 0), arrowprops=arrowprops)
            ax2.text(x * 1.1, y * 1.2, self.domain[i], color='red')

        ax2.set_xlim(-max_load_value, max_load_value)
        ax2.set_ylim(-max_load_value, max_load_value)

        if filename:
            plt.savefig(filename)
        else:
            plt.show()
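
#A minimal end-to-end sketch (added, not part of the original module). It
#assumes an Orange .tab dataset such as "iris" can be loaded from the data path
#and that matplotlib is installed for the plotting calls.
if __name__ == "__main__":
    data = orange.ExampleTable("iris")
    pcaClassifier = PCA(data, standardize=True)   # trains and returns a PCAClassifier
    projected = pcaClassifier(data)               # ExampleTable with PC1, PC2, ... (+ class)
    print pcaClassifier                           # center, deviation and eigenvalue summary
    pcaClassifier.plot(filename=None)             # show the scree plot
    pcaClassifier.biplot(filename=None)           # show the biplot of PC1 vs. PC2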