# source:orange/Orange/orng/orngCA.py@9817:31fca470d487

Revision 9817:31fca470d487, 8.8 KB checked in by Matija Polajnar <matija.polajnar@…>, 2 years ago (diff)

Put a warning for the guys in the future into orngCA that it's as obsolete as IE6.

Line
1# This has to be seriously outdated, as it uses matrixmultiply, which is not
2# present in numpy since, like, 2006.     Matija Polajnar, 2012 a.d.
3
4"""
5Correspondence analysis is a descriptive/exploratory technique designed to analyze simple two-way and
6multi-way tables containing some measure of correspondence between the rows and columns.
7"""
8
9from numpy import *
10import numpy.linalg
11import orange
12import operator
13
14def input(filename):
15    """ Loads contingency matrix from the file. """
16    table = [[],[]]
17    try:
18        file = open(filename)
19        try:
21        finally:
22            file.close()
23        table = [map(int, el.split()) for el in table]
24    except IOError:
25        pass
26    return table
27
28class CA(object):
29    """Main class for computation of correspondance analysis"""
30    def __init__(self, contingencyTable, labelR = [], labelC = []):
31        """ @contingencyTable   instance of "list of lists"
32        """
33        #calculating correspondance analysis from the data matrix
34        #algorithm described in the book (put reference) is used
35
36        self.labelR = labelR
37        self.labelC = labelC
38
39        self.__dataMatrix = matrix(contingencyTable)
40        sumElem = sum(sum(array(self.__dataMatrix))) * 1.
41
42        #corrMatrix is a matrix of relative frequencies of elements in data matrix
43        self.__corr = self.__dataMatrix / sumElem
44        self.__colSums = sum(self.__corr, 0)
45        self.__rowSums = sum(self.__corr, 1)
46
47        self.__colProfiles =  matrix(diag((1. / array(self.__colSums))[0])) * transpose(self.__corr)
48        self.__rowProfiles = matrix(diag((1. / array(self.__rowSums).reshape(1,-1))[0])) * self.__corr
49
50        self.__a, self.__d, self.__b = self.__calculateSVD();
51
52        self.__f = diag((1. / self.__rowSums).reshape(1,-1).tolist()[0]) * self.__a * self.__d
53        self.__g = diag((1. / self.__colSums).tolist()[0]) * self.__b * transpose(self.__d)
54
55    def __calculateSVD(self):
56        """
57            Computes generalized SVD...
58
59            This function is used to calculate decomposition A = N * D_mi * M' , where N' * diag(rowSums) * N = I and
60            M' * diag(colSums) * M = I. This decomposition is calculated in 4 steps:
61            i) B = diag(rowSums)^1/2 * A * diag(colSums)^1/2
62            ii) find the ordinary SVD of B: B = U * D * V'
63            iii) N = diag(rowSums)^-1/2 * U
64                M = diag(colSums)^-1/2 * V
65                D_mi = D
66            iv) A= N * D_mi * M'
67
68            returns (N, D_mi, M)
69        """
70
71        a = self.__corr - self.__rowSums * self.__colSums
72        b = diag(sqrt((1. / self.__rowSums).reshape(1,-1).tolist()[0])) * a * diag(sqrt((1. / self.__colSums).tolist()[0]))
73        u, d, v = numpy.linalg.svd(b, 0)
74        N = diag(sqrt(self.__rowSums.reshape(1, -1).tolist()[0])) * u
75        M = diag(sqrt(self.__colSums.tolist()[0])) * transpose(v)
76        d = diag(d.tolist())
77
78        return (N, d, M)
79
80    def getMatrix(self):
81        """
82            Returns array object that is representation of contingency table.
83        """
84        return self.__dataMatrix
85
86##    def getCT(self): return self.__ct
87
88    dataMatrix = property(getMatrix)
89
90    def getA(self):
91        """
92            columns of A defines the principal axes of the column clouds
93        """
94        return self.__a
95    def getD(self):
96        """
97            elements on diagonal of D are singular values
98        """
99        return self.__d
100    def getB(self):
101        """
102            columns of B defines the principal axes of the row clouds
103        """
104        return self.__b
105    def getF(self):
106        """
107            coordinates of the row profiles with respect to principal axes B
108        """
109        return self.__f
110    def getG(self):
111        """
112            coordinates of the column profiles with respect to principal axes A
113        """
114        return self.__g
115    def getPrincipalRowProfilesCoordinates(self, dim = (0, 1)):
116       """Returns principal co-ordinates of row profiles with respect
117       to principal axis B.
118       Dim defines which principal axes should be taken into account.
119       """
120       if len(dim) == 0:
121           raise Exception("Dim tuple cannot be of lenght zero")
122       return array(take(self.__f, dim, 1))
123    def getPrincipalColProfilesCoordinates(self, dim = (0, 1)):
124       """Returns principal co-ordinates of column profiles with respect
125       to principal axes A.
126       Dim defines which principal axes should be taken into account.
127       """
128       if len(dim) == 0:
129           raise Exception("Dim tuple cannot be of lenght zero")
130       return array(take(self.__g, dim, 1))
131    def getStandardRowCoordinates(self):
132        dinv = where(self.__d != 0, 1. / self.__d, 0)
133        return matrixmultiply(self.__f, transpose(dinv))
134    def getStandardColCoordinates(self):
135        dinv = where(self.__d != 0, 1. / self.__d, 0)
136        return matrixmultiply(self.__g, dinv)
137    def DecompositionOfInertia(self, axis = 0):
138        """
139            axis = 0 decomposition across rows
140            axis = 1 decomposition across columns
141
142            Columns of this matrix represents contribution of the rows or columns to the inertia of axis.
143        """
144        if axis == 0:
145            return multiply(self.__rowSums, multiply(self.__f, self.__f))
146        else:
147            return multiply(transpose(self.__colSums), multiply(self.__g, self.__g))
148    def InertiaOfAxis(self, percentage = 0):
149        inertias = array(sum(self.DecompositionOfInertia(), 0).tolist()[0])
150        if percentage:
151            return inertias / sum(inertias) * 100
152        else:
153            return inertias
154    def ContributionOfPointsToAxis(self, rowColumn = 0, axis = 0, percentage = 0):
155        contribution = array(transpose(self.DecompositionOfInertia(rowColumn)[:,axis]).tolist()[0])
156        if percentage:
157            return contribution / sum(contribution) * 100
158        else:
159            return contribution
160    def PointsWithMostInertia(self, rowColumn = 0, axis = (0, 1)):
161        contribution = self.ContributionOfPointsToAxis(rowColumn = rowColumn, axis = axis[0], percentage = 0) + \
162                        self.ContributionOfPointsToAxis(rowColumn = rowColumn, axis = axis[1], percentage = 0)
163        tmp = zip(range(len(contribution)), contribution)
164
165        tmp.sort(lambda x, y: cmp(x[1], y[1]))
166
167        a = [i for (i, v) in tmp]
168        a.reverse()
169        return a
170
171    def PlotScreeDiagram(self):
172        # todo: legend, axis, etc
173        import pylab
174        pylab.plot(range(1, min(self.__dataMatrix.shape) + 1), self.InertiaOfAxis(1))
175        pylab.axis([0, min(self.__dataMatrix.shape) + 1, 0, 100])
176        pylab.show()
177
178    def Biplot(self, dim = (0, 1)):
179        import pylab
180        if len(dim) != 2:
181           raise Exception("Dim tuple must be of length two")
182        pylab.plot(self.getPrincipalRowProfilesCoordinates()[:, dim[0]], self.getPrincipalRowProfilesCoordinates()[:, dim[1]], 'ro',
183            self.getPrincipalColProfilesCoordinates()[:, dim[0]], self.getPrincipalColProfilesCoordinates()[:, dim[1]], 'bs')
184        if self.labelR:
185            for i, x, y in zip(range(len(self.getPrincipalRowProfilesCoordinates()[:, dim[0]])), \
186                                    self.getPrincipalRowProfilesCoordinates()[:, dim[0]], \
187                                    self.getPrincipalRowProfilesCoordinates()[:, dim[1]]):
188                pylab.text(x, y, self.labelR[i], horizontalalignment='center')
189        if self.labelC:
190            for i, x, y in zip(range(len(self.getPrincipalColProfilesCoordinates()[:, dim[0]])), \
191                                    self.getPrincipalColProfilesCoordinates()[:, dim[0]], \
192                                    self.getPrincipalColProfilesCoordinates()[:, dim[1]]):
193                pylab.text(x, y, self.labelC[i], horizontalalignment='center')
194        pylab.grid()
195        pylab.show()
196
197    A = property(getA)
198    B = property(getB)
199    D = property(getD)
200    F = property(getF)
201    G = property(getG)
202
203if __name__ == '__main__':
204##    a = random.random_integers(0, 100, 100).reshape(10,-1)
205##    c = CA(a)
206##    c.Biplot()
207
208##    data = matrix([[72,    39,    26,    23 ,    4],
209##    [95,    58,    66,    84,    41],
210##    [80,    73,    83,     4 ,   96],
211##    [79,    93,    35,    73,    63]])
212##
213##    data = [[9, 11, 4],
214##                [ 3,          5,          3],
215##                [     11,          6,          3],
216##                [24,         73,         48]]
217
218
219    data = input('doc\\datasets\\smokers.tab')
220    c = CA(data, ['Senior Managers', 'Junior Managers', 'Senior Employees', 'Junior Employees', 'Secretaries'],
221        ['None', 'Light', 'Medium', 'Heavy'])
222#    c.PlotScreeDiagram()
Note: See TracBrowser for help on using the repository browser.