source: orange/orange/Orange/projection/mds.py @ 9349:fa13a2c52fcd

Revision 9349:fa13a2c52fcd, 14.4 KB checked in by mitar, 2 years ago (diff)

Changed way of linking to code in documentation.

Line 
1"""
2.. index:: multidimensional scaling (mds)
3
4.. index::
5   single: projection; multidimensional scaling (mds)
6
7**********************************
8Multidimensional scaling (``mds``)
9**********************************
10
11The functionality to perform multidimensional scaling
12(http://en.wikipedia.org/wiki/Multidimensional_scaling).
13
14The main class to perform multidimensional scaling is
15:class:`Orange.projection.mds.MDS`
16
17.. autoclass:: Orange.projection.mds.MDS
18   :members:
19   :exclude-members: Torgerson, get_distance, get_stress
20
21Stress functions
22================
23
24Stress functions that can be used for MDS have to be implemented as functions
25or callable classes:
26
27    .. method:: \ __call__(correct, current, weight=1.0)
28       
29       Compute the stress using the correct and the current distance value (the
30       :obj:`Orange.projection.mds.MDS.distances` and
31       :obj:`Orange.projection.mds.MDS.projected_distances` elements).
32       
33       :param correct: correct (actual) distance between elements, represented by
34           the two points.
35       :type correct: float
36       
37       :param current: current distance between the points in the MDS space.
38       :type current: float
39
40This module provides the following stress functions:
41
42   * :obj:`SgnRelStress`
43   * :obj:`KruskalStress`
44   * :obj:`SammonStress`
45   * :obj:`SgnSammonStress`
46
47Examples
48========
49
50MDS Scatterplot
51---------------
52
53The following script computes the Euclidean distance between the data
54instances and runs MDS. Final coordinates are plotted with matplotlib
55(not included with orange, http://matplotlib.sourceforge.net/).
56
57Example (:download:`mds-scatterplot.py <code/mds-scatterplot.py>`, uses :download:`iris.tab <code/iris.tab>`)
58
59.. literalinclude:: code/mds-scatterplot.py
60    :lines: 7-
61
62The script produces a file *mds-scatterplot.py.png*. Color denotes
63the class. Iris is a relatively simple data set with respect to
64classification; to no surprise we see that MDS finds such instance
65placement in 2D where instances of different classes are well separated.
66Note that MDS has no knowledge of points' classes.
67
68.. image:: files/mds-scatterplot.png
69
70
71A more advanced example
72-----------------------
73
74The following script performs 10 steps of Smacof optimization before computing
75the stress. This is suitable if you have a large dataset and want to save some
76time.
77
78Example (:download:`mds-advanced.py <code/mds-advanced.py>`, uses :download:`iris.tab <code/iris.tab>`)
79
80.. literalinclude:: code/mds-advanced.py
81    :lines: 7-
82
83A few representative lines of the output are::
84
85    <-0.633911848068, 0.112218663096> [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
86    <-0.624193906784, -0.111143872142> [4.9, 3.0, 1.4, 0.2, 'Iris-setosa']
87    ...
88    <0.265250980854, 0.237793982029> [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor']
89    <0.208580598235, 0.116296850145> [6.4, 3.2, 4.5, 1.5, 'Iris-versicolor']
90    ...
91    <0.635814905167, 0.238721415401> [6.3, 3.3, 6.0, 2.5, 'Iris-virginica']
92    <0.356859534979, -0.175976261497> [5.8, 2.7, 5.1, 1.9, 'Iris-virginica']
93    ...
94
95
96"""
97
98
99from math import *
100from numpy import *
101from numpy.linalg import svd
102
103import Orange.core
104import orangeom as orangemds
105from Orange.misc import deprecated_keywords
106from Orange.misc import deprecated_members
107
108KruskalStress = orangemds.KruskalStress()
109SammonStress = orangemds.SammonStress()
110SgnSammonStress = orangemds.SgnSammonStress()
111SgnRelStress = orangemds.SgnRelStress()
112
113PointList = Orange.core.FloatListList
114FloatListList = Orange.core.FloatListList
115
116def _mycompare((a,aa),(b,bb)):
117    if a == b:
118        return 0
119    if a < b:
120        return -1
121    else:
122        return 1
123           
124class PivotMDS(object):
125    def __init__(self, distances=None, pivots=50, dim=2, **kwargs):
126        self.dst = array([m for m in distances])
127        self.n = len(self.dst)
128
129        if type(pivots) == type(1):
130            self.k = pivots
131            self.pivots = random.permutation(len(self.dst))[:pivots]
132            #self.pivots.sort()
133        elif type(pivots) == type([]):
134            self.pivots = pivots
135            #self.pivots.sort()
136            self.k = len(self.pivots)
137        else:
138            raise AttributeError('pivots')
139       
140    def optimize(self):
141#        # Classical MDS (Torgerson)
142#        J = identity(self.n) - (1/float(self.n))
143#        B = -1/2. * dot(dot(J, self.dst**2), J)
144#        w,v = linalg.eig(B)
145#        tmp = zip([float(val) for val in w], range(self.n))
146#        tmp.sort()
147#        w1, w2 = tmp[-1][0], tmp[-2][0]
148#        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
149#        return v1 * sqrt(w1), v2 * sqrt(w2)
150       
151        # Pivot MDS
152        d = self.dst[[self.pivots]].T
153        C = d**2
154        # double-center d
155        cavg = sum(d, axis=0)/(self.k+0.0)      # column sum
156        ravg = sum(d, axis=1)/(self.n+0.0)    # row sum
157        tavg = sum(cavg)/(self.n+0.0)   # total sum
158        # TODO: optimize
159        for i in xrange(self.n):
160            for j in xrange(self.k):
161                C[i,j] += -ravg[i] - cavg[j]
162       
163        C = -0.5 * (C + tavg)
164        w,v = linalg.eig(dot(C.T, C))
165        tmp = zip([float(val) for val in w], range(self.n))
166        tmp.sort()
167        w1, w2 = tmp[-1][0], tmp[-2][0]
168        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
169        x = dot(C, v1)
170        y = dot(C, v2)
171        return x, y
172       
173       
174class MDS(object):
175    """
176    Main class for performing multidimensional scaling.
177   
178    :param distances: original dissimilarity - a distance matrix to operate on.
179    :type distances: :class:`Orange.core.SymMatrix`
180   
181    :param dim: dimension of the projected space.
182    :type dim: int
183   
184    :param points: an initial configuration of points (optional)
185    :type points: :class:`Orange.core.FloatListList`
186   
187    An instance of MDS object has the following attributes and functions:
188   
189    .. attribute:: points
190       
191       Holds the current configuration of projected points in an
192       :class:`Orange.core.FloatListList` object.
193       
194    .. attribute:: distances
195   
196       An :class:`Orange.core.SymMatrix` containing the distances that we
197       want to achieve (lsmt changes these).
198       
199    .. attribute:: projected_distances
200
201       An :class:`Orange.core.SymMatrix` containing the distances between
202       projected points.
203       
204    .. attribute:: original_distances
205
206       An :class:`Orange.core.SymMatrix` containing the original distances
207       between points.
208       
209    .. attribute:: stress
210       
211       An :class:`Orange.core.SymMatrix` holding the stress.
212   
213    .. attribute:: dim
214
215       An integer holding the dimension of the projected space.
216       
217    .. attribute:: n
218
219       An integer holding the number of elements (points).
220       
221    .. attribute:: avg_stress
222
223       A float holding the average stress in the :obj:`stress` matrix.
224       
225    .. attribute:: progress_callback
226
227       A function that gets called after each optimization step in the
228       :func:`run` method.
229   
230    """
231   
232    def __init__(self, distances=None, dim=2, **kwargs):
233        self.mds=orangemds.MDS(distances, dim, **kwargs)
234        self.original_distances=Orange.core.SymMatrix([m for m in self.distances])
235
236    def __getattr__(self, name):
237        if name in ["points", "projected_distances", "distances" ,"stress",
238                    "progress_callback", "n", "dim", "avg_stress"]:
239            #print "rec:",name           
240            return self.__dict__["mds"].__dict__[name]
241        else:
242            raise AttributeError(name)
243
244    def __setattr__(self, name, value):
245        #print "setattr"
246        if name=="points":
247            for i in range(len(value)):
248                for j in range(len(value[i])):
249                    self.mds.points[i][j]=value[i][j]
250            return
251           
252        if name in ["projected_distances", "distances" ,"stress",
253                    "progress_callback"]:
254            self.mds.__setattr__(name, value)
255        else:
256            self.__dict__[name]=value
257           
258    def __nonzero__(self):
259        return True
260           
261    def smacof_step(self):
262        """
263        Perform a single iteration of a Smacof algorithm that optimizes
264        :obj:`stress` and updates the :obj:`points`.
265        """
266        self.mds.SMACOFstep()
267
268    def calc_distance(self):
269        """
270        Compute the distances between points and update the
271        :obj:`projected_distances` matrix.
272       
273        """
274        self.mds.get_distance()
275       
276
277    @deprecated_keywords({"stressFunc": "stress_func"})
278    def calc_stress(self, stress_func=SgnRelStress):
279        """
280        Compute the stress between the current :obj:`projected_distances` and
281        :obj:`distances` matrix using *stress_func* and update the
282        :obj:`stress` matrix and :obj:`avgStress` accordingly.
283       
284        """
285        self.mds.getStress(stress_func)
286
287    @deprecated_keywords({"stressFunc": "stress_func"})
288    def optimize(self, iter, stress_func=SgnRelStress, eps=1e-3,
289                 progress_callback=None):
290        self.mds.progress_callback=progress_callback
291        self.mds.optimize(iter, stress_func, eps)
292
293    @deprecated_keywords({"stressFunc": "stress_func"})
294    def run(self, iter, stress_func=SgnRelStress, eps=1e-3,
295            progress_callback=None):
296        """
297        Perform optimization until stopping conditions are met.
298        Stopping conditions are:
299           
300           * optimization runs for *iter* iterations of smacof_step function, or
301           * stress improvement (old stress minus new stress) is smaller than
302             eps * old stress.
303       
304        :param iter: maximum number of optimization iterations.
305        :type iter: int
306       
307        :param stress_func: stress function.
308        """
309        self.optimize(iter, stress_func, eps, progress_callback)
310
311    def torgerson(self):
312        """
313        Run the Torgerson algorithm that computes an initial analytical
314        solution of the problem.
315       
316        """
317        # Torgerson's initial approximation
318        O = array([m for m in self.distances])
319       
320##        #B = matrixmultiply(O,O)
321##        # bug!? B = O**2
322##        B = dot(O,O)
323##        # double-center B
324##        cavg = sum(B, axis=0)/(self.n+0.0)      # column sum
325##        ravg = sum(B, axis=1)/(self.n+0.0)    # row sum
326##        tavg = sum(cavg)/(self.n+0.0)   # total sum
327##        # B[row][column]
328##        for i in xrange(self.n):
329##            for j in xrange(self.n):
330##                B[i,j] += -cavg[j]-ravg[i]
331##        B = -0.5*(B+tavg)
332
333        # B = double-center O**2 !!!
334        J = identity(self.n) - (1/float(self.n))
335        B = -0.5 * dot(dot(J, O**2), J)
336       
337        # SVD-solve B = ULU'
338        #(U,L,V) = singular_value_decomposition(B)
339        (U,L,V)=svd(B)
340        # X = U(L^0.5)
341        # # self.X = matrixmultiply(U,identity(self.n)*sqrt(L))
342        # X is n-dimensional, we take the two dimensions with the largest singular values
343        idx = argsort(L)[-self.dim:].tolist()
344        idx.reverse()
345       
346        Lt = take(L,idx)   # take those singular values
347        Ut = take(U,idx,axis=1) # take those columns that are enabled
348        Dt = identity(self.dim)*sqrt(Lt)  # make a diagonal matrix, with squarooted values
349        self.points = Orange.core.FloatListList(dot(Ut,Dt))
350        self.freshD = 0
351       
352#        D = identity(self.n)*sqrt(L)  # make a diagonal matrix, with squarooted values
353#        X = matrixmultiply(U,D)
354#        self.X = take(X,idx,1)
355   
356    # Kruskal's monotone transformation
357    def lsmt(self):
358        """
359        Execute Kruskal monotone transformation.
360       
361        """
362        # optimize the distance transformation
363        # build vector o
364        effect = 0
365        self.getDistance()
366        o = []
367        for i in xrange(1,self.n):
368            for j in xrange(i):
369                o.append((self.original_distances[i,j],(i,j)))
370        o.sort(_mycompare)
371        # find the ties in o, and construct the d vector sorting in order within ties
372        d = []
373        td = []
374        uv = [] # numbers of consecutively tied o values
375        (i,j) = o[0][1]
376        distnorm = self.projected_distances[i,j]*self.projected_distances[i,j]
377        td = [self.projected_distances[i,j]] # fetch distance
378        for l in xrange(1,len(o)):
379            # copy now sorted distances in an array
380            # but sort distances within a tied o
381            (i,j) = o[l][1]
382            cd = self.projected_distances[i,j]
383            distnorm += self.projected_distances[i,j]*self.projected_distances[i,j]
384            if o[l][0] != o[l-1][0]:
385                # differing value, flush
386                sum = reduce(lambda x,y:x+y,td)+0.0
387                d.append([sum,len(td),sum/len(td),td])
388                td = []
389            td.append(cd)
390        sum = reduce(lambda x,y:x+y,td)+0.0
391        d.append([sum,len(td),sum/len(td),td])
392        ####
393        # keep merging non-monotonous areas in d
394        monotony = 0
395        while not monotony and len(d) > 1:
396            monotony = 1
397            pi = 0 # index
398            n = 1  # n-areas
399            nd = []
400            r = d[0] # current area
401            for i in range(1,len(d)):
402                tr = d[i]
403                if r[2]>=tr[2]:
404                    monotony = 0
405                    effect = 1
406                    r[0] += tr[0]
407                    r[1] += tr[1]
408                    r[2] = tr[0]/tr[1]
409                    r[3] += tr[3]
410                else:
411                    nd.append(r)
412                    r = tr
413            nd.append(r)
414            d = nd
415        # normalizing multiplier
416        sum = 0.0
417        for i in d:
418            sum += i[2]*i[2]*i[1]
419        f = sqrt(distnorm/max(sum,1e-6))
420        # transform O
421        k = 0
422        for i in d:
423            for j in range(i[1]):
424                (ii,jj) = o[k][1]
425                self.distances[ii,jj] = f*i[2]
426                k += 1
427        assert(len(o) == k)
428        self.freshD = 0
429        return effect
430   
431MDS = deprecated_members({"projectedDistances": "projected_distances",
432                     "originalDistances": "original_distances",
433                     "avgStress": "avg_stress",
434                     "progressCallback": "progress_callback",
435                     "getStress": "calc_stress",
436                     "get_stress": "calc_stress",
437                     "calcStress": "calc_stress",
438                     "getDistance": "calc_distance",
439                     "get_distance": "calc_distance",
440                     "calcDistance": "calc_distance",
441                     "Torgerson": "torgerson",
442                     "SMACOFstep": "smacof_step",
443                     "LSMT": "lsmt"})(MDS)
Note: See TracBrowser for help on using the repository browser.