# source:orange/Orange/projection/mds.py@10580:c4cbae8dcf8b

Revision 10580:c4cbae8dcf8b, 14.6 KB checked in by markotoplak, 2 years ago (diff)

Moved deprecation functions, progress bar support and environ into Orange.utils. Orange imports cleanly, although it is not tested yet.

Line
1"""
2.. index:: multidimensional scaling (mds)
3
4.. index::
5   single: projection; multidimensional scaling (mds)
6
7**********************************
8Multidimensional scaling (``mds``)
9**********************************
10
11The functionality to perform multidimensional scaling
12(http://en.wikipedia.org/wiki/Multidimensional_scaling).
13
14The main class to perform multidimensional scaling is
15:class:`Orange.projection.mds.MDS`
16
17.. autoclass:: Orange.projection.mds.MDS
18   :members:
19   :exclude-members: Torgerson, get_distance, get_stress, calc_stress, run
20
21   .. automethod:: calc_stress(stress_func=SgnRelStress)
22   .. automethod:: run(iter, stress_func=SgnRelStress, eps=1e-3, progress_callback=None)
23
24Stress functions
25================
26
27Stress functions that can be used for MDS have to be implemented as functions
28or callable classes:
29
30    .. method:: \ __call__(correct, current, weight=1.0)
31
32       Compute the stress using the correct and the current distance value (the
33       :obj:`Orange.projection.mds.MDS.distances` and
34       :obj:`Orange.projection.mds.MDS.projected_distances` elements).
35
36       :param correct: correct (actual) distance between elements, represented by
37           the two points.
38       :type correct: float
39
40       :param current: current distance between the points in the MDS space.
41       :type current: float
42
43This module provides the following stress functions:
44
45   * :obj:`SgnRelStress`
46   * :obj:`KruskalStress`
47   * :obj:`SammonStress`
48   * :obj:`SgnSammonStress`
49
50Examples
51========
52
53MDS Scatterplot
54---------------
55
56The following script computes the Euclidean distance between the data
57instances and runs MDS. Final coordinates are plotted with matplotlib
58(not included with orange, http://matplotlib.sourceforge.net/).
59
61
62.. literalinclude:: code/mds-scatterplot.py
63    :lines: 7-
64
65The script produces a file *mds-scatterplot.py.png*. Color denotes
66the class. Iris is a relatively simple data set with respect to
67classification; to no surprise we see that MDS finds such instance
68placement in 2D where instances of different classes are well separated.
69Note that MDS has no knowledge of points' classes.
70
71.. image:: files/mds-scatterplot.png
72
73
75-----------------------
76
77The following script performs 10 steps of Smacof optimization before computing
78the stress. This is suitable if you have a large dataset and want to save some
79time.
80
82
84    :lines: 7-
85
86A few representative lines of the output are::
87
88    <-0.633911848068, 0.112218663096> [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
89    <-0.624193906784, -0.111143872142> [4.9, 3.0, 1.4, 0.2, 'Iris-setosa']
90    ...
91    <0.265250980854, 0.237793982029> [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor']
92    <0.208580598235, 0.116296850145> [6.4, 3.2, 4.5, 1.5, 'Iris-versicolor']
93    ...
94    <0.635814905167, 0.238721415401> [6.3, 3.3, 6.0, 2.5, 'Iris-virginica']
95    <0.356859534979, -0.175976261497> [5.8, 2.7, 5.1, 1.9, 'Iris-virginica']
96    ...
97
98
99"""
100
101import numpy
102from numpy.linalg import svd
103
104import Orange.core
105from Orange import orangeom as orangemds
106from Orange.utils import deprecated_keywords
107from Orange.utils import deprecated_members
108
109KruskalStress = orangemds.KruskalStress()
110SammonStress = orangemds.SammonStress()
111SgnSammonStress = orangemds.SgnSammonStress()
112SgnRelStress = orangemds.SgnRelStress()
113
114PointList = Orange.core.FloatListList
115FloatListList = Orange.core.FloatListList
116
117def _mycompare((a,aa),(b,bb)):
118    if a == b:
119        return 0
120    if a < b:
121        return -1
122    else:
123        return 1
124
125class PivotMDS(object):
126    def __init__(self, distances=None, pivots=50, dim=2, **kwargs):
127        self.dst = numpy.array([m for m in distances])
128        self.n = len(self.dst)
129
130        if type(pivots) == type(1):
131            self.k = pivots
132            self.pivots = numpy.random.permutation(len(self.dst))[:pivots]
133            #self.pivots.sort()
134        elif type(pivots) == type([]):
135            self.pivots = pivots
136            #self.pivots.sort()
137            self.k = len(self.pivots)
138        else:
139            raise AttributeError('pivots')
140
141    def optimize(self):
142#        # Classical MDS (Torgerson)
143#        J = identity(self.n) - (1/float(self.n))
144#        B = -1/2. * dot(dot(J, self.dst**2), J)
145#        w,v = linalg.eig(B)
146#        tmp = zip([float(val) for val in w], range(self.n))
147#        tmp.sort()
148#        w1, w2 = tmp[-1][0], tmp[-2][0]
149#        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
150#        return v1 * sqrt(w1), v2 * sqrt(w2)
151
152        # Pivot MDS
153        d = self.dst[[self.pivots]].T
154        C = d**2
155        # double-center d
156        cavg = numpy.sum(d, axis=0)/(self.k+0.0)      # column sum
157        ravg = numpy.sum(d, axis=1)/(self.n+0.0)    # row sum
158        tavg = numpy.sum(cavg)/(self.n+0.0)   # total sum
159        # TODO: optimize
160        for i in xrange(self.n):
161            for j in xrange(self.k):
162                C[i,j] += -ravg[i] - cavg[j]
163
164        C = -0.5 * (C + tavg)
165        w,v = numpy.linalg.eig(numpy.dot(C.T, C))
166        tmp = zip([float(val) for val in w], range(self.n))
167        tmp.sort()
168        w1, w2 = tmp[-1][0], tmp[-2][0]
169        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
170        x = numpy.dot(C, v1)
171        y = numpy.dot(C, v2)
172        return x, y
173
174
175class MDS(object):
176    """
177    Main class for performing multidimensional scaling.
178
179    :param distances: original dissimilarity - a distance matrix to operate on.
180    :type distances: :class:`Orange.misc.SymMatrix`
181
182    :param dim: dimension of the projected space.
183    :type dim: int
184
185    :param points: an initial configuration of points (optional)
186    :type points: :class:`Orange.core.FloatListList`
187
188    An instance of MDS object has the following attributes and functions:
189
190    .. attribute:: points
191
192       Holds the current configuration of projected points in an
193       :class:`Orange.core.FloatListList` object.
194
195    .. attribute:: distances
196
197       An :class:`Orange.misc.SymMatrix` containing the distances that we
198       want to achieve (lsmt changes these).
199
200    .. attribute:: projected_distances
201
202       An :class:`Orange.misc.SymMatrix` containing the distances between
203       projected points.
204
205    .. attribute:: original_distances
206
207       An :class:`Orange.misc.SymMatrix` containing the original distances
208       between points.
209
210    .. attribute:: stress
211
212       An :class:`Orange.misc.SymMatrix` holding the stress.
213
214    .. attribute:: dim
215
216       An integer holding the dimension of the projected space.
217
218    .. attribute:: n
219
220       An integer holding the number of elements (points).
221
222    .. attribute:: avg_stress
223
224       A float holding the average stress in the :obj:`stress` matrix.
225
226    .. attribute:: progress_callback
227
228       A function that gets called after each optimization step in the
229       :func:`run` method.
230
231    """
232
233    def __init__(self, distances=None, dim=2, **kwargs):
234        self.mds=orangemds.MDS(distances, dim, **kwargs)
235        self.original_distances=Orange.misc.SymMatrix([m for m in self.distances])
236
237    def __getattr__(self, name):
238        if name in ["points", "projected_distances", "distances" ,"stress",
239                    "progress_callback", "n", "dim", "avg_stress"]:
240            #print "rec:",name
241            return self.__dict__["mds"].__dict__[name]
242        else:
243            raise AttributeError(name)
244
245    def __setattr__(self, name, value):
246        #print "setattr"
247        if name=="points":
248            for i in range(len(value)):
249                for j in range(len(value[i])):
250                    self.mds.points[i][j]=value[i][j]
251            return
252
253        if name in ["projected_distances", "distances" ,"stress",
254                    "progress_callback"]:
255            self.mds.__setattr__(name, value)
256        else:
257            self.__dict__[name]=value
258
259    def __nonzero__(self):
260        return True
261
262    def smacof_step(self):
263        """
264        Perform a single iteration of a Smacof algorithm that optimizes
265        :obj:`stress` and updates the :obj:`points`.
266        """
267        self.mds.SMACOFstep()
268
269    def calc_distance(self):
270        """
271        Compute the distances between points and update the
272        :obj:`projected_distances` matrix.
273
274        """
275        self.mds.get_distance()
276
277
278    @deprecated_keywords({"stressFunc": "stress_func"})
279    def calc_stress(self, stress_func=SgnRelStress):
280        """
281        Compute the stress between the current :obj:`projected_distances` and
282        :obj:`distances` matrix using *stress_func* and update the
283        :obj:`stress` matrix and :obj:`avgStress` accordingly.
284
285        """
286        self.mds.getStress(stress_func)
287
288    @deprecated_keywords({"stressFunc": "stress_func"})
289    def optimize(self, iter, stress_func=SgnRelStress, eps=1e-3,
290                 progress_callback=None):
291        self.mds.progress_callback=progress_callback
292        self.mds.optimize(iter, stress_func, eps)
293
294    @deprecated_keywords({"stressFunc": "stress_func"})
295    def run(self, iter, stress_func=SgnRelStress, eps=1e-3,
296            progress_callback=None):
297        """
298        Perform optimization until stopping conditions are met.
299        Stopping conditions are:
300
301           * optimization runs for *iter* iterations of smacof_step function, or
302           * stress improvement (old stress minus new stress) is smaller than
303             eps * old stress.
304
305        :param iter: maximum number of optimization iterations.
306        :type iter: int
307
308        :param stress_func: stress function.
309        """
310        self.optimize(iter, stress_func, eps, progress_callback)
311
312    def torgerson(self):
313        """
314        Run the Torgerson algorithm that computes an initial analytical
315        solution of the problem.
316
317        """
318        # Torgerson's initial approximation
319        O = numpy.array([m for m in self.distances])
320
321##        #B = matrixmultiply(O,O)
322##        # bug!? B = O**2
323##        B = dot(O,O)
324##        # double-center B
325##        cavg = sum(B, axis=0)/(self.n+0.0)      # column sum
326##        ravg = sum(B, axis=1)/(self.n+0.0)    # row sum
327##        tavg = sum(cavg)/(self.n+0.0)   # total sum
328##        # B[row][column]
329##        for i in xrange(self.n):
330##            for j in xrange(self.n):
331##                B[i,j] += -cavg[j]-ravg[i]
332##        B = -0.5*(B+tavg)
333
334        # B = double-center O**2 !!!
335        J = numpy.identity(self.n) - (1/numpy.float(self.n))
336        B = -0.5 * numpy.dot(numpy.dot(J, O**2), J)
337
338        # SVD-solve B = ULU'
339        #(U,L,V) = singular_value_decomposition(B)
340        (U,L,V)=svd(B)
341        # X = U(L^0.5)
342        # # self.X = matrixmultiply(U,identity(self.n)*sqrt(L))
343        # X is n-dimensional, we take the two dimensions with the largest singular values
344        idx = numpy.argsort(L)[-self.dim:].tolist()
345        idx.reverse()
346
347        Lt = numpy.take(L,idx)   # take those singular values
348        Ut = numpy.take(U,idx,axis=1) # take those columns that are enabled
349        Dt = numpy.identity(self.dim)*numpy.sqrt(Lt)  # make a diagonal matrix, with squarooted values
350        self.points = Orange.core.FloatListList(numpy.dot(Ut,Dt))
351        self.freshD = 0
352
353#        D = identity(self.n)*sqrt(L)  # make a diagonal matrix, with squarooted values
354#        X = matrixmultiply(U,D)
355#        self.X = take(X,idx,1)
356
357    # Kruskal's monotone transformation
358    def lsmt(self):
359        """
360        Execute Kruskal monotone transformation.
361
362        """
363        # optimize the distance transformation
364        # build vector o
365        effect = 0
366        self.getDistance()
367        o = []
368        for i in xrange(1,self.n):
369            for j in xrange(i):
370                o.append((self.original_distances[i,j],(i,j)))
371        o.sort(_mycompare)
372        # find the ties in o, and construct the d vector sorting in order within ties
373        d = []
374        td = []
375        uv = [] # numbers of consecutively tied o values
376        (i,j) = o[0][1]
377        distnorm = self.projected_distances[i,j]*self.projected_distances[i,j]
378        td = [self.projected_distances[i,j]] # fetch distance
379        for l in xrange(1,len(o)):
380            # copy now sorted distances in an array
381            # but sort distances within a tied o
382            (i,j) = o[l][1]
383            cd = self.projected_distances[i,j]
384            distnorm += self.projected_distances[i,j]*self.projected_distances[i,j]
385            if o[l][0] != o[l-1][0]:
386                # differing value, flush
387                sum = reduce(lambda x,y:x+y,td)+0.0
388                d.append([sum,len(td),sum/len(td),td])
389                td = []
390            td.append(cd)
391        sum = reduce(lambda x,y:x+y,td)+0.0
392        d.append([sum,len(td),sum/len(td),td])
393        ####
394        # keep merging non-monotonous areas in d
395        monotony = 0
396        while not monotony and len(d) > 1:
397            monotony = 1
398            pi = 0 # index
399            n = 1  # n-areas
400            nd = []
401            r = d[0] # current area
402            for i in range(1,len(d)):
403                tr = d[i]
404                if r[2]>=tr[2]:
405                    monotony = 0
406                    effect = 1
407                    r[0] += tr[0]
408                    r[1] += tr[1]
409                    r[2] = tr[0]/tr[1]
410                    r[3] += tr[3]
411                else:
412                    nd.append(r)
413                    r = tr
414            nd.append(r)
415            d = nd
416        # normalizing multiplier
417        sum = 0.0
418        for i in d:
419            sum += i[2]*i[2]*i[1]
420        f = numpy.sqrt(distnorm/numpy.max(sum,1e-6))
421        # transform O
422        k = 0
423        for i in d:
424            for j in range(i[1]):
425                (ii,jj) = o[k][1]
426                self.distances[ii,jj] = f*i[2]
427                k += 1
428        assert(len(o) == k)
429        self.freshD = 0
430        return effect
431
432MDS = deprecated_members({"projectedDistances": "projected_distances",
433                     "originalDistances": "original_distances",
434                     "avgStress": "avg_stress",
435                     "progressCallback": "progress_callback",
436                     "getStress": "calc_stress",
437                     "get_stress": "calc_stress",
438                     "calcStress": "calc_stress",
439                     "getDistance": "calc_distance",
440                     "get_distance": "calc_distance",
441                     "calcDistance": "calc_distance",
442                     "Torgerson": "torgerson",
443                     "SMACOFstep": "smacof_step",
444                     "LSMT": "lsmt"})(MDS)
Note: See TracBrowser for help on using the repository browser.