source: orange/Orange/projection/mds.py @ 10194:1d3b705df05a

Revision 10194:1d3b705df05a, 14.4 KB checked in by anzeh <anze.staric@…>, 2 years ago (diff)

Replaced from numpy import * with import numpy

Line 
1"""
2.. index:: multidimensional scaling (mds)
3
4.. index::
5   single: projection; multidimensional scaling (mds)
6
7**********************************
8Multidimensional scaling (``mds``)
9**********************************
10
11The functionality to perform multidimensional scaling
12(http://en.wikipedia.org/wiki/Multidimensional_scaling).
13
14The main class to perform multidimensional scaling is
15:class:`Orange.projection.mds.MDS`
16
17.. autoclass:: Orange.projection.mds.MDS
18   :members:
19   :exclude-members: Torgerson, get_distance, get_stress
20
21Stress functions
22================
23
24Stress functions that can be used for MDS have to be implemented as functions
25or callable classes:
26
27    .. method:: \ __call__(correct, current, weight=1.0)
28       
29       Compute the stress using the correct and the current distance value (the
30       :obj:`Orange.projection.mds.MDS.distances` and
31       :obj:`Orange.projection.mds.MDS.projected_distances` elements).
32       
33       :param correct: correct (actual) distance between elements, represented by
34           the two points.
35       :type correct: float
36       
37       :param current: current distance between the points in the MDS space.
38       :type current: float
39
40This module provides the following stress functions:
41
42   * :obj:`SgnRelStress`
43   * :obj:`KruskalStress`
44   * :obj:`SammonStress`
45   * :obj:`SgnSammonStress`
46
47Examples
48========
49
50MDS Scatterplot
51---------------
52
53The following script computes the Euclidean distance between the data
54instances and runs MDS. Final coordinates are plotted with matplotlib
55(not included with orange, http://matplotlib.sourceforge.net/).
56
57Example (:download:`mds-scatterplot.py <code/mds-scatterplot.py>`)
58
59.. literalinclude:: code/mds-scatterplot.py
60    :lines: 7-
61
62The script produces a file *mds-scatterplot.py.png*. Color denotes
63the class. Iris is a relatively simple data set with respect to
64classification; to no surprise we see that MDS finds such instance
65placement in 2D where instances of different classes are well separated.
66Note that MDS has no knowledge of points' classes.
67
68.. image:: files/mds-scatterplot.png
69
70
71A more advanced example
72-----------------------
73
74The following script performs 10 steps of Smacof optimization before computing
75the stress. This is suitable if you have a large dataset and want to save some
76time.
77
78Example (:download:`mds-advanced.py <code/mds-advanced.py>`)
79
80.. literalinclude:: code/mds-advanced.py
81    :lines: 7-
82
83A few representative lines of the output are::
84
85    <-0.633911848068, 0.112218663096> [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']
86    <-0.624193906784, -0.111143872142> [4.9, 3.0, 1.4, 0.2, 'Iris-setosa']
87    ...
88    <0.265250980854, 0.237793982029> [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor']
89    <0.208580598235, 0.116296850145> [6.4, 3.2, 4.5, 1.5, 'Iris-versicolor']
90    ...
91    <0.635814905167, 0.238721415401> [6.3, 3.3, 6.0, 2.5, 'Iris-virginica']
92    <0.356859534979, -0.175976261497> [5.8, 2.7, 5.1, 1.9, 'Iris-virginica']
93    ...
94
95
96"""
97
98import numpy
99from numpy.linalg import svd
100
101import Orange.core
102from Orange import orangeom as orangemds
103from Orange.misc import deprecated_keywords
104from Orange.misc import deprecated_members
105
106KruskalStress = orangemds.KruskalStress()
107SammonStress = orangemds.SammonStress()
108SgnSammonStress = orangemds.SgnSammonStress()
109SgnRelStress = orangemds.SgnRelStress()
110
111PointList = Orange.core.FloatListList
112FloatListList = Orange.core.FloatListList
113
114def _mycompare((a,aa),(b,bb)):
115    if a == b:
116        return 0
117    if a < b:
118        return -1
119    else:
120        return 1
121           
122class PivotMDS(object):
123    def __init__(self, distances=None, pivots=50, dim=2, **kwargs):
124        self.dst = numpy.array([m for m in distances])
125        self.n = len(self.dst)
126
127        if type(pivots) == type(1):
128            self.k = pivots
129            self.pivots = numpy.random.permutation(len(self.dst))[:pivots]
130            #self.pivots.sort()
131        elif type(pivots) == type([]):
132            self.pivots = pivots
133            #self.pivots.sort()
134            self.k = len(self.pivots)
135        else:
136            raise AttributeError('pivots')
137       
138    def optimize(self):
139#        # Classical MDS (Torgerson)
140#        J = identity(self.n) - (1/float(self.n))
141#        B = -1/2. * dot(dot(J, self.dst**2), J)
142#        w,v = linalg.eig(B)
143#        tmp = zip([float(val) for val in w], range(self.n))
144#        tmp.sort()
145#        w1, w2 = tmp[-1][0], tmp[-2][0]
146#        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
147#        return v1 * sqrt(w1), v2 * sqrt(w2)
148       
149        # Pivot MDS
150        d = self.dst[[self.pivots]].T
151        C = d**2
152        # double-center d
153        cavg = numpy.sum(d, axis=0)/(self.k+0.0)      # column sum
154        ravg = numpy.sum(d, axis=1)/(self.n+0.0)    # row sum
155        tavg = numpy.sum(cavg)/(self.n+0.0)   # total sum
156        # TODO: optimize
157        for i in xrange(self.n):
158            for j in xrange(self.k):
159                C[i,j] += -ravg[i] - cavg[j]
160       
161        C = -0.5 * (C + tavg)
162        w,v = numpy.linalg.eig(numpy.dot(C.T, C))
163        tmp = zip([float(val) for val in w], range(self.n))
164        tmp.sort()
165        w1, w2 = tmp[-1][0], tmp[-2][0]
166        v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]]
167        x = numpy.dot(C, v1)
168        y = numpy.dot(C, v2)
169        return x, y
170       
171       
172class MDS(object):
173    """
174    Main class for performing multidimensional scaling.
175   
176    :param distances: original dissimilarity - a distance matrix to operate on.
177    :type distances: :class:`Orange.misc.SymMatrix`
178   
179    :param dim: dimension of the projected space.
180    :type dim: int
181   
182    :param points: an initial configuration of points (optional)
183    :type points: :class:`Orange.core.FloatListList`
184   
185    An instance of MDS object has the following attributes and functions:
186   
187    .. attribute:: points
188       
189       Holds the current configuration of projected points in an
190       :class:`Orange.core.FloatListList` object.
191       
192    .. attribute:: distances
193   
194       An :class:`Orange.misc.SymMatrix` containing the distances that we
195       want to achieve (lsmt changes these).
196       
197    .. attribute:: projected_distances
198
199       An :class:`Orange.misc.SymMatrix` containing the distances between
200       projected points.
201       
202    .. attribute:: original_distances
203
204       An :class:`Orange.misc.SymMatrix` containing the original distances
205       between points.
206       
207    .. attribute:: stress
208       
209       An :class:`Orange.misc.SymMatrix` holding the stress.
210   
211    .. attribute:: dim
212
213       An integer holding the dimension of the projected space.
214       
215    .. attribute:: n
216
217       An integer holding the number of elements (points).
218       
219    .. attribute:: avg_stress
220
221       A float holding the average stress in the :obj:`stress` matrix.
222       
223    .. attribute:: progress_callback
224
225       A function that gets called after each optimization step in the
226       :func:`run` method.
227   
228    """
229   
230    def __init__(self, distances=None, dim=2, **kwargs):
231        self.mds=orangemds.MDS(distances, dim, **kwargs)
232        self.original_distances=Orange.misc.SymMatrix([m for m in self.distances])
233
234    def __getattr__(self, name):
235        if name in ["points", "projected_distances", "distances" ,"stress",
236                    "progress_callback", "n", "dim", "avg_stress"]:
237            #print "rec:",name           
238            return self.__dict__["mds"].__dict__[name]
239        else:
240            raise AttributeError(name)
241
242    def __setattr__(self, name, value):
243        #print "setattr"
244        if name=="points":
245            for i in range(len(value)):
246                for j in range(len(value[i])):
247                    self.mds.points[i][j]=value[i][j]
248            return
249           
250        if name in ["projected_distances", "distances" ,"stress",
251                    "progress_callback"]:
252            self.mds.__setattr__(name, value)
253        else:
254            self.__dict__[name]=value
255           
256    def __nonzero__(self):
257        return True
258           
259    def smacof_step(self):
260        """
261        Perform a single iteration of a Smacof algorithm that optimizes
262        :obj:`stress` and updates the :obj:`points`.
263        """
264        self.mds.SMACOFstep()
265
266    def calc_distance(self):
267        """
268        Compute the distances between points and update the
269        :obj:`projected_distances` matrix.
270       
271        """
272        self.mds.get_distance()
273       
274
275    @deprecated_keywords({"stressFunc": "stress_func"})
276    def calc_stress(self, stress_func=SgnRelStress):
277        """
278        Compute the stress between the current :obj:`projected_distances` and
279        :obj:`distances` matrix using *stress_func* and update the
280        :obj:`stress` matrix and :obj:`avgStress` accordingly.
281       
282        """
283        self.mds.getStress(stress_func)
284
285    @deprecated_keywords({"stressFunc": "stress_func"})
286    def optimize(self, iter, stress_func=SgnRelStress, eps=1e-3,
287                 progress_callback=None):
288        self.mds.progress_callback=progress_callback
289        self.mds.optimize(iter, stress_func, eps)
290
291    @deprecated_keywords({"stressFunc": "stress_func"})
292    def run(self, iter, stress_func=SgnRelStress, eps=1e-3,
293            progress_callback=None):
294        """
295        Perform optimization until stopping conditions are met.
296        Stopping conditions are:
297           
298           * optimization runs for *iter* iterations of smacof_step function, or
299           * stress improvement (old stress minus new stress) is smaller than
300             eps * old stress.
301       
302        :param iter: maximum number of optimization iterations.
303        :type iter: int
304       
305        :param stress_func: stress function.
306        """
307        self.optimize(iter, stress_func, eps, progress_callback)
308
309    def torgerson(self):
310        """
311        Run the Torgerson algorithm that computes an initial analytical
312        solution of the problem.
313       
314        """
315        # Torgerson's initial approximation
316        O = numpy.array([m for m in self.distances])
317       
318##        #B = matrixmultiply(O,O)
319##        # bug!? B = O**2
320##        B = dot(O,O)
321##        # double-center B
322##        cavg = sum(B, axis=0)/(self.n+0.0)      # column sum
323##        ravg = sum(B, axis=1)/(self.n+0.0)    # row sum
324##        tavg = sum(cavg)/(self.n+0.0)   # total sum
325##        # B[row][column]
326##        for i in xrange(self.n):
327##            for j in xrange(self.n):
328##                B[i,j] += -cavg[j]-ravg[i]
329##        B = -0.5*(B+tavg)
330
331        # B = double-center O**2 !!!
332        J = numpy.identity(self.n) - (1/numpy.float(self.n))
333        B = -0.5 * numpy.dot(numpy.dot(J, O**2), J)
334       
335        # SVD-solve B = ULU'
336        #(U,L,V) = singular_value_decomposition(B)
337        (U,L,V)=svd(B)
338        # X = U(L^0.5)
339        # # self.X = matrixmultiply(U,identity(self.n)*sqrt(L))
340        # X is n-dimensional, we take the two dimensions with the largest singular values
341        idx = numpy.argsort(L)[-self.dim:].tolist()
342        idx.reverse()
343       
344        Lt = numpy.take(L,idx)   # take those singular values
345        Ut = numpy.take(U,idx,axis=1) # take those columns that are enabled
346        Dt = numpy.identity(self.dim)*numpy.sqrt(Lt)  # make a diagonal matrix, with squarooted values
347        self.points = Orange.core.FloatListList(numpy.dot(Ut,Dt))
348        self.freshD = 0
349       
350#        D = identity(self.n)*sqrt(L)  # make a diagonal matrix, with squarooted values
351#        X = matrixmultiply(U,D)
352#        self.X = take(X,idx,1)
353   
354    # Kruskal's monotone transformation
355    def lsmt(self):
356        """
357        Execute Kruskal monotone transformation.
358       
359        """
360        # optimize the distance transformation
361        # build vector o
362        effect = 0
363        self.getDistance()
364        o = []
365        for i in xrange(1,self.n):
366            for j in xrange(i):
367                o.append((self.original_distances[i,j],(i,j)))
368        o.sort(_mycompare)
369        # find the ties in o, and construct the d vector sorting in order within ties
370        d = []
371        td = []
372        uv = [] # numbers of consecutively tied o values
373        (i,j) = o[0][1]
374        distnorm = self.projected_distances[i,j]*self.projected_distances[i,j]
375        td = [self.projected_distances[i,j]] # fetch distance
376        for l in xrange(1,len(o)):
377            # copy now sorted distances in an array
378            # but sort distances within a tied o
379            (i,j) = o[l][1]
380            cd = self.projected_distances[i,j]
381            distnorm += self.projected_distances[i,j]*self.projected_distances[i,j]
382            if o[l][0] != o[l-1][0]:
383                # differing value, flush
384                sum = reduce(lambda x,y:x+y,td)+0.0
385                d.append([sum,len(td),sum/len(td),td])
386                td = []
387            td.append(cd)
388        sum = reduce(lambda x,y:x+y,td)+0.0
389        d.append([sum,len(td),sum/len(td),td])
390        ####
391        # keep merging non-monotonous areas in d
392        monotony = 0
393        while not monotony and len(d) > 1:
394            monotony = 1
395            pi = 0 # index
396            n = 1  # n-areas
397            nd = []
398            r = d[0] # current area
399            for i in range(1,len(d)):
400                tr = d[i]
401                if r[2]>=tr[2]:
402                    monotony = 0
403                    effect = 1
404                    r[0] += tr[0]
405                    r[1] += tr[1]
406                    r[2] = tr[0]/tr[1]
407                    r[3] += tr[3]
408                else:
409                    nd.append(r)
410                    r = tr
411            nd.append(r)
412            d = nd
413        # normalizing multiplier
414        sum = 0.0
415        for i in d:
416            sum += i[2]*i[2]*i[1]
417        f = numpy.sqrt(distnorm/numpy.max(sum,1e-6))
418        # transform O
419        k = 0
420        for i in d:
421            for j in range(i[1]):
422                (ii,jj) = o[k][1]
423                self.distances[ii,jj] = f*i[2]
424                k += 1
425        assert(len(o) == k)
426        self.freshD = 0
427        return effect
428   
429MDS = deprecated_members({"projectedDistances": "projected_distances",
430                     "originalDistances": "original_distances",
431                     "avgStress": "avg_stress",
432                     "progressCallback": "progress_callback",
433                     "getStress": "calc_stress",
434                     "get_stress": "calc_stress",
435                     "calcStress": "calc_stress",
436                     "getDistance": "calc_distance",
437                     "get_distance": "calc_distance",
438                     "calcDistance": "calc_distance",
439                     "Torgerson": "torgerson",
440                     "SMACOFstep": "smacof_step",
441                     "LSMT": "lsmt"})(MDS)
Note: See TracBrowser for help on using the repository browser.