#
source:
orange/Orange/projection/mds.py
@
9671:a7b056375472

Revision 9671:a7b056375472, 14.4 KB checked in by anze <anze.staric@…>, 2 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | .. index:: multidimensional scaling (mds) |

3 | |

4 | .. index:: |

5 | single: projection; multidimensional scaling (mds) |

6 | |

7 | ********************************** |

8 | Multidimensional scaling (``mds``) |

9 | ********************************** |

10 | |

11 | The functionality to perform multidimensional scaling |

12 | (http://en.wikipedia.org/wiki/Multidimensional_scaling). |

13 | |

14 | The main class to perform multidimensional scaling is |

15 | :class:`Orange.projection.mds.MDS` |

16 | |

17 | .. autoclass:: Orange.projection.mds.MDS |

18 | :members: |

19 | :exclude-members: Torgerson, get_distance, get_stress |

20 | |

21 | Stress functions |

22 | ================ |

23 | |

24 | Stress functions that can be used for MDS have to be implemented as functions |

25 | or callable classes: |

26 | |

27 | .. method:: \ __call__(correct, current, weight=1.0) |

28 | |

29 | Compute the stress using the correct and the current distance value (the |

30 | :obj:`Orange.projection.mds.MDS.distances` and |

31 | :obj:`Orange.projection.mds.MDS.projected_distances` elements). |

32 | |

33 | :param correct: correct (actual) distance between elements, represented by |

34 | the two points. |

35 | :type correct: float |

36 | |

37 | :param current: current distance between the points in the MDS space. |

38 | :type current: float |

39 | |

40 | This module provides the following stress functions: |

41 | |

42 | * :obj:`SgnRelStress` |

43 | * :obj:`KruskalStress` |

44 | * :obj:`SammonStress` |

45 | * :obj:`SgnSammonStress` |

46 | |

47 | Examples |

48 | ======== |

49 | |

50 | MDS Scatterplot |

51 | --------------- |

52 | |

53 | The following script computes the Euclidean distance between the data |

54 | instances and runs MDS. Final coordinates are plotted with matplotlib |

55 | (not included with orange, http://matplotlib.sourceforge.net/). |

56 | |

57 | Example (:download:`mds-scatterplot.py <code/mds-scatterplot.py>`, uses :download:`iris.tab <code/iris.tab>`) |

58 | |

59 | .. literalinclude:: code/mds-scatterplot.py |

60 | :lines: 7- |

61 | |

62 | The script produces a file *mds-scatterplot.py.png*. Color denotes |

63 | the class. Iris is a relatively simple data set with respect to |

64 | classification; to no surprise we see that MDS finds such instance |

65 | placement in 2D where instances of different classes are well separated. |

66 | Note that MDS has no knowledge of points' classes. |

67 | |

68 | .. image:: files/mds-scatterplot.png |

69 | |

70 | |

71 | A more advanced example |

72 | ----------------------- |

73 | |

74 | The following script performs 10 steps of Smacof optimization before computing |

75 | the stress. This is suitable if you have a large dataset and want to save some |

76 | time. |

77 | |

78 | Example (:download:`mds-advanced.py <code/mds-advanced.py>`, uses :download:`iris.tab <code/iris.tab>`) |

79 | |

80 | .. literalinclude:: code/mds-advanced.py |

81 | :lines: 7- |

82 | |

83 | A few representative lines of the output are:: |

84 | |

85 | <-0.633911848068, 0.112218663096> [5.1, 3.5, 1.4, 0.2, 'Iris-setosa'] |

86 | <-0.624193906784, -0.111143872142> [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'] |

87 | ... |

88 | <0.265250980854, 0.237793982029> [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor'] |

89 | <0.208580598235, 0.116296850145> [6.4, 3.2, 4.5, 1.5, 'Iris-versicolor'] |

90 | ... |

91 | <0.635814905167, 0.238721415401> [6.3, 3.3, 6.0, 2.5, 'Iris-virginica'] |

92 | <0.356859534979, -0.175976261497> [5.8, 2.7, 5.1, 1.9, 'Iris-virginica'] |

93 | ... |

94 | |

95 | |

96 | """ |

97 | |

98 | |

99 | from math import * |

100 | from numpy import * |

101 | from numpy.linalg import svd |

102 | |

103 | import Orange.core |

104 | import orangeom as orangemds |

105 | from Orange.misc import deprecated_keywords |

106 | from Orange.misc import deprecated_members |

107 | |

108 | KruskalStress = orangemds.KruskalStress() |

109 | SammonStress = orangemds.SammonStress() |

110 | SgnSammonStress = orangemds.SgnSammonStress() |

111 | SgnRelStress = orangemds.SgnRelStress() |

112 | |

113 | PointList = Orange.core.FloatListList |

114 | FloatListList = Orange.core.FloatListList |

115 | |

116 | def _mycompare((a,aa),(b,bb)): |

117 | if a == b: |

118 | return 0 |

119 | if a < b: |

120 | return -1 |

121 | else: |

122 | return 1 |

123 | |

124 | class PivotMDS(object): |

125 | def __init__(self, distances=None, pivots=50, dim=2, **kwargs): |

126 | self.dst = array([m for m in distances]) |

127 | self.n = len(self.dst) |

128 | |

129 | if type(pivots) == type(1): |

130 | self.k = pivots |

131 | self.pivots = random.permutation(len(self.dst))[:pivots] |

132 | #self.pivots.sort() |

133 | elif type(pivots) == type([]): |

134 | self.pivots = pivots |

135 | #self.pivots.sort() |

136 | self.k = len(self.pivots) |

137 | else: |

138 | raise AttributeError('pivots') |

139 | |

140 | def optimize(self): |

141 | # # Classical MDS (Torgerson) |

142 | # J = identity(self.n) - (1/float(self.n)) |

143 | # B = -1/2. * dot(dot(J, self.dst**2), J) |

144 | # w,v = linalg.eig(B) |

145 | # tmp = zip([float(val) for val in w], range(self.n)) |

146 | # tmp.sort() |

147 | # w1, w2 = tmp[-1][0], tmp[-2][0] |

148 | # v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]] |

149 | # return v1 * sqrt(w1), v2 * sqrt(w2) |

150 | |

151 | # Pivot MDS |

152 | d = self.dst[[self.pivots]].T |

153 | C = d**2 |

154 | # double-center d |

155 | cavg = sum(d, axis=0)/(self.k+0.0) # column sum |

156 | ravg = sum(d, axis=1)/(self.n+0.0) # row sum |

157 | tavg = sum(cavg)/(self.n+0.0) # total sum |

158 | # TODO: optimize |

159 | for i in xrange(self.n): |

160 | for j in xrange(self.k): |

161 | C[i,j] += -ravg[i] - cavg[j] |

162 | |

163 | C = -0.5 * (C + tavg) |

164 | w,v = linalg.eig(dot(C.T, C)) |

165 | tmp = zip([float(val) for val in w], range(self.n)) |

166 | tmp.sort() |

167 | w1, w2 = tmp[-1][0], tmp[-2][0] |

168 | v1, v2 = v[:, tmp[-1][1]], v[:, tmp[-2][1]] |

169 | x = dot(C, v1) |

170 | y = dot(C, v2) |

171 | return x, y |

172 | |

173 | |

174 | class MDS(object): |

175 | """ |

176 | Main class for performing multidimensional scaling. |

177 | |

178 | :param distances: original dissimilarity - a distance matrix to operate on. |

179 | :type distances: :class:`Orange.core.SymMatrix` |

180 | |

181 | :param dim: dimension of the projected space. |

182 | :type dim: int |

183 | |

184 | :param points: an initial configuration of points (optional) |

185 | :type points: :class:`Orange.core.FloatListList` |

186 | |

187 | An instance of MDS object has the following attributes and functions: |

188 | |

189 | .. attribute:: points |

190 | |

191 | Holds the current configuration of projected points in an |

192 | :class:`Orange.core.FloatListList` object. |

193 | |

194 | .. attribute:: distances |

195 | |

196 | An :class:`Orange.core.SymMatrix` containing the distances that we |

197 | want to achieve (lsmt changes these). |

198 | |

199 | .. attribute:: projected_distances |

200 | |

201 | An :class:`Orange.core.SymMatrix` containing the distances between |

202 | projected points. |

203 | |

204 | .. attribute:: original_distances |

205 | |

206 | An :class:`Orange.core.SymMatrix` containing the original distances |

207 | between points. |

208 | |

209 | .. attribute:: stress |

210 | |

211 | An :class:`Orange.core.SymMatrix` holding the stress. |

212 | |

213 | .. attribute:: dim |

214 | |

215 | An integer holding the dimension of the projected space. |

216 | |

217 | .. attribute:: n |

218 | |

219 | An integer holding the number of elements (points). |

220 | |

221 | .. attribute:: avg_stress |

222 | |

223 | A float holding the average stress in the :obj:`stress` matrix. |

224 | |

225 | .. attribute:: progress_callback |

226 | |

227 | A function that gets called after each optimization step in the |

228 | :func:`run` method. |

229 | |

230 | """ |

231 | |

232 | def __init__(self, distances=None, dim=2, **kwargs): |

233 | self.mds=orangemds.MDS(distances, dim, **kwargs) |

234 | self.original_distances=Orange.core.SymMatrix([m for m in self.distances]) |

235 | |

236 | def __getattr__(self, name): |

237 | if name in ["points", "projected_distances", "distances" ,"stress", |

238 | "progress_callback", "n", "dim", "avg_stress"]: |

239 | #print "rec:",name |

240 | return self.__dict__["mds"].__dict__[name] |

241 | else: |

242 | raise AttributeError(name) |

243 | |

244 | def __setattr__(self, name, value): |

245 | #print "setattr" |

246 | if name=="points": |

247 | for i in range(len(value)): |

248 | for j in range(len(value[i])): |

249 | self.mds.points[i][j]=value[i][j] |

250 | return |

251 | |

252 | if name in ["projected_distances", "distances" ,"stress", |

253 | "progress_callback"]: |

254 | self.mds.__setattr__(name, value) |

255 | else: |

256 | self.__dict__[name]=value |

257 | |

258 | def __nonzero__(self): |

259 | return True |

260 | |

261 | def smacof_step(self): |

262 | """ |

263 | Perform a single iteration of a Smacof algorithm that optimizes |

264 | :obj:`stress` and updates the :obj:`points`. |

265 | """ |

266 | self.mds.SMACOFstep() |

267 | |

268 | def calc_distance(self): |

269 | """ |

270 | Compute the distances between points and update the |

271 | :obj:`projected_distances` matrix. |

272 | |

273 | """ |

274 | self.mds.get_distance() |

275 | |

276 | |

277 | @deprecated_keywords({"stressFunc": "stress_func"}) |

278 | def calc_stress(self, stress_func=SgnRelStress): |

279 | """ |

280 | Compute the stress between the current :obj:`projected_distances` and |

281 | :obj:`distances` matrix using *stress_func* and update the |

282 | :obj:`stress` matrix and :obj:`avgStress` accordingly. |

283 | |

284 | """ |

285 | self.mds.getStress(stress_func) |

286 | |

287 | @deprecated_keywords({"stressFunc": "stress_func"}) |

288 | def optimize(self, iter, stress_func=SgnRelStress, eps=1e-3, |

289 | progress_callback=None): |

290 | self.mds.progress_callback=progress_callback |

291 | self.mds.optimize(iter, stress_func, eps) |

292 | |

293 | @deprecated_keywords({"stressFunc": "stress_func"}) |

294 | def run(self, iter, stress_func=SgnRelStress, eps=1e-3, |

295 | progress_callback=None): |

296 | """ |

297 | Perform optimization until stopping conditions are met. |

298 | Stopping conditions are: |

299 | |

300 | * optimization runs for *iter* iterations of smacof_step function, or |

301 | * stress improvement (old stress minus new stress) is smaller than |

302 | eps * old stress. |

303 | |

304 | :param iter: maximum number of optimization iterations. |

305 | :type iter: int |

306 | |

307 | :param stress_func: stress function. |

308 | """ |

309 | self.optimize(iter, stress_func, eps, progress_callback) |

310 | |

311 | def torgerson(self): |

312 | """ |

313 | Run the Torgerson algorithm that computes an initial analytical |

314 | solution of the problem. |

315 | |

316 | """ |

317 | # Torgerson's initial approximation |

318 | O = array([m for m in self.distances]) |

319 | |

320 | ## #B = matrixmultiply(O,O) |

321 | ## # bug!? B = O**2 |

322 | ## B = dot(O,O) |

323 | ## # double-center B |

324 | ## cavg = sum(B, axis=0)/(self.n+0.0) # column sum |

325 | ## ravg = sum(B, axis=1)/(self.n+0.0) # row sum |

326 | ## tavg = sum(cavg)/(self.n+0.0) # total sum |

327 | ## # B[row][column] |

328 | ## for i in xrange(self.n): |

329 | ## for j in xrange(self.n): |

330 | ## B[i,j] += -cavg[j]-ravg[i] |

331 | ## B = -0.5*(B+tavg) |

332 | |

333 | # B = double-center O**2 !!! |

334 | J = identity(self.n) - (1/float(self.n)) |

335 | B = -0.5 * dot(dot(J, O**2), J) |

336 | |

337 | # SVD-solve B = ULU' |

338 | #(U,L,V) = singular_value_decomposition(B) |

339 | (U,L,V)=svd(B) |

340 | # X = U(L^0.5) |

341 | # # self.X = matrixmultiply(U,identity(self.n)*sqrt(L)) |

342 | # X is n-dimensional, we take the two dimensions with the largest singular values |

343 | idx = argsort(L)[-self.dim:].tolist() |

344 | idx.reverse() |

345 | |

346 | Lt = take(L,idx) # take those singular values |

347 | Ut = take(U,idx,axis=1) # take those columns that are enabled |

348 | Dt = identity(self.dim)*sqrt(Lt) # make a diagonal matrix, with squarooted values |

349 | self.points = Orange.core.FloatListList(dot(Ut,Dt)) |

350 | self.freshD = 0 |

351 | |

352 | # D = identity(self.n)*sqrt(L) # make a diagonal matrix, with squarooted values |

353 | # X = matrixmultiply(U,D) |

354 | # self.X = take(X,idx,1) |

355 | |

356 | # Kruskal's monotone transformation |

357 | def lsmt(self): |

358 | """ |

359 | Execute Kruskal monotone transformation. |

360 | |

361 | """ |

362 | # optimize the distance transformation |

363 | # build vector o |

364 | effect = 0 |

365 | self.getDistance() |

366 | o = [] |

367 | for i in xrange(1,self.n): |

368 | for j in xrange(i): |

369 | o.append((self.original_distances[i,j],(i,j))) |

370 | o.sort(_mycompare) |

371 | # find the ties in o, and construct the d vector sorting in order within ties |

372 | d = [] |

373 | td = [] |

374 | uv = [] # numbers of consecutively tied o values |

375 | (i,j) = o[0][1] |

376 | distnorm = self.projected_distances[i,j]*self.projected_distances[i,j] |

377 | td = [self.projected_distances[i,j]] # fetch distance |

378 | for l in xrange(1,len(o)): |

379 | # copy now sorted distances in an array |

380 | # but sort distances within a tied o |

381 | (i,j) = o[l][1] |

382 | cd = self.projected_distances[i,j] |

383 | distnorm += self.projected_distances[i,j]*self.projected_distances[i,j] |

384 | if o[l][0] != o[l-1][0]: |

385 | # differing value, flush |

386 | sum = reduce(lambda x,y:x+y,td)+0.0 |

387 | d.append([sum,len(td),sum/len(td),td]) |

388 | td = [] |

389 | td.append(cd) |

390 | sum = reduce(lambda x,y:x+y,td)+0.0 |

391 | d.append([sum,len(td),sum/len(td),td]) |

392 | #### |

393 | # keep merging non-monotonous areas in d |

394 | monotony = 0 |

395 | while not monotony and len(d) > 1: |

396 | monotony = 1 |

397 | pi = 0 # index |

398 | n = 1 # n-areas |

399 | nd = [] |

400 | r = d[0] # current area |

401 | for i in range(1,len(d)): |

402 | tr = d[i] |

403 | if r[2]>=tr[2]: |

404 | monotony = 0 |

405 | effect = 1 |

406 | r[0] += tr[0] |

407 | r[1] += tr[1] |

408 | r[2] = tr[0]/tr[1] |

409 | r[3] += tr[3] |

410 | else: |

411 | nd.append(r) |

412 | r = tr |

413 | nd.append(r) |

414 | d = nd |

415 | # normalizing multiplier |

416 | sum = 0.0 |

417 | for i in d: |

418 | sum += i[2]*i[2]*i[1] |

419 | f = sqrt(distnorm/max(sum,1e-6)) |

420 | # transform O |

421 | k = 0 |

422 | for i in d: |

423 | for j in range(i[1]): |

424 | (ii,jj) = o[k][1] |

425 | self.distances[ii,jj] = f*i[2] |

426 | k += 1 |

427 | assert(len(o) == k) |

428 | self.freshD = 0 |

429 | return effect |

430 | |

431 | MDS = deprecated_members({"projectedDistances": "projected_distances", |

432 | "originalDistances": "original_distances", |

433 | "avgStress": "avg_stress", |

434 | "progressCallback": "progress_callback", |

435 | "getStress": "calc_stress", |

436 | "get_stress": "calc_stress", |

437 | "calcStress": "calc_stress", |

438 | "getDistance": "calc_distance", |

439 | "get_distance": "calc_distance", |

440 | "calcDistance": "calc_distance", |

441 | "Torgerson": "torgerson", |

442 | "SMACOFstep": "smacof_step", |

443 | "LSMT": "lsmt"})(MDS) |

**Note:**See TracBrowser for help on using the repository browser.