#
source:
orange-bioinformatics/_bioinformatics/widgets/prototypes/OWDataDistance.py
@
1643:2cfa80dac3d3

Revision 1643:2cfa80dac3d3, 11.4 KB checked in by mitar, 2 years ago (diff) |
---|

Rev | Line | |
---|---|---|

[972] | 1 | ## Automatically adapted for numpy.oldnumeric Oct 04, 2007 by |

2 | ||

3 | """ | |

4 | <name>Data Distance</name> | |

5 | <description>Computes a distance matrix between data files.</description> | |

6 | <icon>icons/ChipDistance.png</icon> | |

7 | <priority>1160</priority> | |

8 | <contact>Peter Juvan (peter.juvan@fri.uni-lj.si)</contact> | |

[1643] | 9 | <prototype>1</prototype> |

[972] | 10 | """ |

11 | ||

[1633] | 12 | from __future__ import absolute_import |

13 | ||

[972] | 14 | from qt import * |

15 | from qtcanvas import * | |

[1633] | 16 | |

17 | import numpy.oldnumeric as Numeric, numpy.oldnumeric.ma as MA | |

18 | ||

19 | import orange, statc | |

20 | from Orange.OrangeWidgets import OWGUI | |

21 | from Orange.OrangeWidgets.OWWidget import * | |

22 | ||

23 | from .OWDataFiles import DataFiles | |

[972] | 24 | |

25 | import warnings | |

# Silence orange.AttributeWarning messages matching the two ad-hoc attribute
# names ('strain', 'dirname') that this module sets on incoming example tables.
# NOTE(review): presumably orange emits the warning on assignment of unknown
# attributes -- confirm against the Orange version in use.
for _warnPattern in ("'strain'", "'dirname'"):
    warnings.filterwarnings("ignore", _warnPattern, orange.AttributeWarning)

28 | ||

29 | ############################################################################## | |

30 | # main class | |

31 | ||

class OWDataDistance(OWWidget):
    """Widget computing a symmetric distance matrix between data files.

    Input:  "Structured Data" -- a sequence of (name, list of example tables)
            pairs (see chipdata).
    Output: "Distance Matrix" -- an orange.SymMatrix with one row/column per
            example table, or None when there is nothing valid to compare.

    The metric is chosen by self.Metrics, an index into self.metrics that is
    persisted through settingsList.
    """
    settingsList = ["Metrics"]

    def __init__(self, parent=None, signalManager = None):
        OWWidget.__init__(self, parent, signalManager, 'Data Distance')

        self.inputs = [("Structured Data", DataFiles, self.chipdata)]
        self.outputs = [("Distance Matrix", orange.SymMatrix)]

        # Default metric index; loadSettings() may overwrite it with the
        # value saved from a previous session.
        self.Metrics = 0
        self.loadSettings()
        # flat list of example tables currently being compared
        self.data = []
        # (label shown in the combo box, distance function) pairs; the
        # functions are defined at module level below the class
        self.metrics = [
            ("Manhattan", distManhattan),
            ("Euclidean", distEuclidean),
            ("1 - (Pearson correlation coefficient)", distPearson),
            ("1 - (Spearman rank correlation coefficient)", distSpearman),
        ]

        # GUI
        self.mainArea.setFixedWidth(0)
        # Info box
        box = QVGroupBox("Info", self.controlArea)
        self.infoa = QLabel('No data on input.', box)
        self.infob = QLabel('', box)
        OWGUI.separator(self.controlArea)

        # Distance metrics selection
        items = [label for (label, func) in self.metrics]
        OWGUI.comboBox(self.controlArea, self, "Metrics", box="Distance Metrics", items=items,
                       tooltip="Metrics to measure distance between data sets.",
                       callback=self.onMetricsChange)

        self.resize(384, 138)


    ##########################################################################
    # handling of input/output signals

    def _flatValues(self, et):
        """Return the attribute values of example table et as a 1-D masked array.

        The distance functions assert rank-1 input, hence the ravel of the
        array returned by toNumpyMA("a").
        """
        return MA.ravel(et.toNumpyMA("a")[0])


    def computeDistance(self, d1, d2):
        """Return the distance between example tables d1 and d2.

        Uses the currently selected metric (self.Metrics) on the flattened
        attribute values of both tables.
        """
        metric = self.metrics[self.Metrics][1]
        return metric(self._flatValues(d1), self._flatValues(d2))


    def computeMatrix(self):
        """Compute all pairwise distances of self.data and send the matrix.

        Sends None on "Distance Matrix" when self.data is empty.  Otherwise
        fills the upper triangle of an orange.SymMatrix whose 'items'
        attribute is set to self.data, and sends it.
        """
        if not self.data:
            self.send("Distance Matrix", None)
            return
        numSets = len(self.data)
        metric = self.metrics[self.Metrics][1]
        # Convert every table exactly once instead of once per pair.
        flat = [self._flatValues(et) for et in self.data]
        matrix = orange.SymMatrix(numSets)
        matrix.setattr('items', self.data)
        # number of (i, j) pairs with i < j; zero for a single data set, in
        # which case there is nothing to compute (and nothing to divide by)
        numPairs = numSets * (numSets - 1) // 2
        self.progressBarInit()
        try:
            if numPairs:
                pbStep = 100. / numPairs
                for i in range(numSets - 1):
                    for j in range(i + 1, numSets):
                        matrix[i, j] = metric(flat[i], flat[j])
                        self.progressBarAdvance(pbStep)
        finally:
            # close the progress bar even if a metric raises
            self.progressBarFinished()
        self.send("Distance Matrix", matrix)


    def _inputError(self, tables):
        """Return an error message if tables cannot be compared, else None.

        Requires at least two tables, all with the same attribute names and
        the same number of examples as the first one.
        """
        if len(tables) < 2:
            return 'Error: not enough data, aborting distance computation.'
        # Compare sorted *names*; the domains themselves are left untouched.
        refNames = sorted([attr.name for attr in tables[0].domain.attributes])
        refLen = len(tables[0])
        for et in tables[1:]:
            if sorted([attr.name for attr in et.domain.attributes]) != refNames:
                return "Error: data files contain different attributes, aborting distance computation."
            if len(et) != refLen:
                return "Error: data files contain unequal number of examples, aborting distance computation."
        return None


    def chipdata(self, data):
        """Handler of the "Structured Data" input signal.

        data is a sequence of (name, list of example tables) pairs, or a
        false value when the input is removed.  Every table is tagged with
        its set name (attributes 'dirname' and 'strain'), the input is
        validated, and the distance matrix is recomputed.  On missing or
        invalid input self.data is left empty, so computeMatrix() sends None
        and downstream widgets do not keep a stale matrix.
        """
        self.data = []
        self.infob.setText("")
        if not data:
            self.infoa.setText('No data on input.')
            self.computeMatrix()
            return

        lenSD = len(data)
        numFiles = sum([len(etList) for (name, etList) in data])
        if lenSD != 1:
            setPlural = "s"
        else:
            setPlural = ""
        if numFiles != 1:
            filePlural = "s"
        else:
            filePlural = ""
        self.infoa.setText("%d set%s, total of %d data file%s." % (lenSD, setPlural, numFiles, filePlural))

        # flatten the sets into one list of tables, remembering the set name
        tables = []
        for (name, etList) in data:
            for et in etList:
                setattr(et, "dirname", name)
                setattr(et, "strain", name)
                tables.append(et)

        error = self._inputError(tables)
        if error:
            self.infob.setText(error)
        else:
            self.data = tables
        # computeMatrix() drives the progress bar itself and sends None when
        # self.data is empty
        self.computeMatrix()


    def onMetricsChange(self):
        """Recompute the matrix when another metric is picked in the combo box."""
        if self.data and len(self.data)>1:
            self.computeMatrix()

151 | ||

152 | ||

153 | ||

154 | ########################################################################### | |

155 | # Distance Metrics | |

156 | ########################################################################### | |

157 | ||

def distManhattan(x,y):
    """Return the normalized Manhattan distance between 1-D arrays x and y.

    The sum of absolute differences is divided by the number of positions
    that are unmasked in both x and y.  Masked positions are expected to be
    skipped by the MA reduction.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    # 1.0 where both x and y hold a value, 0.0 elsewhere
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    bothValid = MA.logical_not(eitherMasked).astype(Numeric.Float)
    numValid = MA.add.reduce(bothValid)
    totalAbsDiff = MA.add.reduce(MA.absolute(x-y))
    return totalAbsDiff / numValid

166 | ||

167 | ||

def distManhattanW(x,y,w):
    """Return the normalized weighted Manhattan distance between x and y.

    x, y and the weights w must be 1-D.  The weighted sum of absolute
    differences is divided by the sum of the weights at positions that are
    unmasked in both x and y.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    # 1.0 where both x and y hold a value, 0.0 elsewhere
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    bothValid = MA.logical_not(eitherMasked).astype(Numeric.Float)
    weightTotal = MA.add.reduce(w * bothValid)
    weightedAbsDiff = MA.add.reduce(w * MA.absolute(x-y))
    return weightedAbsDiff / weightTotal

177 | ||

178 | ||

def distEuclidean(x,y):
    """Return the normalized Euclidean distance between 1-D arrays x and y.

    Computed as the square root of the sum of squared differences divided by
    the number of positions that are unmasked in both x and y.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    # 1.0 where both x and y hold a value, 0.0 elsewhere
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    bothValid = MA.logical_not(eitherMasked).astype(Numeric.Float)
    numValid = MA.add.reduce(bothValid)
    totalSqDiff = MA.add.reduce((x-y)**2)
    return MA.sqrt(totalSqDiff / numValid)

187 | ||

188 | ||

def distEuclideanW(x,y,w):
    """Return the normalized weighted Euclidean distance between x and y.

    x, y and the weights w must be 1-D.  Computed as the square root of the
    weighted sum of squared differences divided by the sum of the weights at
    positions that are unmasked in both x and y.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    # 1.0 where both x and y hold a value, 0.0 elsewhere
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    bothValid = MA.logical_not(eitherMasked).astype(Numeric.Float)
    weightTotal = MA.add.reduce(w * bothValid)
    weightedSqDiff = MA.add.reduce(w * (x-y)**2)
    return MA.sqrt(weightedSqDiff / weightTotal)

198 | ||

199 | ||

def distPearson(x,y):
    """Return 1 - Pearson's correlation coefficient of 1-D arrays x and y.

    Only positions that are unmasked in both x and y are passed on to
    statc.pearsonr; the first element of its result is taken as r.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    keep = MA.logical_not(eitherMasked)
    # compress before tolist() so that masked entries never reach statc
    xValues = MA.compress(keep, x).tolist()
    yValues = MA.compress(keep, y).tolist()
    pearsonR = statc.pearsonr(xValues, yValues)[0]
    return 1 - pearsonR

209 | ||

210 | ||

def distPearsonW(x,y,w):
    """Return 1 - weighted Pearson correlation of x and y with weights w.

    x, y and w must be 1-D.  A position that is masked in any of the three
    arrays is masked in the weights, so it is expected to drop out of every
    sum below.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    # union of the masks of x, y and w, applied to the weights
    anyMasked = MA.logical_or(MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y)), MA.getmaskarray(w))
    w = MA.masked_array(w, mask=anyMasked)
    weightSum = MA.add.reduce(w)                        # n * mean(w)
    xTimesW = x*w
    yTimesW = y*w
    xMean = MA.divide(MA.add.reduce(xTimesW), weightSum)    # weighted mean of x
    yMean = MA.divide(MA.add.reduce(yTimesW), weightSum)    # weighted mean of y
    numerator = MA.add.reduce(x*y*w) - weightSum*xMean*yMean
    xSpread = MA.add.reduce(xTimesW*x) - weightSum*xMean**2
    ySpread = MA.add.reduce(yTimesW*y) - weightSum*yMean**2
    denominator = MA.sqrt(xSpread * ySpread)
    return 1 - MA.divide(numerator, denominator)

232 | ||

233 | ||

def distSpearman(x,y):
    """Return 1 - Spearman's rank correlation coefficient of x and y.

    x and y must be 1-D.  Only positions that are unmasked in both arrays are
    passed on to statc.spearmanr; the first element of its result is taken as
    the coefficient.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    eitherMasked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    keep = MA.logical_not(eitherMasked)
    # compress before tolist() so that masked entries never reach statc
    xValues = MA.compress(keep, x).tolist()
    yValues = MA.compress(keep, y).tolist()
    spearmanR = statc.spearmanr(xValues, yValues)[0]
    return 1 - spearmanR

243 | ||

def distSpearmanW(x,y,w):
    """Return 1 - weighted Spearman rank correlation of x and y with weights w.

    Dispatches to _distSpearmanW_MA as soon as one of x, y, w is an MA array
    holding at least one masked value, and to _distSpearmanW_NU otherwise.
    """
    for arr in (x, y, w):
        # fewer counted values than elements means something is masked
        if type(arr) == MA.array and MA.count(arr) != Numeric.multiply.reduce(arr.shape):
            return _distSpearmanW_MA(x,y,w)
    return _distSpearmanW_NU(x,y,w)

254 | ||

255 | ||

def _distSpearmanW_NU(x,y,w):
    """Weighted Spearman distance for plain Numeric input (no masked values).

    x and y are replaced by their ranks (statc.rankdata) and the result is
    the weighted Pearson distance of those ranks.
    """
    x = Numeric.asarray(x)
    y = Numeric.asarray(y)
    w = Numeric.asarray(w)
    assert Numeric.rank(x) == Numeric.rank(y) == Numeric.rank(w) == 1
    xRanks = statc.rankdata(x.tolist())
    yRanks = statc.rankdata(y.tolist())
    return distPearsonW(Numeric.array(xRanks), Numeric.array(yRanks), w)

266 | ||

267 | ||

def _distSpearmanW_MA(x,y,w):
    """Weighted Spearman distance when x, y or w may contain masked values.

    Positions masked in any of the three arrays are removed first; the
    remaining x and y values are ranked (statc.rankdata) and the result is
    the weighted Pearson distance of those ranks with the remaining weights.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    anyMasked = MA.logical_or(MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y)), MA.getmaskarray(w))
    keep = MA.logical_not(anyMasked)
    # with MA, compress before tolist() so masked entries are dropped
    xRanks = statc.rankdata(MA.compress(keep, x).tolist())
    yRanks = statc.rankdata(MA.compress(keep, y).tolist())
    return distPearsonW(Numeric.array(xRanks), Numeric.array(yRanks), MA.compress(keep, w))

280 | ||

281 | ########################################################################### | |

282 | # testing | |

283 | ########################################################################### | |

284 | ||

if __name__=="__main__":
    # Manual test harness: wires an OWDataFiles widget into this widget
    # through a signal manager and runs the Qt event loop.
    from . import OWDataFiles
    from Orange.orng import orngSignalManager

    manager = orngSignalManager.SignalManager(0)
    app = QApplication(sys.argv)

    # the widget under test becomes the application's main widget
    distanceWidget = OWDataDistance(signalManager = manager)
    manager.addWidget(distanceWidget)
    app.setMainWidget(distanceWidget)
    distanceWidget.show()

    # data source feeding the "Structured Data" input
    filesWidget = OWDataFiles.OWDataFiles(signalManager = manager)
    manager.addWidget(filesWidget)
    filesWidget.loadData("potato.sub100")

    # the link is added between setFreeze(1) and setFreeze(0)
    manager.setFreeze(1)
    manager.addLink(filesWidget, distanceWidget, 'Structured Data', 'Structured Data', 1)
    manager.setFreeze(0)

    app.exec_loop()
    distanceWidget.saveSettings()

**Note:** See TracBrowser for help on using the repository browser.