Changeset 7348:a36ebe30d7be in orange
 Timestamp:
 02/03/11 22:48:59 (3 years ago)
 Branch:
 default
 Convert:
 dc672fdeb54866b79ffa5b7ab944329660083f1b
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

orange/orngSOM.py
r7093 r7348 1 import sys, os 2 3 import numpy 4 import numpy.ma as ma 5 import orange 6 ##import math 7 8 ##from numpy import dot, abs, sqrt, random, array 9 10 HexagonalTopology = 0 11 RectangularTopology = 1 12 InitializeLinear = 0 13 InitializeRandom = 1 14 NeighbourhoodGaussian = 0 15 NeighbourhoodBubble = 1 16 NeighbourhoodEpanechicov = 2 17 18 class Node(object): 19 def __init__(self, pos, map=None, vector=None): 20 self.pos = pos 21 self.map = map 22 self.vector = vector 23 24 class Map(object): 25 HexagonalTopology = HexagonalTopology 26 RectangularTopology = RectangularTopology 27 InitializeLinear = InitializeLinear 28 InitializeRandom = InitializeRandom 29 NeighbourhoodGaussian = NeighbourhoodGaussian 30 NeighbourhoodBubble = NeighbourhoodBubble 31 NeighbourhoodEpanechicov = NeighbourhoodEpanechicov 32 33 def __init__(self, map_shape=(20, 40), topology=HexagonalTopology): 34 self.map_shape = map_shape 35 self.topology = topology 36 self.map = [[Node((i, j), self) for j in range(map_shape[1])] for i in range(map_shape[0])] 37 38 def __getitem__(self, pos): 39 """ Return the node at position x, y 40 """ 41 x, y = pos 42 return self.map[x][y] 43 44 def __iter__(self): 45 """ Iterate over all nodes in the map 46 """ 47 for row in self.map: 48 for node in row: 49 yield node 50 51 def vectors(self): 52 """ Return all vectors of the map as rows in an numpy.array 53 """ 54 return numpy.array([node.vector for node in self]) 55 56 def unit_distances(self): 57 """ Return a NxN numpy.array of internode distances (based on 58 node position in the map, not vector space) where N is the number of 59 nodes 60 """ 61 nodes = list(self) 62 dist = numpy.zeros((len(nodes), len(nodes))) 63 64 coords = self.unit_coords() 65 for i in range(len(nodes)): 66 for j in range(len(nodes)): 67 dist[i, j] = numpy.sqrt(numpy.dot(coords[i]  coords[j], coords[i]  coords[j])) 68 return numpy.array(dist) 69 70 def unit_coords(self): 71 """ Return the unit coordinates of all nodes in the map as an numpy.array 72 """ 73 nodes = list(self) 74 coords = numpy.zeros((len(nodes), len(self.map_shape))) 75 coords[:, 0] = numpy.floor(numpy.arange(len(nodes)) / self.map_shape[0]) 76 coords[:, 1] = numpy.mod(numpy.arange(len(nodes)), self.map_shape[1]) 77 78 ## in hexagonal topology we move every odd map row by 0.5 and multiply all by sqrt(0.75) 79 if self.topology == Map.HexagonalTopology: 80 ind = numpy.nonzero(1  numpy.mod(coords[:, 0], 2)) 81 coords[ind] = coords[ind] + 0.5 82 coords = coords * numpy.sqrt(0.75) 83 return coords 84 85 86 def initialize_map_random(self, data=None, dimension=5): 87 """ Initialize the map nodes vectors randomly, by supplying 88 either training data or dimension of the data 89 """ 90 if data is not None: 91 min, max = ma.min(data, 0), ma.max(data, 0); 92 dimension = data.shape[1] 93 else: 94 min, max = numpy.zeros(dimension), numpy.ones(dimension) 95 for node in self: 96 node.vector = min + numpy.random.rand(dimension) * (max  min) 97 98 def initialize_map_linear(self, data, map_shape=(10, 20)): 99 """ Initialize the map node vectors lineary over the subspace 100 of the two most significant eigenvectors 101 """ 102 data = data.copy() #ma.array(data) 103 dim = data.shape[1] 104 mdim = len(map_shape) 105 munits = len(list(self)) 106 me = ma.mean(data, 0) 107 A = numpy.zeros((dim ,dim)) 108 109 for i in range(dim): 110 data[:, i] = data[:, i]  me[i] 111 112 for i in range(dim): 113 for j in range(dim): 114 c = data[:, i] * data[:, j] 115 A[i, j] = ma.sum(c) / len(c) 116 A[j, i] = A[i, j] 117 118 eigval, eigvec = numpy.linalg.eig(A) 119 ind = list(reversed(numpy.argsort(eigval))) 120 eigval = eigval[ind[:mdim]] 121 eigvec = eigvec[:, ind[:mdim]] 122 123 for i in range(mdim): 124 eigvec[:, i] = eigvec[:, i] / numpy.sqrt(numpy.dot(eigvec[:, i], eigvec[:, i])) * numpy.sqrt(eigval[i]) 125 ## print eigvec, eigval 126 127 unit_coords = self.unit_coords() 128 for d in range(mdim): 129 max, min = numpy.max(unit_coords[:, d]), numpy.min(unit_coords[:, d]) 130 unit_coords[:, d] = (unit_coords[:, d]  min)/(max  min) 131 unit_coords = (unit_coords  0.5) * 2 132 133 vectors = numpy.array([me for i in range(munits)]) 134 for i in range(munits): 135 for d in range(mdim): 136 vectors[i] = vectors[i] + unit_coords[i][d] * numpy.transpose(eigvec[:, d]) 137 138 for i, node in enumerate(self): 139 node.vector = vectors[i] 140 141 def getUMat(self): 142 return getUMat(self) 143 144 class Solver(object): 145 """ SOM Solver class used to train the map. 146 Arguments: 147 * neighbourhood  Neighbourhood function (NeighbourhoodGaussian, or NeighbourhoodBubble) 148 * radius_ini  Inttial radius 149 * raduis_fin  Final radius 150 * epoch  Number of training iterations 151 * batch_train  If True run the batch training algorithem (default), else use the sequential one 152 * learning_rate  If learning rate for the sequential training algorithem 153 154 Both the batch ans sequential algorithems are based on SOM Toolkit for Matlab 155 """ 156 def __init__(self, **kwargs): 157 self.neighbourhood = NeighbourhoodGaussian 158 self.learning_rate = 0.05 159 self.radius_ini = 2 160 self.radius_fin = 1 161 self.epochs = 100 162 self.random_order = False 163 self.batch_train = True 164 self.eps = 1e5 165 self.qerror = [] 166 self.__dict__.update(kwargs) 167 168 def radius(self, epoch): 169 return self.radius_ini  (float(self.radius_ini)  self.radius_fin)*(float(epoch) / self.epochs) 170 171 def alpha(self, epoch): 172 return (1  epoch/self.epochs)*self.learning_rate 173 174 def __call__(self, data, map, progressCallback=None): 175 """ Train the map on data. Use progressCallback to Report on the progress. 176 """ 177 self.data = data 178 self.map = map 179 180 self.qerror = [] 181 self.bmu_cache = {} 182 if self.batch_train: 183 self.train_batch(progressCallback) 184 else: 185 self.train_sequential(progressCallback) 186 return self.map 187 188 def train_sequential(self, progressCallback): 189 self.vectors = self.map.vectors() 190 self.unit_distances = self.map.unit_distances() 191 192 ## from pylab import plot, show, draw, ion 193 ## ion() 194 ## plot(self.data[:, 0], self.data[:, 1], "ro") 195 ## vec_plot = plot(self.vectors[:, 0], self.vectors[:, 1], "bo")[0] 196 197 for epoch in range(self.epochs): 198 self.distances = [] 199 ind = range(len(self.data)) 200 if self.random_order: 201 random.shuffle(ind) 202 self.train_step_sequential(epoch, ind) 203 if progressCallback: 204 progressCallback(100.0*epoch/self.epochs) 205 self.qerror.append(numpy.mean(numpy.sqrt(self.distances))) 206 ## print epoch, "q error:", numpy.mean(numpy.sqrt(self.distances)), self.radius(epoch) 207 if epoch > 5 and numpy.mean(numpy.abs(numpy.array(self.qerror[5:1])  self.qerror[1])) <= self.eps: 208 break 209 210 ## vec_plot.set_xdata(self.vectors[:, 0]) 211 ## vec_plot.set_ydata(self.vectors[:, 1]) 212 ## draw() 213 ## show() 214 215 def train_step_sequential(self, epoch, indices=None): 216 indices = range(len(self.data)) if indices == None else indices 217 for ind in indices: 218 x = self.data[ind] 219 Dx = self.vectors  self.data[ind] 220 Dist = ma.sum(Dx**2, 1) 221 min_dist = ma.min(Dist) 222 bmu = ma.argmin(Dist) 223 self.distances.append(min_dist) 224 225 if self.neighbourhood == Map.NeighbourhoodGaussian: 226 h = numpy.exp(self.unit_distances[:, bmu]/(2*self.radius(epoch))) * (self.unit_distances[:, bmu] <= self.radius(epoch)) 227 elif self.neighbourhood == Map.NeighbourhoodEpanechicov: 228 h = 1.0  (self.unit_distances[:bmu]/self.radius(epoch))**2 229 h = h * (h >= 0.0) 230 else: 231 h = 1.0*(self.unit_distances[:, bmu] <= self.radius(epoch)) 232 h = h * self.alpha(epoch) 233 234 nonzero = ma.nonzero(h) 235 h = h[nonzero] 236 237 self.vectors[nonzero] = self.vectors[nonzero]  Dx[nonzero] * numpy.reshape(h, (len(h), 1)) 238 239 def train_batch(self, progressCallback=None): 240 self.unit_distances = self.map.unit_distances() 241 self.constant_matrix = 2 * ma.dot(numpy.eye(self.data.shape[1]), numpy.transpose(self.data)) 242 self.dist_cons = numpy.transpose(ma.dot(self.data**2, numpy.ones(self.data.shape[1]))) 243 self.weight_matrix = numpy.ones((self.data.shape[1], self.data.shape[0])) 244 self.vectors = self.map.vectors() 245 246 ## from pylab import plot, show, draw, ion 247 ## ion() 248 ## plot(self.data[:, 0], self.data[:, 1], "ro") 249 ## vec_plot = plot(self.vectors[:, 0], self.vectors[:, 1], "bo")[0] 250 251 for epoch in range(self.epochs): 252 self.train_step_batch(epoch) 253 if progressCallback: 254 progressCallback(100.0*epoch/self.epochs) 255 if False and epoch > 5 and numpy.mean(numpy.abs(numpy.array(self.qerror[5:1])  self.qerror[1])) <= self.eps: 256 break 257 ## vec_plot.set_xdata(self.vectors[:, 0]) 258 ## vec_plot.set_ydata(self.vectors[:, 1]) 259 ## draw() 260 ## show() 261 262 for node, vector in zip(self.map, self.vectors): 263 node.vector = vector 264 265 def train_step_batch(self, epoch): 266 D1 = ma.dot(self.vectors**2, self.weight_matrix) 267 D2 = ma.dot(self.vectors, self.constant_matrix) 268 Dist = D1  D2 269 270 best_nodes = ma.argmin(Dist, 0) 271 distances = ma.min(Dist, 0) 272 ## print "q error:", ma.mean(ma.sqrt(distances + self.dist_cons)), self.radius(epoch) 273 self.qerror.append(ma.mean(ma.sqrt(distances + self.dist_cons))) 274 275 if self.neighbourhood == Map.NeighbourhoodGaussian: 276 H = numpy.exp(self.unit_distances/(2*self.radius(epoch))) * (self.unit_distances <= self.radius(epoch)) 277 elif self.neighbourhood == Map.NeighbourhoodEpanechicov: 278 H = 1.0  (self.unit_distances/self.radius(epoch))**2 279 H = H * (H >= 0.0) 280 else: 281 H = 1.0*(self.unit_distances <= self.radius(epoch)) 282 283 P = numpy.zeros((self.vectors.shape[0], self.data.shape[0])) 284 285 P[(best_nodes, range(len(best_nodes)))] = numpy.ones(len(best_nodes)) 286 287 S = ma.dot(H, ma.dot(P, self.data)) 288 289 A = ma.dot(H, ma.dot(P, ~self.data._mask)) 290 291 ## nonzero = (range(epoch%2, len(self.vectors), 2), ) 292 nonzero = (numpy.array(sorted(set(ma.nonzero(A)[0]))), ) 293 294 self.vectors[nonzero] = S[nonzero] / A[nonzero] 295 296 297 class SOMLearner(orange.Learner): 298 """ SOMLearner is a class used to learn SOM from orange.ExampleTable 299 300 Example: 301 >>> som = orngSOM.SOMLearner(map_shape=(10, 20), initialize=orngSOM.InitializeRandom) 302 >>> map = som(orange.ExampleTable("iris.tab")) 303 """ 304 305 def __new__(cls, examples=None, weightId=0, **kwargs): 306 self = orange.Learner.__new__(cls, **kwargs) 307 if examples is not None: 308 self.__init__(**kwargs) 309 return self.__call__(examples, weightId) 310 else: 311 return self 312 313 def __init__(self, map_shape=(5, 10), initialize=InitializeLinear, topology=HexagonalTopology, neighbourhood=NeighbourhoodGaussian, 314 batch_train=True, learning_rate=0.05, radius_ini=3.0, radius_fin=1.0, epochs=1000, **kwargs): 315 self.map_shape = map_shape 316 self.initialize = initialize 317 self.topology = topology 318 self.neighbourhood = neighbourhood 319 self.batch_train = batch_train 320 self.learning_rate = learning_rate 321 self.radius_ini = radius_ini 322 self.radius_fin = radius_fin 323 self.epochs = epochs 324 self.eps = 1e4 325 326 orange.Learner.__init__(self, **kwargs) 327 328 def __call__(self, examples, weightID=0, progressCallback=None): 329 data, classes, w = examples.toNumpyMA() 330 map = Map(self.map_shape, topology=self.topology) 331 if self.initialize == Map.InitializeLinear: 332 map.initialize_map_linear(data) 333 else: 334 map.initialize_map_random(data) 335 map = Solver(batch_train=self.batch_train, eps=self.eps, neighbourhood=self.neighbourhood, 336 radius_ini=self.radius_ini, radius_fin=self.radius_fin, learning_rate=self.learning_rate, 337 epochs=self.epochs)(data, map, progressCallback=progressCallback) 338 return SOMMap(map, examples) 339 340 class SOMSupervisedLearner(SOMLearner): 341 """ SOMSupervisedLearner is a class used to learn SOM from orange.ExampleTable, by using the 342 class information in the learning process. This is achieved by adding a value for each class 343 to the training instances, where 1.0 signals class membership and all other values are 0.0. 344 After the training, the new values are discarded from the node vectors. 345 """ 346 def __call__(self, examples, weightID=0, progressCallback=None): 347 data, classes, w = examples.toNumpyMA() 348 nval = len(examples.domain.classVar.values) 349 ext = ma.zeros((len(data), nval)) 350 ext[([i for i, m in enumerate(classes.mask) if m], [int(c) for c, m in zip(classes, classes.mask) if m])] = 1.0 351 data = ma.hstack((data, ext)) 352 map = Map(self.map_shape, topology=self.topology) 353 if self.initialize == Map.InitializeLinear: 354 map.initialize_map_linear(data) 355 else: 356 map.initialize_map_random(data) 357 map = Solver(batch_train=self.batch_train, eps=self.eps, neighbourhood=self.neighbourhood, 358 radius_ini=self.radius_ini, radius_fin=self.radius_fin, learning_rate=self.learning_rate, 359 epoch=self.epochs)(data, map, progressCallback=progressCallback) 360 for node in map: 361 node.vector = node.vector[:nval] 362 return SOMMap(map, examples) 363 364 class SOMMap(orange.Classifier): 365 def __init__(self, map=[], examples=[]): 366 self.map = map 367 self.examples = examples 368 for node in map: 369 node.referenceExample = orange.Example(orange.Domain(examples.domain.attributes, False), 370 [(var(value) if var.varType == orange.VarTypes.Continuous else var(int(value))) \ 371 for var, value in zip(examples.domain.attributes, node.vector)]) 372 node.examples = orange.ExampleTable(examples.domain) 373 374 for ex in examples: 375 node = self.getBestMatchingNode(ex) 376 node.examples.append(ex) 377 378 if examples and examples.domain.classVar: 379 for node in self.map: 380 node.classifier = orange.MajorityLearner(node.examples if node.examples else examples) 381 382 self.classVar = examples.domain.classVar 383 else: 384 self.classVar = None 385 386 def getBestMatchingNode(self, example): 387 """ Return the best matching node 388 """ 389 example, c, w = orange.ExampleTable([example]).toNumpyMA() 390 vectors = self.map.vectors() 391 Dist = vectors  example 392 bmu = ma.argmin(ma.sum(Dist**2, 1)) 393 return list(self.map)[bmu] 394 395 def __call__(self, example, what=orange.GetValue): 396 bmu = self.getBestMatchingNode(example) 397 return bmu.classifier(example, what) 398 399 def __getattr__(self, name): 400 try: 401 return getattr(self.__dict__["map"], name) 402 except (KeyError, AttributeError): 403 raise AttributeError(name) 404 405 def __iter__(self): 406 """ Iterate over all nodes in the map 407 """ 408 return iter(self.map) 409 410 def __getitem__(self, val): 411 """ Return the node at position x, y 412 """ 413 return self.map.__getitem__(val) 414 415 def getUMat(som): 416 dim1=som.map_shape[0]*21 417 dim2=som.map_shape[1]*21 418 419 a=numpy.zeros((dim1, dim2)) 420 if som.topology == HexagonalTopology: 421 return __fillHex(a, som) 422 else: 423 return __fillRect(a, som) 424 425 def __fillHex(array, som): 426 xDim, yDim = som.map_shape 427 ## for n in som.nodes: 428 ## d[tuple(n.pos)]=n 429 d = dict([((i, j), som[i, j]) for i in range(xDim) for j in range(yDim)]) 430 check=lambda x,y:x>=0 and x<(xDim*21) and y>=0 and y<(yDim*21) 431 dx=[1,0,1] 432 dy=[0,1, 1] 433 for i in range(0, xDim*2,2): 434 for j in range(0, yDim*2,2): 435 for ddx, ddy in zip(dx, dy): 436 if check(i+ddx, j+ddy): 437 ## array[i+ddx][j+ddy]=d[(i/2, j/2)].getDistance(d[(i/2+ddx, j/2+ddy)].referenceExample) 438 array[i+ddx][j+ddy] = numpy.sqrt(ma.sum((d[(i/2, j/2)].vector  d[(i/2+ddx, j/2+ddy)].vector)**2)) 439 dx=[1,1,0,1, 0, 1] 440 dy=[0, 0,1, 1,1,1] 441 for i in range(0, xDim*2, 2): 442 for j in range(0, yDim*2, 2): 443 l=[array[i+ddx, j+ddy] for ddx, ddy in zip(dx, dy) if check(i+ddx, j+ddy)] 444 array[i][j]=sum(l)/len(l) 445 return array 446 447 def __fillRect(array, som): 448 xDim, yDim = som.map_shape 449 d = dict([((i, j), som[i, j]) for i in range(xDim) for j in range(yDim)]) 450 check=lambda x,y:x>=0 and x<xDim*21 and y>=0 and y<yDim*21 451 dx=[1, 0, 1] 452 dy=[0, 1, 1] 453 for i in range(0, xDim*2, 2): 454 for j in range(0, yDim*2, 2): 455 for ddx, ddy in zip(dx, dy): 456 if check(i+ddx, j+ddy): 457 ## array[i+ddx][j+ddy]=d[(i/2, j/2)].getDistance(d[(i/2+ddx, j/2+ddy)].referenceExample) 458 array[i+ddx][j+ddy] = numpy.sqrt(ma.sum((d[(i/2, j/2)].vector  d[(i/2+ddx, j/2+ddy)].vector)**2)) 459 dx=[1,1, 0,0,1,1,1, 1] 460 dy=[0, 0,1,1,1,1, 1,1] 461 for i in range(0, xDim*2, 2): 462 for j in range(0, yDim*2, 2): 463 l=[array[i+ddx,j+ddy] for ddx,ddy in zip(dx,dy) if check(i+ddx, j+ddy)] 464 array[i][j]=sum(l)/len(l) 465 return array 466 467 if __name__ == "__main__": 468 data = orange.ExampleTable("doc//datasets//iris.tab") 469 learner = SOMLearner() 470 learner = SOMLearner(batch_train=True, initialize=InitializeLinear, radius_ini=3, radius_fin=1, neighbourhood=Map.NeighbourhoodGaussian, epochs=1000) 471 map = learner(data) 472 for e in data: 473 print map(e), e.getclass() 474 1 from Orange.projection.som import *
Note: See TracChangeset
for help on using the changeset viewer.