Changeset 9598:2fcfcfd5b6b9 in orange for orange/Orange/clustering/mixture.py
 Timestamp:
 01/27/12 18:57:59 (2 years ago)
 Branch:
 default
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

orange/Orange/clustering/mixture.py
r8042 r9598 1 """ Implements a Gaussian mixture model. 1 """ 2 ******************************* 3 Gaussian Mixtures (``mixture``) 4 ******************************* 5 6 This module implements a Gaussian mixture model. 2 7 3 8 Example :: 4 9 5 >>> mixture = GaussianMixture(data, n _centers=3)10 >>> mixture = GaussianMixture(data, n=3) 6 11 >>> print mixture.means 7 12 >>> print mixture.weights … … 14 19 import numpy 15 20 import random 16 import Orange .data21 import Orange 17 22 18 23 class GMModel(object): 19 24 """ Gaussian mixture model 20 25 """ 21 def __init__(self, weights, means, covariances): 26 def __init__(self, weights, means, covariances, inv_covariances=None, 27 cov_determinants=None): 22 28 self.weights = weights 23 29 self.means = means 24 30 self.covariances = covariances 25 self.inverse_covariances = [numpy.linalg.pinv(cov) for cov in covariances] 31 if inv_covariances is None: 32 self.inv_covariances = [numpy.linalg.pinv(cov) for cov in covariances] 33 else: 34 self.inv_covariances = inv_covariances 35 36 if cov_determinants is None: 37 self.cov_determinants = [numpy.linalg.det(cov) for cov in covariances] 38 else: 39 self.cov_determinants = cov_determinants 40 26 41 27 42 def __call__(self, instance): 28 43 """ Return the probability of instance. 29 44 """ 30 return numpy.sum(prob_est([instance], self.weights, self.means, self.covariances)) 45 return numpy.sum(prob_est([instance], self.weights, self.means, 46 self.covariances, 47 self.inv_covariances, 48 self.cov_determinants)) 31 49 32 50 def __getitem__(self, index): 33 51 """ Return the indexth gaussian. 34 52 """ 35 return GMModel([1.0], self.means[index: index + 1], self.covariances[index: index + 1]) 53 return GMModel([1.0], self.means[index: index + 1], 54 self.covariances[index: index + 1], 55 self.inv_covariances[index: index + 1], 56 self.cov_determinants[index: index + 1]) 36 57 37 58 def __len__(self): … … 39 60 40 61 41 def init_random(data, n _centers, *args, **kwargs):62 def init_random(data, n, *args, **kwargs): 42 63 """ Init random means and correlations from a data table. 43 64 44 65 :param data: data table 45 66 :type data: :class:`Orange.data.Table` 46 :param n _centers: Number of centers and correlations to return.47 :type n _centers: int67 :param n: Number of centers and correlations to return. 68 :type n: int 48 69 49 70 """ … … 55 76 min, max = array.max(0), array.min(0) 56 77 dim = array.shape[1] 57 means = numpy.zeros((n _centers, dim))58 for i in range(n _centers):78 means = numpy.zeros((n, dim)) 79 for i in range(n): 59 80 means[i] = [numpy.random.uniform(low, high) for low, high in zip(min, max)] 60 81 61 correlations = [numpy.asmatrix(numpy.eye(dim)) for i in range(n _centers)]82 correlations = [numpy.asmatrix(numpy.eye(dim)) for i in range(n)] 62 83 return means, correlations 63 84 64 def init_kmeans(data, n _centers, *args, **kwargs):85 def init_kmeans(data, n, *args, **kwargs): 65 86 """ Init with kmeans algorithm. 66 87 67 88 :param data: data table 68 89 :type data: :class:`Orange.data.Table` 69 :param n _centers: Number of centers and correlations to return.70 :type n _centers: int90 :param n: Number of centers and correlations to return. 91 :type n: int 71 92 72 93 """ … … 74 95 raise TypeError("Orange.data.Table instance expected!") 75 96 from Orange.clustering.kmeans import Clustering 76 km = Clustering(data, centroids=n _centers, maxiters=20, nstart=3)97 km = Clustering(data, centroids=n, maxiters=20, nstart=3) 77 98 centers = Orange.data.Table(km.centroids) 78 99 centers, w, c = centers.toNumpyMA() 79 100 dim = len(data.domain.attributes) 80 correlations = [numpy.asmatrix(numpy.eye(dim)) for i in range(n _centers)]101 correlations = [numpy.asmatrix(numpy.eye(dim)) for i in range(n)] 81 102 return centers, correlations 82 103 83 def prob_est1(data, mean, covariance, inv_covariance=None ):104 def prob_est1(data, mean, covariance, inv_covariance=None, det=None): 84 105 """ Return the probability of data given mean and covariance matrix 85 106 """ 86 107 data = numpy.asmatrix(data) 87 108 mean = numpy.asmatrix(mean) 88 109 if inv_covariance is None: 89 110 inv_covariance = numpy.linalg.pinv(covariance) … … 100 121 p = numpy.exp(p) 101 122 p /= numpy.power(2 * numpy.pi, numpy.rank(covariance) / 2.0) 102 det = numpy.linalg.det(covariance) 123 if det is None: 124 det = numpy.linalg.det(covariance) 103 125 assert(det != 0.0) 104 126 p /= det … … 106 128 107 129 108 def prob_est(data, weights, means, covariances, inv_covariances=None ):109 """ Return the probability estimation of data given weights, means and110 covariances.130 def prob_est(data, weights, means, covariances, inv_covariances=None, cov_determinants=None): 131 """ Return the probability estimation of data for each 132 gausian given weights, means and covariances. 111 133 112 134 """ … … 114 136 inv_covariances = [numpy.linalg.pinv(cov) for cov in covariances] 115 137 138 if cov_determinants is None: 139 cov_determinants = [numpy.linalg.det(cov) for cov in covariances] 140 116 141 data = numpy.asmatrix(data) 117 142 probs = numpy.zeros((data.shape[0], len(weights))) 118 143 119 for i, (w, mean, cov, inv_cov) in enumerate(zip(weights, means, covariances, inv_covariances)): 120 probs[:, i] = w * prob_est1(data, mean, cov, inv_cov) 144 for i, (w, mean, cov, inv_cov, det) in enumerate(zip(weights, means, 145 covariances, inv_covariances, 146 cov_determinants)): 147 probs[:, i] = w * prob_est1(data, mean, cov, inv_cov, det) 121 148 122 149 return probs … … 133 160 self.covariances = covariances 134 161 self.inv_covariances = [numpy.matrix(numpy.linalg.pinv(cov)) for cov in covariances] 135 162 self.cov_determinants = [numpy.linalg.det(cov) for cov in self.covariances] 136 163 self.n_clusters = len(self.weights) 137 164 self.data_dim = self.data.shape[1] 138 165 139 self.probs = prob_est(data, weights, means, covariances, self.inv_covariances) 166 self.probs = prob_est(data, weights, means, covariances, 167 self.inv_covariances, self.cov_determinants) 140 168 141 169 self.log_likelihood = self._log_likelihood() 142 170 # print "W", self.weights 171 # print "P", self.probs 172 # print "L", self.log_likelihood 173 # print "C", self.covariances 174 # print "Det", self.cov_determinants 143 175 144 176 def _log_likelihood(self): … … 154 186 self.covariances, self.inv_covariances) 155 187 188 # print "PPP", self.probs 189 # print "P sum", numpy.sum(self.probs, axis=1).reshape((1, 1)) 156 190 self.probs /= numpy.sum(self.probs, axis=1).reshape((1, 1)) 157 191 … … 171 205 172 206 self.log_likelihood = self._log_likelihood() 173 207 # print "Prob:", self.probs 208 # print "Log like.:", self.log_likelihood 174 209 175 210 def M_step(self): 176 211 """ M step 177 212 """ 178 # Update theweights213 # Compute the new weights 179 214 prob_sum = numpy.sum(self.probs, axis=0) 180 181 self.weights = prob_sum / numpy.sum(prob_sum)182 183 # Update the means215 weights = prob_sum / numpy.sum(prob_sum) 216 217 # Compute the new means 218 means = [] 184 219 for j in range(self.n_clusters): 185 self.means[j] = numpy.sum(self.probs[:, j].reshape((1, 1)) * self.data, axis=0) / prob_sum[j] 186 187 # Update the covariances 220 means.append(numpy.sum(self.probs[:, j].reshape((1, 1)) * self.data, axis=0) / prob_sum[j]) 221 222 # Compute the new covariances 223 covariances = [] 224 cov_determinants = [] 188 225 for j in range(self.n_clusters): 189 226 cov = numpy.zeros(self.covariances[j].shape) 190 diff = self.data  self.means[j]227 diff = self.data  means[j] 191 228 diff = numpy.asmatrix(diff) 192 229 for i in range(len(self.data)): # TODO: speed up … … 194 231 195 232 cov *= 1.0 / prob_sum[j] 196 self.covariances[j] = cov 197 self.inv_covariances[j] = numpy.linalg.pinv(cov) 233 det = numpy.linalg.det(cov) 234 235 covariances.append(numpy.asmatrix(cov)) 236 cov_determinants.append(det) 237 # self.inv_covariances[j] = numpy.linalg.pinv(cov) 238 # self.cov_determinants[j] = det 239 self.weights = weights 240 self.means = numpy.asmatrix(means) 241 self.covariances = covariances 242 self.cov_determinants = cov_determinants 198 243 199 244 def one_step(self): … … 227 272 # print abs(old_objective  self.log_likelihood) 228 273 if abs(old_objective  self.log_likelihood) < eps or curr_iter > max_iter: 229 break 230 231 232 #class GASolver(object): 233 # """ A toy genetic algorithm solver 234 # """ 235 # def __init__(self, data, weights, means, covariances): 236 # raise NotImplementedError 237 # 238 # 239 #class PSSolver(object): 240 # """ A toy particle swarm solver 241 # """ 242 # def __init__(self, data, weights, means, covariances): 243 # raise NotImplementedError 244 # 245 #class HybridSolver(object): 246 # """ A hybrid solver 247 # """ 248 # def __init__(self, data, weights, means, covariances): 249 # raise NotImplementedError 250 274 break 251 275 252 276 class GaussianMixture(object): … … 261 285 return self 262 286 263 def __init__(self, n _centers=3, init_function=init_kmeans):264 self.n _centers = n_centers287 def __init__(self, n=3, init_function=init_kmeans): 288 self.n = n 265 289 self.init_function = init_function 266 290 267 291 def __call__(self, data, weightId=None): 268 means, correlations = self.init_function(data, self.n_centers) 292 from Orange.preprocess import Preprocessor_impute, DomainContinuizer 293 # data = Preprocessor_impute(data) 294 dc = DomainContinuizer() 295 dc.multinomial_treatment = DomainContinuizer.AsOrdinal 296 dc.continuous_treatment = DomainContinuizer.NormalizeByVariance 297 dc.class_treatment = DomainContinuizer.Ignore 298 domain = dc(data) 299 data = data.translate(domain) 300 301 means, correlations = self.init_function(data, self.n) 269 302 means = numpy.asmatrix(means) 270 303 array, _, _ = data.to_numpy_MA() 271 solver = EMSolver(array, numpy.ones((self.n_centers)) / self.n_centers, 304 # avg = numpy.mean(array, axis=0) 305 # array = avg.reshape((1, 1)) 306 # means = avg.reshape((1, 1)) 307 # std = numpy.std(array, axis=0) 308 # array /= std.reshape((1, 1)) 309 # means /= std.reshape((1, 1)) 310 solver = EMSolver(array, numpy.ones((self.n)) / self.n, 272 311 means, correlations) 273 312 solver.run() 274 return GMModel(solver.weights, solver.means, solver.covariances) 313 norm_model = GMModel(solver.weights, solver.means, solver.covariances) 314 return GMClusterModel(domain, norm_model) 315 316 317 class GMClusterModel(object): 318 """ 319 """ 320 def __init__(self, domain, norm_model): 321 self.domain = domain 322 self.norm_model = norm_model 323 self.cluster_vars = [Orange.data.variable.Continuous("cluster %i" % i)\ 324 for i in range(len(norm_model))] 325 self.weights = self.norm_model.weights 326 self.means = self.norm_model.means 327 self.covariances = self.norm_model.covariances 328 self.inv_covariances = self.norm_model.inv_covariances 329 self.cov_determinants = self.norm_model.cov_determinants 330 331 def __call__(self, instance, *args): 332 data = Orange.data.Table(self.domain, [instance]) 333 data,_,_ = data.to_numpy_MA() 334 # print data 335 336 p = prob_est(data, self.norm_model.weights, 337 self.norm_model.means, 338 self.norm_model.covariances, 339 self.norm_model.inv_covariances, 340 self.norm_model.cov_determinants) 341 # print p 342 p /= numpy.sum(p) 343 vals = [] 344 for p, var in zip(p[0], self.cluster_vars): 345 vals.append(var(p)) 346 return vals 275 347 276 348 … … 285 357 286 358 axis = list(axis) 359 360 if isinstance(mixture, GMClusterModel): 361 mixture = mixture.norm_model 362 287 363 if isinstance(data_array, Orange.data.Table): 288 364 data_array, _, _ = data_array.to_numpy_MA() … … 292 368 means = mixture.means[:, axis] 293 369 294 covariances = [cov[axis,:][:, axis] for cov in mixture.covariances] 370 covariances = [cov[axis,:][:, axis] for cov in mixture.covariances] # TODO: Need the whole marginal distribution. 295 371 296 372 gmm = GMModel(weights, means, covariances) … … 320 396 321 397 def test(seed=0): 322 # data = Orange.data.Table(os.path.expanduser(" ../../doc/datasets/brownselected.tab"))398 # data = Orange.data.Table(os.path.expanduser("brownselected.tab")) 323 399 # data = Orange.data.Table(os.path.expanduser("~/Documents/brownselectedfss.tab")) 324 data = Orange.data.Table(os.path.expanduser("~/Documents/brownselectedfss1.tab")) 325 data = Orange.data.Table("../../doc/datasets/iris.tab") 400 # data = Orange.data.Table(os.path.expanduser("~/Documents/brownselectedfss1.tab")) 401 # data = Orange.data.Table(os.path.expanduser("~/ozone1")) 402 data = Orange.data.Table("iris.tab") 326 403 # data = Orange.data.Table(Orange.data.Domain(data.domain[:2], None), data) 327 404 numpy.random.seed(seed) 328 405 random.seed(seed) 329 gmm = GaussianMixture(data, n_centers=3, init_function=init_kmeans) 330 plot_model(data, gmm, axis=(0, 1), samples=40, contour_lines=100) 406 gmm = GaussianMixture(data, n=3, init_function=init_kmeans) 407 data = data.translate(gmm.domain) 408 plot_model(data, gmm, axis=(0, 2), samples=40, contour_lines=100) 331 409 332 410
Note: See TracChangeset
for help on using the changeset viewer.