# source:orange/orange/Orange/clustering/mixture.py@8042:ffcb93bc9028

Revision 8042:ffcb93bc9028, 10.8 KB checked in by markotoplak, 3 years ago (diff)

Hierarchical clustering: also catch RuntimeError when importing matplotlib (otherwise the documentation could not be built on the server).

Line
1""" Implements a Gaussian mixture model.
2
3Example ::
4
5    >>> mixture = GaussianMixture(data, n_centers=3)
6    >>> print mixture.means
7    >>> print mixture.weights
8    >>> print mixture.covariances
9    >>> plot_model(data, mixture, samples=40)
10
11"""
12
13import sys, os
14import numpy
15import random
16import Orange.data
17
class GMModel(object):
    """ A fitted Gaussian mixture model.

    Holds the component ``weights``, ``means`` and ``covariances``
    together with precomputed pseudo-inverses of the covariance
    matrices.
    """
    def __init__(self, weights, means, covariances):
        self.weights = weights
        self.means = means
        self.covariances = covariances
        # Precompute the (pseudo-)inverses once; density evaluation
        # needs them for every call.
        self.inverse_covariances = list(map(numpy.linalg.pinv, covariances))

    def __call__(self, instance):
        """ Return the mixture density at *instance*. """
        densities = prob_est([instance], self.weights, self.means,
                             self.covariances)
        return numpy.sum(densities)

    def __getitem__(self, index):
        """ Return the *index*-th component as a one-Gaussian model. """
        component = slice(index, index + 1)
        return GMModel([1.0], self.means[component],
                       self.covariances[component])

    def __len__(self):
        """ Return the number of mixture components. """
        return len(self.weights)
39
40
def init_random(data, n_centers, *args, **kwargs):
    """ Init random means and correlations from a data table.

    Means are drawn uniformly from the bounding box of the data; the
    correlations are identity matrices.

    :param data: data table
    :type data: :class:`Orange.data.Table`
    :param n_centers: Number of centers and correlations to return.
    :type n_centers: int

    """
    if isinstance(data, Orange.data.Table):
        array, w, c = data.toNumpyMA()
    else:
        array = numpy.asarray(data)

    # NOTE: the original assigned the column maxima to ``min`` and the
    # minima to ``max`` (and shadowed the builtins); it only worked
    # because numpy.random.uniform tolerates low > high.
    lower, upper = array.min(0), array.max(0)
    dim = array.shape[1]
    means = numpy.zeros((n_centers, dim))
    for i in range(n_centers):
        means[i] = [numpy.random.uniform(low, high)
                    for low, high in zip(lower, upper)]

    correlations = [numpy.asmatrix(numpy.eye(dim)) for _ in range(n_centers)]
    return means, correlations
63
def init_kmeans(data, n_centers, *args, **kwargs):
    """ Init with the k-means algorithm.

    The returned means are the k-means centroids; the correlations are
    identity matrices.

    :param data: data table
    :type data: :class:`Orange.data.Table`
    :param n_centers: Number of centers and correlations to return.
    :type n_centers: int

    """
    if not isinstance(data, Orange.data.Table):
        raise TypeError("Orange.data.Table instance expected!")
    from Orange.clustering.kmeans import Clustering
    clustering = Clustering(data, centroids=n_centers, maxiters=20, nstart=3)
    centroid_table = Orange.data.Table(clustering.centroids)
    centers, w, c = centroid_table.toNumpyMA()
    dim = len(data.domain.attributes)
    correlations = [numpy.asmatrix(numpy.eye(dim)) for _ in range(n_centers)]
    return centers, correlations
82
def prob_est1(data, mean, covariance, inv_covariance=None):
    """ Return the multivariate normal density of each row of data given
    mean and covariance matrix.

    :param data: an (n, d) array-like of instances.
    :param mean: a (1, d) mean vector.
    :param covariance: a (d, d) covariance matrix.
    :param inv_covariance: optional precomputed inverse of *covariance*.
    :raises ValueError: if the covariance determinant is not positive.
    """
    data = numpy.asmatrix(data)

    if inv_covariance is None:
        inv_covariance = numpy.linalg.pinv(covariance)

    inv_covariance = numpy.asmatrix(inv_covariance)

    diff = data - mean
    p = numpy.zeros(data.shape[0])
    for i in range(data.shape[0]):
        d = diff[i]
        # Squared Mahalanobis distance of instance i from the mean.
        p[i] = d * inv_covariance * d.T

    p *= -0.5
    p = numpy.exp(p)

    # Normalization constant of a d-dimensional Gaussian is
    # (2*pi)^(d/2) * sqrt(det(Sigma)).  The original divided by
    # (2*pi)^(numpy.rank(cov)/2) -- numpy.rank returned ndim (always 2)
    # and has since been removed from NumPy -- and by det instead of
    # sqrt(det).
    dim = numpy.asarray(covariance).shape[0]
    det = numpy.linalg.det(covariance)
    if det <= 0.0:
        raise ValueError("covariance matrix must have a positive determinant")
    p /= numpy.power(2 * numpy.pi, dim / 2.0) * numpy.sqrt(det)
    return p
106
107
def prob_est(data, weights, means, covariances, inv_covariances=None):
    """ Return the weighted density of every instance under every
    component.

    The result has one row per instance in *data* and one column per
    mixture component; each entry is the component weight times the
    component density at that instance.
    """
    if inv_covariances is None:
        inv_covariances = [numpy.linalg.pinv(cov) for cov in covariances]

    data = numpy.asmatrix(data)
    probs = numpy.zeros((data.shape[0], len(weights)))

    components = zip(weights, means, covariances, inv_covariances)
    for column, (weight, mean, cov, inv_cov) in enumerate(components):
        probs[:, column] = weight * prob_est1(data, mean, cov, inv_cov)

    return probs
123
124
class EMSolver(object):
    """ An EM solver for a gaussian mixture model.

    Attributes after construction: ``weights``, ``means``,
    ``covariances``, ``inv_covariances``, ``probs`` (per-instance,
    per-component responsibilities) and ``log_likelihood``.
    """
    # Set to True to animate the component means with pylab while running.
    _TRACE_MEAN = False

    def __init__(self, data, weights, means, covariances):
        self.data = data
        self.weights = weights
        self.means = means
        self.covariances = covariances
        self.inv_covariances = [numpy.matrix(numpy.linalg.pinv(cov))
                                for cov in covariances]

        self.n_clusters = len(self.weights)
        self.data_dim = self.data.shape[1]

        self.probs = prob_est(data, weights, means, covariances,
                              self.inv_covariances)
        self.log_likelihood = self._log_likelihood()

    def _log_likelihood(self):
        """ Return the average log-likelihood of the data under the
        current parameters.

        ``prob_est`` already multiplies each component's density by its
        weight, so the per-instance mixture density is the row sum.
        (The original applied the weights a second time and summed over
        the data axis instead of the component axis.)
        """
        joint = prob_est(self.data, self.weights, self.means,
                         self.covariances, self.inv_covariances)
        return numpy.mean(numpy.log(numpy.sum(joint, axis=1)))

    def E_step(self):
        """ E step: update the responsibilities and the log-likelihood.
        """
        self.probs = prob_est(self.data, self.weights, self.means,
                              self.covariances, self.inv_covariances)

        # The likelihood must be computed from the unnormalized joint
        # densities, before they are turned into responsibilities
        # (afterwards every row sums to one and the value is useless).
        row_totals = numpy.sum(self.probs, axis=1).reshape((-1, 1))
        self.log_likelihood = numpy.mean(numpy.log(row_totals))

        self.probs /= row_totals

    def M_step(self):
        """ M step: re-estimate weights, means and covariances from the
        current responsibilities.
        """
        # Update the weights.
        prob_sum = numpy.sum(self.probs, axis=0)
        self.weights = prob_sum / numpy.sum(prob_sum)

        # Update the means (responsibility-weighted averages).
        for j in range(self.n_clusters):
            self.means[j] = numpy.sum(self.probs[:, j].reshape((-1, 1))
                                      * self.data, axis=0) / prob_sum[j]

        # Update the covariances.  Vectorized form of
        # sum_i p[i, j] * diff[i].T * diff[i] (the original per-instance
        # loop was the module's own "TODO: speed up").
        for j in range(self.n_clusters):
            diff = numpy.asmatrix(self.data - self.means[j])
            resp = numpy.asmatrix(self.probs[:, j]).T
            cov = numpy.multiply(diff, resp).T * diff
            cov /= prob_sum[j]
            self.covariances[j] = cov
            # Keep the matrix type consistent with __init__.
            self.inv_covariances[j] = numpy.matrix(numpy.linalg.pinv(cov))

    def one_step(self):
        """ One iteration of both E and M step. """
        self.E_step()
        self.M_step()

    def run(self, max_iter=sys.maxsize, eps=1e-5):
        """ Run the EM algorithm until the log-likelihood changes by
        less than *eps* or *max_iter* iterations are exceeded.

        :param max_iter: maximum number of iterations
            (``sys.maxsize`` replaces the Python-2-only ``sys.maxint``).
        :param eps: convergence threshold on the log-likelihood change.
        """
        if self._TRACE_MEAN:
            from pylab import plot, draw, ion
            ion()
            plot(self.data[:, 0], self.data[:, 1], "ro")
            vec_plot = plot(self.means[:, 0], self.means[:, 1], "bo")[0]

        curr_iter = 0

        while True:
            old_objective = self.log_likelihood
            self.one_step()

            if self._TRACE_MEAN:
                vec_plot.set_xdata(self.means[:, 0])
                vec_plot.set_ydata(self.means[:, 1])
                draw()

            curr_iter += 1
            if abs(old_objective - self.log_likelihood) < eps \
                    or curr_iter > max_iter:
                break
230
231
232#class GASolver(object):
233#    """ A toy genetic algorithm solver
234#    """
235#    def __init__(self, data, weights, means, covariances):
236#        raise NotImplementedError
237#
238#
239#class PSSolver(object):
240#    """ A toy particle swarm solver
241#    """
242#    def __init__(self, data, weights, means, covariances):
243#        raise NotImplementedError
244#
245#class HybridSolver(object):
246#    """ A hybrid solver
247#    """
248#    def __init__(self, data, weights, means, covariances):
249#        raise NotImplementedError
250
251
class GaussianMixture(object):
    """ Computes a gaussian mixture model from an Orange data table.

    Follows the Orange learner convention: constructed without data it
    returns a configured learner; constructed with data it immediately
    fits and returns a :class:`GMModel`.
    """
    def __new__(cls, data=None, weightId=None, **kwargs):
        self = object.__new__(cls)
        if data is None:
            return self
        self.__init__(**kwargs)
        return self.__call__(data, weightId)

    def __init__(self, n_centers=3, init_function=init_kmeans):
        self.n_centers = n_centers
        self.init_function = init_function

    def __call__(self, data, weightId=None):
        """ Fit the mixture to *data* and return a :class:`GMModel`. """
        means, correlations = self.init_function(data, self.n_centers)
        means = numpy.asmatrix(means)
        array, _, _ = data.to_numpy_MA()
        uniform_weights = numpy.ones((self.n_centers)) / self.n_centers
        solver = EMSolver(array, uniform_weights, means, correlations)
        solver.run()
        return GMModel(solver.weights, solver.means, solver.covariances)
275
276
def plot_model(data_array, mixture, axis=(0, 1), samples=20, contour_lines=20):
    """ Plot the scatterplot of data_array and the contour lines of the
    probability for the mixture.

    :param data_array: data table or numpy array of instances.
    :param mixture: a fitted :class:`GMModel`.
    :param axis: pair of attribute indices to plot against each other.
    :param samples: grid resolution for the density evaluation.
    :param contour_lines: number of contour levels to draw.
    """
    import matplotlib
    import matplotlib.pylab as plt
    import matplotlib.cm as cm

    axis = list(axis)
    if isinstance(data_array, Orange.data.Table):
        data_array, _, _ = data_array.to_numpy_MA()
    array = data_array[:, axis]

    # Project the mixture onto the two chosen attributes.
    weights = mixture.weights
    means = mixture.means[:, axis]
    covariances = [cov[axis, :][:, axis] for cov in mixture.covariances]
    gmm = GMModel(weights, means, covariances)

    # Bounding box of the projected data (the original shadowed the
    # ``min``/``max`` builtins here).
    lower = numpy.min(array, 0)
    upper = numpy.max(array, 0)
    extent = (lower[0], upper[0], lower[1], upper[1])

    X = numpy.linspace(lower[0], upper[0], samples)
    Y = numpy.linspace(lower[1], upper[1], samples)

    Z = numpy.zeros((X.shape[0], Y.shape[0]))

    for i, x in enumerate(X):
        for j, y in enumerate(Y):
            Z[i, j] = gmm([x, y])

    plt.plot(array[:, 0], array[:, 1], "ro")
    plt.contour(X, Y, Z.T, contour_lines,
                extent=extent)

    plt.imshow(Z.T, interpolation='bilinear', origin='lower',
               cmap=cm.gray, extent=extent)

    plt.plot(means[:, 0], means[:, 1], "b+")
    plt.show()
320
def test(seed=0):
    """ Smoke test: fit a 3-component mixture to the iris data set and
    plot the result.

    :param seed: seed for both numpy and stdlib random generators, so
        runs are reproducible.
    """
#    data = Orange.data.Table(os.path.expanduser("../../doc/datasets/brown-selected.tab"))
#    data = Orange.data.Table(os.path.expanduser("~/Documents/brown-selected-fss.tab"))
#    data = Orange.data.Table(os.path.expanduser("~/Documents/brown-selected-fss-1.tab"))
    # NOTE: the line above used to execute unconditionally -- a dead,
    # machine-specific load whose result was immediately overwritten
    # (and which crashed when the file was absent).
    data = Orange.data.Table("../../doc/datasets/iris.tab")
#    data = Orange.data.Table(Orange.data.Domain(data.domain[:2], None), data)
    numpy.random.seed(seed)
    random.seed(seed)
    gmm = GaussianMixture(data, n_centers=3, init_function=init_kmeans)
    plot_model(data, gmm, axis=(0, 1), samples=40, contour_lines=100)
331
332
333
334if __name__ == "__main__":
335    test()
336
337
Note: See TracBrowser for help on using the repository browser.