#
source:
orange/orange/Orange/clustering/kmeans.py
@
7648:a3c43789905d

Revision 7648:a3c43789905d, 20.5 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff) |
---|

Line | |
---|---|

1 | """ |

2 | ****************** |

3 | K-means clustering |

4 | ****************** |

5 | |

6 | .. index:: |

7 | single: clustering, kmeans |

8 | .. index:: agglomerative clustering |

9 | |

10 | |

11 | .. autoclass:: Orange.clustering.kmeans.Clustering |

12 | :members: |

13 | |

14 | Examples |

15 | ======== |

16 | |

17 | The following code runs k-means clustering and prints out the cluster indexes |

18 | for the last 10 data instances (`kmeans-run.py`_, uses `iris.tab`_): |

19 | |

20 | .. literalinclude:: code/kmeans-run.py |

21 | |

22 | The output of this code is:: |

23 | |

24 | [1, 1, 2, 1, 1, 1, 2, 1, 1, 2] |

25 | |

26 | Invoking a call-back function may be useful when tracing the progress of the clustering. |

27 | Below is a code that uses an :obj:`inner_callback` to report on the number of instances |

28 | that have changed the cluster and to report on the clustering score. For the score |

29 | to be computed at each iteration we have to set :obj:`minscorechange`, but we can |

30 | leave it at 0 or even set it to a negative value, which allows the score to deteriorate |

31 | by some amount (`kmeans-run-callback.py`_, uses `iris.tab`_): |

32 | |

33 | .. literalinclude:: code/kmeans-run-callback.py |

34 | |

35 | The convergence on Iris data set is fast:: |

36 | |

37 | Iteration: 1, changes: 150, score: 10.9555 |

38 | Iteration: 2, changes: 12, score: 10.3867 |

39 | Iteration: 3, changes: 2, score: 10.2034 |

40 | Iteration: 4, changes: 2, score: 10.0699 |

41 | Iteration: 5, changes: 2, score: 9.9542 |

42 | Iteration: 6, changes: 1, score: 9.9168 |

43 | Iteration: 7, changes: 2, score: 9.8624 |

44 | Iteration: 8, changes: 0, score: 9.8624 |

45 | |

46 | Call-back above is used for reporting of the progress, but may as well call a function that plots a selected data projection with the corresponding centroids at a given step of the clustering. This is exactly what we did with the following script (`kmeans-trace.py`_, uses `iris.tab`_): |

47 | |

48 | .. literalinclude:: code/kmeans-trace.py |

49 | |

50 | Only the first four scatterplots are shown below. Colors of the data instances indicate the cluster membership. Notice that since the Iris data set includes four attributes, the closest centroid in a particular 2-dimensional projection is not necessarily also the centroid of the cluster that the data point belongs to. |

51 | |

52 | |

53 | .. image:: files/kmeans-scatter-001.png |

54 | |

55 | .. image:: files/kmeans-scatter-002.png |

56 | |

57 | .. image:: files/kmeans-scatter-003.png |

58 | |

59 | .. image:: files/kmeans-scatter-004.png |

60 | |

61 | |

62 | k-Means Utility Functions |

63 | ========================= |

64 | |

65 | .. automethod:: Orange.clustering.kmeans.init_random |

66 | |

67 | .. automethod:: Orange.clustering.kmeans.init_diversity |

68 | |

69 | .. autoclass:: Orange.clustering.kmeans.init_hclustering |

70 | :members: |

71 | |

72 | .. automethod:: Orange.clustering.kmeans.plot_silhouette |

73 | |

74 | .. automethod:: Orange.clustering.kmeans.score_distance_to_centroids |

75 | |

76 | .. automethod:: Orange.clustering.kmeans.score_silhouette |

77 | |

78 | .. automethod:: Orange.clustering.kmeans.score_fast_silhouette |

79 | |

80 | Typically, the choice of seeds has a large impact on the k-means clustering, |

81 | with better initialization methods yielding a clustering that converges faster |

82 | and finds more optimal centroids. The following code compares three different |

83 | initialization methods (random, diversity-based and hierarchical clustering-based) |

84 | in terms of how fast they converge (`kmeans-cmp-init.py`_, uses `iris.tab`_, |

85 | `housing.tab`_, `vehicle.tab`_): |

86 | |

87 | .. literalinclude:: code/kmeans-cmp-init.py |

88 | |

89 | As expected, k-means converges faster with diversity and clustering-based |

90 | initialization than with random seed selection:: |

91 | |

92 | Rnd Div HC |

93 | iris 12 3 4 |

94 | housing 14 6 4 |

95 | vehicle 11 4 3 |

96 | |

97 | The following code computes the silhouette score for k=2..7 and plots a |

98 | silhouette plot for k=3 (`kmeans-silhouette.py`_, uses `iris.tab`_): |

99 | |

100 | .. literalinclude:: code/kmeans-silhouette.py |

101 | |

102 | The analysis suggests that k=2 is preferred as it yields |

103 | the maximal silhouette coefficient:: |

104 | |

105 | 2 0.629467553352 |

106 | 3 0.504318855054 |

107 | 4 0.407259377854 |

108 | 5 0.358628975081 |

109 | 6 0.353228492088 |

110 | 7 0.366357876944 |

111 | |

112 | .. figure:: files/kmeans-silhouette.png |

113 | |

114 | Silhouette plot for k=3. |

115 | |

116 | .. _iris.tab: code/iris.tab |

117 | .. _housing.tab: code/housing.tab |

118 | .. _vehicle.tab: code/vehicle.tab |

119 | .. _kmeans-run.py: code/kmeans-run.py |

120 | .. _kmeans-run-callback.py: code/kmeans-run-callback.py |

121 | .. _kmeans-trace.py: code/kmeans-trace.py |

122 | .. _kmeans-cmp-init.py: code/kmeans-cmp-init.py |

123 | .. _kmeans-silhouette.py: code/kmeans-sillhouette.py |

124 | """ |

125 | |

126 | import math |

127 | import sys |

128 | import orange |

129 | import random |

130 | import statc |

131 | |

132 | import Orange.clustering.hierarchical |

133 | |

134 | # miscellaneous functions |

135 | |

136 | def _modus(dist): |

137 | #Check bool(dist) - False means no known cases |

138 | #Check dist.cases > 0 - We cant return some value from the domain without knowing if it is even present |

139 | #in the data. TOOD: What does this mean for k-means convergence? |

140 | if bool(dist) and dist.cases > 0: |

141 | return dist.modus() |

142 | else: |

143 | return None |

144 | |

def data_center(data):
    """
    Return a center of the instances in the data set (average across data
    instances for continuous attributes, most frequent value for discrete
    attributes).
    """
    stats = orange.DomainBasicAttrStat(data)
    center = []
    for attr in data.domain.attributes:
        if attr.varType == orange.VarTypes.Continuous:
            center.append(stats[attr].avg)
        elif attr.varType == orange.VarTypes.Discrete:
            # most frequent value; may be None for an empty distribution
            center.append(_modus(orange.Distribution(attr, data)))
        else:
            center.append(None)
    if data.domain.classVar:
        # dummy class value, needed only to construct an Example
        center.append(0)
    return orange.Example(data.domain, center)

159 | |

def minindex(x):
    """Return the index of the minimum element"""
    smallest = min(x)
    return x.index(smallest)

163 | |

def avg(x):
    """Return the average (mean) of a given list"""
    if not x:
        # an empty list has no mean; 0 keeps callers simple
        return 0
    return float(sum(x)) / len(x)

167 | |

168 | # |

169 | # data distances |

170 | # |

171 | |

172 | # k-means clustering |

173 | |

174 | # clustering scoring functions |

175 | |

def score_distance_to_centroids(km):
    """Returns an average distance of data instances to their associated centroids.

    :param km: a k-means clustering object.
    :type km: :class:`KMeans`
    """
    # NOTE(review): despite the docstring, this returns the *sum* of the
    # distances, not the average (as the original implementation did).
    total = 0
    for i, instance in enumerate(km.data):
        total += km.distance(km.centroids[km.clusters[i]], instance)
    return total

# lower values indicate a better clustering
score_distance_to_centroids.minimize = True

185 | |

def score_conditionalEntropy(km):
    """UNIMPLEMENTED cluster quality measured by conditional entropy"""
    # Placeholder: not implemented; returns None for every clustering.
    return None

189 | |

def score_withinClusterDistance(km):
    """UNIMPLEMENTED weighted average within-cluster pairwise distance"""
    # Placeholder: not implemented; returns None for every clustering.
    return None

# once implemented, lower values would indicate a better clustering
score_withinClusterDistance.minimize = True

195 | |

def score_betweenClusterDistance(km):
    """Sum of distances from elements to 'nearest miss' centroids"""
    total = 0
    for i, instance in enumerate(km.data):
        # distance to the closest centroid of a *different* cluster
        total += min(km.distance(c, instance)
                     for j, c in enumerate(km.centroids)
                     if j != km.clusters[i])
    return total

199 | |

def score_silhouette(km, index=None):
    """Returns an average silhouette score of data instances.

    :param km: a k-means clustering object.
    :type km: :class:`KMeans`

    :param index: if given, the function returns just the silhouette score of that particular data instance.
    :type index: integer
    """

    if index is None:
        # average the per-instance silhouette scores over the whole data set
        return avg([score_silhouette(km, i) for i in range(len(km.data))])
    cind = km.clusters[index]
    # a: mean distance to the other members of the instance's own cluster
    a = avg([km.distance(km.data[index], ex) for i, ex in enumerate(km.data) if
             km.clusters[i] == cind and i != index])
    # b: smallest mean distance to the members of any other cluster
    b = min([avg([km.distance(km.data[index], ex) for i, ex in enumerate(km.data) if
                  km.clusters[i] == c])
             for c in range(len(km.centroids)) if c != cind])
    # silhouette s = (b - a) / max(a, b); 0 when both distances are 0
    denom = max(a, b)
    return float(b - a) / denom if denom > 0 else 0.0

219 | |

def score_fast_silhouette(km, index=None):
    """Same as score_silhouette, but computes an approximation and is faster.

    :param km: a k-means clustering object.
    :type km: :class:`KMeans`
    """

    if index is None:
        # average the per-instance scores over the whole data set
        return avg([score_fast_silhouette(km, i) for i in range(len(km.data))])
    cind = km.clusters[index]
    # a: distance to the instance's own centroid (approximates the mean
    # within-cluster distance used by score_silhouette)
    a = km.distance(km.data[index], km.centroids[cind])
    # b: distance to the nearest centroid of a different cluster
    b = min([km.distance(km.data[index], c) for i, c in enumerate(km.centroids) if i != cind])
    denom = max(a, b)
    return float(b - a) / denom if denom > 0 else 0.0

233 | |

def compute_bic(km):
    """Compute the Bayesian information criterion (BIC) score of a clustering.

    Returns a tuple ``(bic, per_cluster_bic)`` where ``per_cluster_bic`` is a
    list with one BIC term per cluster.  Returns ``(None, [None] * km.k)``
    when the pooled variance estimate is numerically zero.

    NOTE(review): the original implementation referenced undefined names
    (``izip``, ``mapping``, ``K``) and indexed ``Ri[K]`` out of range; they
    are replaced here by ``zip``, ``km.clusters``, ``km.k`` and a 0-based
    cluster loop.  The formula itself still deserves independent review.

    :param km: a k-means clustering object.
    :type km: :class:`KMeans`
    """
    data = km.data
    centroids = km.centroids
    K = km.k

    M = len(data.domain.attributes)
    R = float(len(data))
    # Ri[k]: number of instances assigned to cluster k
    Ri = [km.clusters.count(i) for i in range(K)]
    numFreePar = (M + 1.) * K * math.log(R, 2.) / 2.
    # pooled maximum-likelihood estimate of sigma**2
    s2 = 0.
    cidx = [i for i, attr in enumerate(data.domain.attributes)
            if attr.varType in [orange.VarTypes.Continuous, orange.VarTypes.Discrete]]
    for x, cluster in zip(data, km.clusters):
        centroid = centroids[cluster]
        s2 += sum((float(x[i]) - float(centroid[i])) ** 2 for i in cidx)
    s2 /= (R - K)
    if s2 < 1e-20:
        # degenerate clustering (all points on their centroids); BIC undefined
        return None, [None] * K
    # log-likelihood of each cluster: l(Dn); of the whole clustering: l(D)
    ld = 0
    bicc = []
    for k in range(K):
        ldn = -1. * Ri[k] * ((math.log(2. * math.pi, 2) / -2.) -
                             (M * math.log(s2, 2) / 2.) +
                             (K / 2.) +
                             math.log(Ri[k], 2) - math.log(R, 2))
        ld += ldn
        bicc.append(ldn - numFreePar)
    return ld - numFreePar, bicc

261 | |

262 | |

263 | # |

264 | # silhouette plot |

265 | # |

266 | |

def plot_silhouette(km, filename='tmp.png', fast=False):
    """ Saves a silhouette plot to filename, showing the distributions of silhouette scores in clusters. kmeans is a k-means clustering object. If fast is True use score_fast_silhouette to compute scores instead of score_silhouette.

    :param km: a k-means clustering object.
    :type km: :class:`KMeans`
    :param filename: name of output plot.
    :type filename: string
    :param fast: if True use :func:`score_fast_silhouette` to compute scores instead of :func:`score_silhouette`
    :type fast: boolean.

    """
    import matplotlib.pyplot as plt
    plt.figure()
    score_fn = score_fast_silhouette if fast else score_silhouette
    # collect the silhouette score of every instance, grouped by cluster
    per_cluster = [[] for _ in range(km.k)]
    for idx, cluster in enumerate(km.clusters):
        per_cluster[cluster].append(score_fn(km, idx))
    csizes = [len(group) for group in per_cluster]
    # vertical positions of the cluster labels (middle of each bar group,
    # accounting for the 3-bar gaps inserted between clusters)
    cpositions = [sum(csizes[:i]) + (i + 1) * 3 + csizes[i] / 2 for i in range(km.k)]
    # flatten: 3 zero-height gap bars before each cluster's sorted scores
    bars = []
    for group in per_cluster:
        bars.extend([0] * 3)
        bars.extend(sorted(group))
    plt.barh(range(len(bars)), bars, linewidth=0, color='c')
    plt.yticks(cpositions, [str(i) for i in range(km.k)])
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette value')
    plt.savefig(filename)

293 | |

294 | # clustering initialization (seeds) |

295 | # initialization functions should be of the type f(data, k, distfun) |

296 | |

def init_random(data, k, _):
    """A function that can be used for initialization of k-means clustering returns k data instances from the data. This type of initialization is also known as Forgy's initialization (Forgy, 1965; He et al., 2004).

    :param data: data instances.
    :type data: :class:`orange.ExampleTable`
    :param k: the number of clusters.
    :type k: integer
    :param distfun: a distance function (unused by this initializer).
    :type distfun: :class:`orange.ExamplesDistance`
    """
    # pick k distinct row indices uniformly at random
    indices = random.sample(range(len(data)), k)
    return data.getitems(indices)

308 | |

def init_diversity(data, k, distfun):
    """A function that can be used for initialization of k-means clustering. Returns a set of centroids where the first one is a data point being the farthest away from the center of the data, and consequent centroids data points of which the minimal distance to the previous set of centroids is maximal. Differs from the initialization proposed by Katsavounidis et al. (1994) only in the selection of the first centroid (where they use a data instance with the highest norm).

    :param data: data instances.
    :type data: :class:`orange.ExampleTable`
    :param k: the number of clusters.
    :type k: integer
    :param distfun: a distance function.
    :type distfun: :class:`orange.ExamplesDistance`
    """
    center = data_center(data)
    # first seed: the data point farthest from the data center
    farthest = max([(distfun(d, center), d) for d in data])
    seeds = [farthest[1]]
    # each further seed maximizes the minimal distance to the seeds so far
    for _ in range(1, k):
        candidates = [(min([distfun(d, s) for s in seeds]), d)
                      for d in data if d not in seeds]
        seeds.append(max(candidates)[1])
    return seeds

326 | |

class init_hclustering():
    """
    A class that returns a clustering initialization function that performs
    hierarchical clustering, uses it to infer k clusters, and computes a
    list of cluster-based data centers
    """

    def __init__(self, n=100):
        """
        :param n: number of data instances to sample.
        :type n: integer
        """
        # cap on how many instances are used for hierarchical clustering
        self.n = n

    def __call__(self, data, k, disfun):
        """
        :param data: data instances.
        :type data: :class:`orange.ExampleTable`
        :param k: the number of clusters.
        :type k: integer
        :param disfun: a distance function; unused here, kept to match the
            common initializer signature f(data, k, distfun).
        :type disfun: :class:`orange.ExamplesDistance`
        """
        # hierarchical clustering is expensive, so work on a random sample
        sample_size = min(self.n, len(data))
        sample = orange.ExampleTable(random.sample(data, sample_size))
        root = Orange.clustering.hierarchical.clustering(sample)
        top = Orange.clustering.hierarchical.top_clusters(root, k)
        # one seed per top-level cluster: the center of its sampled members
        return [data_center(orange.ExampleTable([sample[i] for i in cluster]))
                for cluster in top]

354 | |

355 | # |

356 | # k-means clustering, main implementation |

357 | # |

358 | |

class Clustering:
    """Implements a k-means clustering algorithm:

    #. Choose the number of clusters, k.
    #. Choose a set of k initial centroids.
    #. Assign each instances in the data set to the closest centroid.
    #. For each cluster, compute a new centroid as a center of clustered
       data instances.
    #. Repeat the previous two steps, until some convergence criterion is
       met (e.g., the cluster assignment has not changed).

    The main advantages of this algorithm are simplicity and low memory
    requirements. The principal disadvantage is the dependence of results
    on the selection of initial set of centroids.

    .. attribute:: k

        Number of clusters.

    .. attribute:: data

        Instances to cluster.

    .. attribute:: centroids

        Current set of centroids.

    .. attribute:: scoring

        Current clustering score.

    .. attribute:: iteration

        Current clustering iteration.

    .. attribute:: clusters

        A list of cluster indexes. An i-th element provides an
        index to a centroid associated with i-th data instance from the input
        data set.
    """

    def __init__(self, data=None, centroids=3, maxiters=None, minscorechange=None,
                 stopchanges=0, nstart=1, initialization=init_random,
                 distance=orange.ExamplesDistanceConstructor_Euclidean,
                 scoring=score_distance_to_centroids, inner_callback=None,
                 outer_callback=None):
        """
        :param data: Data instances to be clustered. If not None, clustering will be executed immediately after initialization unless initialize_only=True.
        :type data: :class:`orange.ExampleTable` or None
        :param centroids: either specify a number of clusters or provide a list of examples that will serve as clustering centroids.
        :type centroids: integer or a list of :class:`orange.Example`
        :param nstart: If greater than one, nstart runs of the clustering algorithm will be executed, returning the clustering with the best (lowest) score.
        :type nstart: integer
        :param distance: an example distance constructor, which measures the distance between two instances.
        :type distance: :class:`orange.ExamplesDistanceConstructor`
        :param initialization: a function to select centroids given data instances, k and a example distance function. This module implements different approaches (:func:`init_random`, :func:`init_diversity`, :class:`init_hclustering`).
        :param scoring: a function that takes clustering object and returns the clustering score. It could be used, for instance, in procedure that repeats the clustering nstart times, returning the clustering with the lowest score.
        :param inner_callback: invoked after every clustering iteration.
        :param outer_callback: invoked after every clustering restart (if nstart is greater than 1).

        Stopping criteria:

        :param maxiters: maximum number of clustering iterations
        :type maxiters: integer
        :param minscorechange: minimal improvement of the score from previous generation (if lower, the clustering will stop). If None, the score will not be computed between iterations
        :type minscorechange: float or None
        :param stopchanges: if the number of instances changing the cluster is lower or equal to stopchanges, stop the clustering.
        :type stopchanges: integer
        """

        self.data = data
        # k is either given directly or implied by the provided centroids
        self.k = centroids if isinstance(centroids, int) else len(centroids)
        # only a concrete example table counts as pre-specified centroids
        self.centroids = centroids if isinstance(centroids, orange.ExampleTable) else None
        self.maxiters = maxiters
        self.minscorechange = minscorechange
        self.stopchanges = stopchanges
        self.nstart = nstart
        self.initialization = initialization
        self.distance_constructor = distance
        # the concrete distance can only be built once the data is known
        self.distance = self.distance_constructor(self.data) if self.data else None
        self.scoring = scoring
        # scoring functions flag minimization by carrying a 'minimize' attribute
        self.minimize_score = hasattr(scoring, 'minimize')
        self.inner_callback = inner_callback
        self.outer_callback = outer_callback
        if self.data:
            self.run()

    def __call__(self, data=None):
        """Runs the k-means clustering algorithm, with optional new data."""
        if data:
            self.data = data
            self.distance = self.distance_constructor(self.data)
        self.run()

    def init_centroids(self):
        """Initialize cluster centroids"""
        if self.centroids and not self.nstart > 1:  # centroids were specified
            return
        self.centroids = self.initialization(self.data, self.k, self.distance)

    def compute_centeroid(self, data):
        """Return a centroid of the data set."""
        return data_center(data)

    def compute_cluster(self):
        """calculate membership in clusters"""
        return [minindex([self.distance(s, d) for s in self.centroids]) for d in self.data]

    def runone(self):
        """Runs a single clustering iteration, starting with re-computation of centroids, followed by computation of data membership (associating data instances to their nearest centroid)."""
        self.centroids = [self.compute_centeroid(self.data.getitems(
            [i for i, c in enumerate(self.clusters) if c == cl])) for cl in range(self.k)]
        self.clusters = self.compute_cluster()

    def run(self):
        """
        Runs clustering until the convergence conditions are met. If nstart is greater than one, nstart runs of the clustering algorithm will be executed, returning the clustering with the best (lowest) score.
        """
        self.winner = None
        for startindx in range(self.nstart):
            self.init_centroids()
            self.clusters = old_cluster = self.compute_cluster()
            if self.minscorechange is not None:
                self.score = old_score = self.scoring(self)
            self.nchanges = len(self.data)
            self.iteration = 0
            stopcondition = False
            if self.inner_callback:
                self.inner_callback(self)
            while not stopcondition:
                self.iteration += 1
                self.runone()
                # count instances that switched cluster in this iteration
                self.nchanges = sum(c1 != c2 for c1, c2 in zip(old_cluster, self.clusters))
                old_cluster = self.clusters
                if self.minscorechange is not None:
                    self.score = self.scoring(self)
                    # relative score change; sign flipped for minimizing
                    # scores so that an improvement is always positive
                    scorechange = (self.score - old_score) / old_score if old_score > 0 else self.minscorechange
                    if self.minimize_score:
                        scorechange = -scorechange
                    old_score = self.score
                stopcondition = (self.nchanges <= self.stopchanges or
                                 self.iteration == self.maxiters or
                                 (self.minscorechange is not None and
                                  scorechange <= self.minscorechange))
                if self.inner_callback:
                    self.inner_callback(self)
            if self.scoring and self.minscorechange is None:
                self.score = self.scoring(self)
            if self.nstart > 1:
                # keep the restart with the best score seen so far
                if not self.winner or (self.score < self.winner[0] if
                                       self.minimize_score else self.score > self.winner[0]):
                    self.winner = (self.score, self.clusters, self.centroids)
                if self.outer_callback:
                    self.outer_callback(self)

        if self.nstart > 1:
            self.score, self.clusters, self.centroids = self.winner

517 |

**Note:**See TracBrowser for help on using the repository browser.