# source:orange/Orange/clustering/hierarchical.py@9906:77274e331dbb

Revision 9906:77274e331dbb, 61.4 KB checked in by lanumek, 2 years ago (diff)

Test scrip for hierarchical clustering.

Line
1r"""
2******************************************
3Hierarchical clustering (``hierarchical``)
4******************************************
5
6.. index::
7   single: clustering, hierarchical, dendrogram
8.. index:: aglomerative clustering
9
10The method for hierarchical clustering, encapsulated in class
11:class:`HierarchicalClustering` works on a distance matrix stored as
12:class:`SymMatrix`. The method works in approximately O(n2) time (with
13the worst case O(n3)). For orientation, clustering ten thousand of
14elements should take roughly 15 seconds on a 2 GHz computer.
15The algorithm can either make a copy of the distances matrix and work on
16it, or work on the original distance matrix, destroying it in the process.
17The latter is useful for clustering larger number of objects. Since the
18distance matrix stores (n+1)(n+2)/2 floats (cca 2 MB for 1000 objects and
19200 MB for 10000, assuming the a float takes 4 bytes), by copying it we
20would quickly run out of physical memory. Using virtual memory is not
21an option since the matrix is accessed in a random manner.
22
23The distance should contain no negative elements. This limitation is
24due to implementation details of the algorithm (it is not absolutely
25necessary and can be lifted in future versions if often requested; it
26only helps the algorithm run a bit faster). The elements on the diagonal
27(representing the element's distance from itself) are ignored.
28
29Distance matrix can have the attribute objects describing the objects we
30are clustering (this is available only in Python). This can be any sequence
31of the same length as the matrix - an ExampleTable, a list of examples, a
32list of attributes (if you're clustering attributes), or even a string of
33the correct length. This attribute is not used in clustering but is only
34passed to the clusters' attribute ``mapping`` (see below), which will hold a
35reference to it (if you modify the list, the changes will affect the
36clusters as well).
37
38.. class:: HierarchicalClustering
39
41
42        Specifies the linkage method, which can be either :
43
44            1. ``HierarchicalClustering.Single`` (default), where distance
45                between groups is defined as the distance between the closest
46                pair of objects, one from each group,
47            2. ``HierarchicalClustering.Average`` , where the distance between
48                two clusters is defined as the average of distances between
49                all pairs of objects, where each pair is made up of one object
50                from each group, or
51            3. ``HierarchicalClustering.Complete``, where the distance between
52                groups is defined as the distance between the most distant
53                pair of objects, one from each group. Complete linkage is
54                also called farthest neighbor.
55            4. ``HierarchicalClustering.Ward`` uses Ward's distance.
56
57    .. attribute:: overwrite_matrix
58
59        If true (default is false), the algorithm will work on the original
60        distance matrix, destroying it in the process. The benefit is that it
61        will need much less memory (not much more than what is needed to store
62        the tree of clusters).
63
64    .. attribute:: progress_callback
65
66        A callback function (None by default). It can be any function or
67        callable class in Python, which accepts a single float as an
68        argument. The function only gets called if the number of objects
69        being clustered is at least 1000. It will be called for 101 times,
70        and the argument will give the proportion of the work been done.
71        The time intervals between the function calls won't be equal (sorry
72        about that...) since the clustering proceeds faster as the number
73        of clusters decreases.
74
75    .. method:: __call__(matrix)
76
77        The ``HierarchicalClustering`` is called with a distance matrix as an
78        argument. It returns an instance of HierarchicalCluster representing
79        the root of the hierarchy (instance of :class:`HierarchicalCluster`).
80        See examples section for details.
81
82        :param matrix: A distance matrix to perform the clustering on.
83        :type matrix: :class:`Orange.misc.SymMatrix`
84
85
86.. class:: HierarchicalCluster
87
88    Represents a node in the clustering tree, as returned by
89    :obj:`HierarchicalClustering`
90
91    .. attribute:: branches
92
93        A list of sub-clusters (:class:`HierarchicalCluster` instances). If this
94        is a leaf node this attribute is `None`
95
96    .. attribute:: left
97
98        The left sub-cluster (defined only if there are only two branches).
99
100        .. note:: Same as ``branches[0]``
101
102    .. attribute:: right
103
104        The right sub-cluster (defined only if there are only two branches).
105
106        .. note:: Same as ``branches[1]``
107
108    .. attribute:: height
109
110        Height of the cluster (distance between the sub-clusters).
111
112    .. attribute:: mapping
113
114        A list of indices to the original distance matrix. It is the same
115        for all clusters in the hierarchy - it simply represents the indices
116        ordered according to the clustering.
117
118    .. attribute:: first
119    .. attribute:: last
120
121        ``first`` and ``last`` are indices into the elements of ``mapping`` that
122        belong to that cluster. (Seems weird, but is trivial - wait for the
123        examples. On the other hand, you probably won't need to understand this
124        anyway).
125
126    .. method:: __len__()
127
128        Asking for the length of the cluster gives the number of the objects
129        belonging to it. This equals ``last - first``.
130
131    .. method:: __getitem__(index)
132
133        By indexing the cluster we address its elements; these are either
134        indices or objects (you'll understand this after seeing the examples).
135        For instance cluster[2] gives the third element of the cluster, and
136        list(cluster) will return the cluster elements as a list. The cluster
137        elements are read-only. To actually modify them, you'll have to go
138        through mapping, as described below. This is intentionally complicated
139        to discourage a naive user from doing what he does not understand.
140
141    .. method:: swap()
142
143        Swaps the ``left`` and the ``right`` subcluster; obviously this will
144        report an error when the cluster has more than two subclusters. This
145        function changes the mapping and first and last of all clusters below
146        this one and thus needs O(len(cluster)) time.
147
148    .. method:: permute(permutation)
149
150        Permutes the subclusters. Permutation gives the order in which the
151        subclusters will be arranged. As for swap, this function changes the
152        mapping and first and last of all clusters below this one.
153
154
155Example 1 - Toy matrix
156----------------------
157
158Let us construct a simple distance matrix and run clustering on it.
159
160.. literalinclude:: code/hierarchical-example.py
161    :lines: 1-14
162
163Root is a root of the cluster hierarchy. We can print using a
164simple recursive function.
165
166.. literalinclude:: code/hierarchical-example.py
167    :lines: 16-20
168
169The output is not exactly nice, but it will have to do. Our clustering,
170printed by calling printClustering(root) looks like this
171::
172
173    (((04)((57)(89)))((1(26))3))
174
175The elements are separated into two groups, the first containing elements
1760, 4, 5, 7, 8, 9, and the second 1, 2, 6, 3. The difference between them
177equals ``root.height``, 9.0 in our case. The first cluster is further
178divided onto 0 and 4 in one, and 5, 7, 8, 9 in the other subcluster...
179
180It is easy to print out the cluster's objects. Here's what's in the
181left subcluster of root.
182::
183
184    >>> for el in root.left:
185        ... print el,
186    0 4 5 7 8 9
187
188Everything that can be iterated over, can as well be cast into a list or
189tuple. Let us demonstrate this by writing a better function for printing
190out the clustering (which will also come handy for something else in a
191while). The one above supposed that each leaf contains a single object.
192This is not necessarily so; instead of printing out the first (and
193supposedly the only) element of cluster, cluster[0], we shall print
194it out as a tuple.
195
196.. literalinclude:: code/hierarchical-example.py
197    :lines: 22-26
198
199The distance matrix could have been given a list of objects. We could,
200for instance, put
201
202.. literalinclude:: code/hierarchical-example.py
203    :lines: 28-29
204
205above calling the HierarchicalClustering.
206
207.. note:: This code will actually trigger a warning;
208    to avoid it, use matrix.setattr("objects", ["Ann", "Bob"....
209..    Why this is needed is explained in the page on `Orange peculiarities`_. TODO: Create page on Orange Peculiarities.
210
211If we've forgotten to store the objects into matrix prior to clustering,
212nothing is lost. We can add it into clustering later, by
213
214.. literalinclude:: code/hierarchical-example.py
215    :lines: 31
216
217So, what do these "objects" do? Call printClustering(root) again and you'll
218see. Or, let us print out the elements of the first left cluster, as we did
219before.
220::
221
222    >>> for el in root.left:
223        ... print el,
224    Ann Eve Fred Hue Ivy Jon
225
226If objects are given, the cluster's elements, as got by indexing
227(eg root.left[2]) or by iteration, as in the above case, won't be
228indices but the elements we clustered. If we put an ExampleTable
229into objects, root.left[-1] will be the last example of the first
230left cluster.
231
232Now for swapping and permutations. ::
233
234    >>> printClustering(root)
235    ((('Ann''Eve')(('Fred''Hue')('Ivy''Jon')))(('Bob'('Curt''Greg'))'Danny'))
236    >>> root.left.swap()
237    >>> printClustering(root)
238    (((('Fred''Hue')('Ivy''Jon'))('Ann''Eve'))(('Bob'('Curt''Greg'))'Danny'))
239    >>> root.permute([1, 0])
240    >>> printClustering(root)
241    ((('Bob'('Curt''Greg'))'Danny')((('Fred''Hue')('Ivy''Jon'))('Ann''Eve')))
242
243Calling ``root.left.swap`` reversed the order of subclusters of ``root.left``
244and ``root.permute([1, 0])`` (which is equivalent to ``root.swap`` - there
245aren't many possible permutations of two elements) reverses the order
246of ``root.left`` and ``root.right``.
247
248Let us write function for cluster pruning.
249
250.. literalinclude:: code/hierarchical-example.py
251    :lines: 33-39
252
253We shall use ``printClustering2`` here, since we can have multiple elements
254in a leaf of the clustering hierarchy. ::
255
256    >>> prune(root, 9)
257    >>> print printClustering2(root)
258    ((('Bob', 'Curt', 'Greg')('Danny',))(('Fred', 'Hue', 'Ivy', 'Jon')('Ann', 'Eve')))
259
260We've ended up with four clusters. Need a list of clusters?
261Here's the function.
262
263.. literalinclude:: code/hierarchical-example.py
264    :lines: 41-51
265
266The function returns a list of lists, in our case
267``[['Bob', 'Curt', 'Greg'], ['Danny'], ['Fred', 'Hue', 'Ivy', 'Jon'], ['Ann', 'Eve']]``
268If there were no ``objects`` the list would contains indices instead of names.
269
270Example 2 - Clustering of examples
271----------------------------------
272
273The most common things to cluster are certainly examples. To show how to
274this is done, we shall now load the Iris data set, initialize a distance
275matrix with the distances measure by :class:`Euclidean`
276and cluster it with average linkage. Since we don't need the matrix,
277we shall let the clustering overwrite it (not that it's needed for
278such a small data set as Iris).
279
280.. literalinclude:: code/hierarchical-example-2.py
281    :lines: 1-15
282
283Note that we haven't forgotten to set the ``matrix.objects``. We did it
284through ``matrix.setattr`` to avoid the warning. Let us now prune the
285clustering using the function we've written above, and print out the
286clusters.
287
288.. literalinclude:: code/hierarchical-example-2.py
289    :lines: 16-20
290
291Since the printout is pretty long, it might be more informative to just
292print out the class distributions for each cluster.
293
294.. literalinclude:: code/hierarchical-example-2.py
295    :lines: 22-26
296
297Here's what it shows. ::
298
299    Iris-setosa:  49    Iris-versicolor:   0    Iris-virginica:   0
300    Iris-setosa:   1    Iris-versicolor:   0    Iris-virginica:   0
301    Iris-setosa:   0    Iris-versicolor:  50    Iris-virginica:  17
302    Iris-setosa:   0    Iris-versicolor:   0    Iris-virginica:  33
303
304Note something else: ``listOfClusters`` does not return a list of
305:class:`Orange.data.Table`, but a list of lists of instances. Therefore,
306in the above script, cluster is a list of examples, not an ``Table``, but
307it gets converted to it automatically when the function is called.
308Most Orange functions will do this for you automatically. You can, for
309instance, call a learning algorithms, passing a cluster as an argument.
310It won't mind. If you, however, want to have a list of table, you can
311easily convert the list by
312
313.. literalinclude:: code/hierarchical-example-2.py
314    :lines: 28
315
316Finally, if you are dealing with examples, you may want to take the function
317``listOfClusters`` and replace ``alist.append(list(cluster))`` by
318``alist.append(Orange.data.Table(cluster))``. This function is less general,
319it will fail if objects are not of type :class:`Orange.data.Instance`.
320However, instead of list of lists, it will return a list of tables.
321
322How the data in ``HierarchicalCluster`` is really stored?
323---------------------------------------------------------
324
325To demonstrate how the data in clusters is stored, we shall continue with
326the clustering we got in the first example. ::
327
328    >>> del root.mapping.objects
329    >>> print printClustering(root)
330    (((1(26))3)(((57)(89))(04)))
331    >>> print root.mapping
332    <1, 2, 6, 3, 5, 7, 8, 9, 0, 4>
333    >>> print root.left.first
334    0
335    >>> print root.left.last
336    4
337    >>> print root.left.mapping[root.left.first:root.left.last]
338    <1, 2, 6, 3>
339    >>> print root.left.left.first
340    0
341    >>> print root.left.left.last
342    3
343
344We removed objects to just to see more clearly what is going on.
345``mapping`` is an ordered list of indices to the rows/columns of distance
346matrix (and, at the same time, indices into objects, if they exist). Each
347cluster's fields ``first`` and ``last`` are indices into mapping, so the
348clusters elements are actually
349``cluster.mapping[cluster.first:cluster.last]``. ``cluster[i]`` therefore
350returns ``cluster.mapping[cluster.first+i]`` or, if objects are specified,
351``cluster.objects[cluster.mapping[cluster.first+i]]``. Space consumption
352is minimal since all clusters share the same objects ``mapping`` and
353``objects``.
354
355
356Subclusters are ordered so that ``cluster.left.last`` always equals
357``cluster.right.first`` or, in general, ``cluster.branches[i].last``
358equals ``cluster.branches[i+1].first``.
359
360
361Swapping and permutation do three things: change the order of elements in
362``branches``, permute the corresponding regions in ``mapping`` and adjust
363the ``first`` and ``last`` for all the clusters below. For the latter, when
364subclusters of cluster are permuted, the entire subtree starting at
365``cluster.branches[i]`` is moved by the same offset.
366
367
368The hierarchy of objects that represent a clustering is open, everything is
369accessible from Python. You can write your own clustering algorithms that
370build this same structure, or you can use Orange's clustering and then do to
371the structure anything you want. For instance prune it, as we have shown
372earlier. However, it is easy to do things wrong: shuffle the mapping, for
373instance, and forget to adjust the ``first`` and ``last`` pointers. Orange
374does some checking for the internal consistency, but you are surely smarter
375and can find a way to crash it. For instance, just create a cycle in the
376structure, call ``swap`` for some cluster above the cycle and you're there.
377But don't blame it on me then.
378
379
380Utility Functions
381-----------------
382
383.. autofunction:: clustering
384.. autofunction:: clustering_features
385.. autofunction:: cluster_to_list
386.. autofunction:: top_clusters
387.. autofunction:: top_cluster_membership
388.. autofunction:: order_leaves
389
390.. autofunction:: postorder
391.. autofunction:: preorder
392.. autofunction:: dendrogram_layout
393.. autofunction:: dendrogram_draw
394.. autofunction:: clone
395.. autofunction:: prune
396.. autofunction:: pruned
397.. autofunction:: cluster_depths
398.. autofunction:: feature_distance_matrix
399.. autofunction:: joining_cluster
400.. autofunction:: cophenetic_distances
401.. autofunction:: cophenetic_correlation
402
403"""
404
405import orange
406import Orange
407from Orange.core import HierarchicalClustering, \
408                        HierarchicalCluster, \
409                        HierarchicalClusterList
410
411from Orange.misc import progress_bar_milestones, deprecated_keywords
412
413import sys
414
415SINGLE = HierarchicalClustering.Single
416AVERAGE = HierarchicalClustering.Average
417COMPLETE = HierarchicalClustering.Complete
418WARD = HierarchicalClustering.Ward
419
420def clustering(data,
421               distance_constructor=Orange.distance.Euclidean,
423               order=False,
424               progress_callback=None):
425    """ Return a hierarchical clustering of the instances in a data set.
426
427    :param data: Input data table for clustering.
428    :type data: :class:`Orange.data.Table`
429    :param distance_constructor: Instance distance constructor
430    :type distance_constructor: :class:`Orange.distance.DistanceConstructor`
431    :param linkage: Linkage flag. Must be one of global module level flags:
432
433        - SINGLE
434        - AVERAGE
435        - COMPLETE
436        - WARD
437
439    :param order: If `True` run `order_leaves` on the resulting clustering.
440    :type order: bool
441    :param progress_callback: A function (taking one argument) to use for
442        reporting the on the progress.
443    :type progress_callback: function
444
445    :rtype: :class:`HierarchicalCluster`
446
447    """
448    distance = distance_constructor(data)
449    matrix = Orange.misc.SymMatrix(len(data))
450    for i in range(len(data)):
451        for j in range(i+1):
452            matrix[i, j] = distance(data[i], data[j])
453    root = HierarchicalClustering(matrix, linkage=linkage, progress_callback=(lambda value, obj=None: progress_callback(value*100.0/(2 if order else 1))) if progress_callback else None)
454    if order:
455        order_leaves(root, matrix, progress_callback=(lambda value: progress_callback(50.0 + value/2)) if progress_callback else None)
456    return root
457
458clustering = \
459    deprecated_keywords({"distanceConstructor": "distance_constructor",
460                         "progressCallback": "progress_callback"})(clustering)
461
462
463def clustering_features(data, distance=None, linkage=orange.HierarchicalClustering.Average, order=False, progress_callback=None):
464    """ Return hierarchical clustering of attributes in a data set.
465
466    :param data: Input data table for clustering.
467    :type data: :class:`Orange.data.Table`
468    :param distance: Attribute distance constructor
469        .. note:: currently not used.
470    :param linkage: Linkage flag. Must be one of global module level flags:
471
472        - SINGLE
473        - AVERAGE
474        - COMPLETE
475        - WARD
476
478    :param order: If `True` run `order_leaves` on the resulting clustering.
479    :type order: bool
480    :param progress_callback: A function (taking one argument) to use for
481        reporting the on the progress.
482    :type progress_callback: function
483
484    :rtype: :class:`HierarchicalCluster`
485
486    """
487    matrix = Orange.misc.SymMatrix(len(data.domain.attributes))
488    for a1 in range(len(data.domain.attributes)):
489        for a2 in range(a1):
490            matrix[a1, a2] = (1.0 - orange.PearsonCorrelation(a1, a2, data, 0).r) / 2.0
491    root = orange.HierarchicalClustering(matrix, linkage=linkage, progress_callback=(lambda value, obj=None: progress_callback(value*100.0/(2 if order else 1))) if progress_callback else None)
492    if order:
493        order_leaves(root, matrix, progressCallback=(lambda value: progress_callback(50.0 + value/2)) if progress_callback else None)
494    return root
495
496clustering_features = \
497    deprecated_keywords({"progressCallback":"progress_callback"})(clustering_features)
498
499
500def cluster_to_list(node, prune=None):
501    """ Return a list of clusters down from the node of hierarchical clustering.
502
503    :param node: Cluster node.
504    :type node: :class:`HierarchicalCluster`
505    :param prune: If not `None` it must be a positive integer. Any cluster
506        with less then `prune` items will be left out of the list.
507    :type node: int or `NoneType`
508
509    :rtype: list of :class:`HierarchicalCluster` instances
510
511    """
512    if prune:
513        if len(node) <= prune:
514            return []
515    if node.branches:
516        return [node] + cluster_to_list(node.left, prune) + cluster_to_list(node.right, prune)
517    return [node]
518
519
520def top_clusters(root, k):
521    """ Return k topmost clusters from hierarchical clustering.
522
523    :param root: Root cluster.
524    :type root: :class:`HierarchicalCluster`
525    :param k: Number of top clusters.
526    :type k: int
527
528    :rtype: list of :class:`HierarchicalCluster` instances
529
530    """
531    candidates = set([root])
532    while len(candidates) < k:
533        repl = max([(c.height, c) for c in candidates if c.branches])[1]
537    return candidates
538
539def top_cluster_membership(root, k):
540    """ Return data instances' cluster membership (list of indices) to k topmost clusters.
541
542    :param root: Root cluster.
543    :type root: :class:`HierarchicalCluster`
544    :param k: Number of top clusters.
545    :type k: int
546
547    :rtype: list-of-int
548
549    """
550    clist = top_clusters(root, k)
551    cmap = [None] * len(root)
552    for i, c in enumerate(clist):
553        for e in c:
554            cmap[e] = i
555    return cmap
556
557def order_leaves_py(tree, matrix, progress_callback=None):
558    """Order the leaves in the clustering tree.
559
560    (based on Ziv Bar-Joseph et al. (Fast optimal leaf ordering for hierarchical clustering))
561
562    :param tree: Binary hierarchical clustering tree.
563    :type tree: :class:`HierarchicalCluster`
564    :param matrix: SymMatrix that was used to compute the clustering.
565    :type matrix: :class:`Orange.misc.SymMatrix`
566    :param progress_callback: Function used to report on progress.
567    :type progress_callback: function
568
569    .. note:: The ordering is done inplace.
570
571    """
572
573    objects = getattr(tree.mapping, "objects", None)
574    tree.mapping.setattr("objects", range(len(tree)))
575    M = {}
576    ordering = {}
577    visited_clusters = set()
578
579#    def opt_ordering_recursive(tree):
580#        if len(tree)==1:
581#            for leaf in tree:
582#                M[tree, leaf, leaf] = 0
583#        else:
584#            opt_ordering_recursive(tree.left)
585#            opt_ordering_recursive(tree.right)
586#
587#            Vl = set(tree.left)
588#            Vr = set(tree.right)
589#            Vlr = set(tree.left.right or tree.left)
590#            Vll = set(tree.left.left or tree.left)
591#            Vrr = set(tree.right.right or tree.right)
592#            Vrl = set(tree.right.left or tree.right)
593#            other = lambda e, V1, V2: V2 if e in V1 else V1
594#            tree_left, tree_right = tree.left, tree.right
595#            for u in Vl:
596#                for w in Vr:
597##                    if True: #Improved search
598#                    C = min([matrix[m, k] for m in other(u, Vll, Vlr) for k in other(w, Vrl, Vrr)])
599#                    ordered_m = sorted(other(u, Vll, Vlr), key=lambda m: M[tree_left, u, m])
600#                    ordered_k = sorted(other(w, Vrl, Vrr), key=lambda k: M[tree_right, w, k])
601#                    k0 = ordered_k[0]
602#                    cur_min = 1e30000
603#                    cur_m = cur_k = None
604#                    for m in ordered_m:
605#                        if M[tree_left, u, m] + M[tree_right, w, k0] + C >= cur_min:
606#                            break
607#                        for k in  ordered_k:
608#                            if M[tree_left, u, m] + M[tree_right, w, k] + C >= cur_min:
609#                                break
610#                            test_min = M[tree_left, u, m] + M[tree_right, w, k] + matrix[m, k]
611#                            if cur_min > test_min:
612#                                cur_min = test_min
613#                                cur_m = m
614#                                cur_k = k
615#                    M[tree, u, w] = M[tree, w, u] = cur_min
616#                    ordering[tree, u, w] = (tree_left, u, cur_m, tree_right, w, cur_k)
617#                    ordering[tree, w, u] = (tree_right, w, cur_k, tree_left, u, cur_m)
618##                    else:
619##                    def MFunc((m, k)):
620##                        return M[tree_left, u, m] + M[tree_right, w, k] + matrix[m, k]
621##                    m, k = min([(m, k) for m in other(u, Vll, Vlr) for k in other(w, Vrl, Vrr)], key=MFunc)
622##                    M[tree, u, w] = M[tree, w, u] = MFunc((m, k))
623##                    ordering[tree, u, w] = (tree_left, u, m, tree_right, w, k)
624##                    ordering[tree, w, u] = (tree_right, w, k, tree_left, u, m)
625#
626#            if progressCallback:
627#                progressCallback(100.0 * len(visited_clusters) / len(tree.mapping))
629#
630#    with recursion_limit(sys.getrecursionlimit() + len(tree)):
631#        opt_ordering_recursive(tree)
632
633    def opt_ordering_iterative(tree):
634        if len(tree)==1:
635            for leaf in tree:
636                M[tree, leaf, leaf] = 0
637        else:
638#            _optOrdering(tree.left)
639#            _optOrdering(tree.right)
640
641            Vl = set(tree.left)
642            Vr = set(tree.right)
643            Vlr = set(tree.left.right or tree.left)
644            Vll = set(tree.left.left or tree.left)
645            Vrr = set(tree.right.right or tree.right)
646            Vrl = set(tree.right.left or tree.right)
647            other = lambda e, V1, V2: V2 if e in V1 else V1
648            tree_left, tree_right = tree.left, tree.right
649            for u in Vl:
650                for w in Vr:
651#                    if True: #Improved search
652                        C = min([matrix[m, k] for m in other(u, Vll, Vlr) for k in other(w, Vrl, Vrr)])
653                        ordered_m = sorted(other(u, Vll, Vlr), key=lambda m: M[tree_left, u, m])
654                        ordered_k = sorted(other(w, Vrl, Vrr), key=lambda k: M[tree_right, w, k])
655                        k0 = ordered_k[0]
656                        cur_min = 1e30000
657                        cur_m = cur_k = None
658                        for m in ordered_m:
659                            if M[tree_left, u, m] + M[tree_right, w, k0] + C >= cur_min:
660                                break
661                            for k in  ordered_k:
662                                if M[tree_left, u, m] + M[tree_right, w, k] + C >= cur_min:
663                                    break
664                                test_min = M[tree_left, u, m] + M[tree_right, w, k] + matrix[m, k]
665                                if cur_min > test_min:
666                                    cur_min = test_min
667                                    cur_m = m
668                                    cur_k = k
669                        M[tree, u, w] = M[tree, w, u] = cur_min
670                        ordering[tree, u, w] = (tree_left, u, cur_m, tree_right, w, cur_k)
671                        ordering[tree, w, u] = (tree_right, w, cur_k, tree_left, u, cur_m)
672#                    else:
673#                        def MFunc((m, k)):
674#                            return M[tree_left, u, m] + M[tree_right, w, k] + matrix[m, k]
675#                        m, k = min([(m, k) for m in other(u, Vll, Vlr) for k in other(w, Vrl, Vrr)], key=MFunc)
676#                        M[tree, u, w] = M[tree, w, u] = MFunc((m, k))
677#                        ordering[tree, u, w] = (tree_left, u, m, tree_right, w, k)
678#                        ordering[tree, w, u] = (tree_right, w, k, tree_left, u, m)
679
680#            if progressCallback:
681#                progressCallback(100.0 * len(visited_clusters) / len(tree.mapping))
683
684    subtrees = postorder(tree)
685    milestones = progress_bar_milestones(len(subtrees), 1000)
686
687    for i, subtree in enumerate(subtrees):
688        opt_ordering_iterative(subtree)
689        if progress_callback and i in milestones:
690            progress_callback(100.0 * i / len(subtrees))
691
692#    def order_recursive(tree, u, w):
693#        """ Order the tree based on the computed optimal ordering.
694#        """
695#        if len(tree)==1:
696#            return
697#        left, u, m, right, w, k = ordering[tree, u, w]
698#        if len(left)>1 and m not in left.right:
699#            left.swap()
700#        order_recursive(left, u, m)
701#
702#        if len(right)>1 and k not in right.left:
703#            right.swap()
704#        order_recursive(right, k, w)
705
706    def order_iterative(tree, u, w):
707        """ Order the tree based on the computed optimal ordering.
708        """
709        opt_uw = {tree: (u, w)}
710        for subtree in preorder(tree):
711            if subtree.branches:
712                u, w = opt_uw[subtree]
713                left, u, m, right, w, k = ordering[subtree, u, w]
714                opt_uw[left] = (u, m)
715                opt_uw[right] = (k, w)
716
717                if left.branches and m not in left.right:
718                    left.swap()
719
720                if right.branches and k not in right.left:
721                    right.swap()
722
723    u, w = min([(u, w) for u in tree.left for w in tree.right], key=lambda (u, w): M[tree, u, w])
724
725##    print "M(v) =", M[tree, u, w]
726
727#    with recursion_limit(sys.getrecursionlimit() + len(tree)):
728#        order_recursive(tree, u, w)
729
730    order_iterative(tree, u, w)
731
732
733#    def _check(tree, u, w):
734#        if len(tree)==1:
735#            return
736#        left, u, m, right, w, k = ordering[tree, u, w]
737#        if tree[0] == u and tree[-1] == w:
738#            _check(left, u, m)
739#            _check(right, k, w)
740#        else:
741#            print "Error:", u, w, tree[0], tree[-1]
742#
743#    with recursion_limit(sys.getrecursionlimit() + len(tree)):
744#        _check(tree, u ,w)
745
746    if objects:
747        tree.mapping.setattr("objects", objects)
748
749
750def order_leaves_cpp(tree, matrix, progress_callback=None):
751    """ Order the leaves in the clustering tree (C++ implementation).
752
753    (based on Ziv Bar-Joseph et al. (Fast optimal leaf ordering for hierarchical clustering))
754
755    :param tree: Binary hierarchical clustering tree.
756    :type tree: :class:`HierarchicalCluster`
757    :param matrix: SymMatrix that was used to compute the clustering.
758    :type matrix: :class:`Orange.misc.SymMatrix`
759    :param progress_callback: Function used to report on progress.
760    :type progress_callback: function
761
762    .. note:: The ordering is done inplace.
763
764    """
765    node_count = iter(range(len(tree)))
766
767    if progress_callback is not None:
768        def p(*args):
769            progress_callback(100.0 * node_count.next() / len(tree))
770    else:
771        p = None
772
773    Orange.core.HierarchicalClusterOrdering(tree, matrix, progress_callback=p)
774
775order_leaves_cpp = deprecated_keywords({"progressCallback":"progress_callback"})(order_leaves_cpp)
776order_leaves_py = deprecated_keywords({"progressCallback":"progress_callback"})(order_leaves_py)
777
778order_leaves = order_leaves_cpp
779
780"""
781Matplotlib dendrogram ploting. This is mostly untested,
782use dendrogram_draw funciton instead of this.
783
784"""
785try:
786    import numpy
787except ImportError:
788    numpy = None
789
790try:
791    import matplotlib
792    from matplotlib.figure import Figure
793    from matplotlib.table import Table, Cell
794    from matplotlib.text import Text
795    from matplotlib.artist import Artist
796##    import  matplotlib.pyplot as plt
797except (ImportError, IOError, RuntimeError), ex:
798    matplotlib = None
799    Text , Artist, Table, Cell = object, object, object, object
800
801class TableCell(Cell):
803    def __init__(self, *args, **kwargs):
804        Cell.__init__(self, *args, **kwargs)
805        self._text.set_clip_on(True)
806
807class TablePlot(Table):
808    max_fontsize = 12
809    def __init__(self, xy, axes=None, bbox=None):
810        Table.__init__(self, axes or plt.gca(), bbox=bbox)
811        self.xy = xy
812        self.set_transform(self._axes.transData)
813        self._fixed_widhts = None
814        import matplotlib.pyplot as plt
815        self.max_fontsize = plt.rcParams.get("font.size", 12)
816
817    def add_cell(self, row, col, *args, **kwargs):
818        xy = (0,0)
819
820        cell = TableCell(xy, *args, **kwargs)
821        cell.set_figure(self.figure)
822        cell.set_transform(self.get_transform())
823
824        cell.set_clip_on(True)
825        cell.set_clip_box(self._axes.bbox)
826        cell._text.set_clip_box(self._axes.bbox)
827        self._cells[(row, col)] = cell
828
829    def draw(self, renderer):
830        if not self.get_visible(): return
831        self._update_positions(renderer)
832
833        keys = self._cells.keys()
834        keys.sort()
835        for key in keys:
836            self._cells[key].draw(renderer)
837
838    def _update_positions(self, renderer):
839        keys = numpy.array(self._cells.keys())
840        cells = numpy.array([[self._cells.get((row, col), None) for col in range(max(keys[:, 1] + 1))] \
841                             for row in range(max(keys[:, 0] + 1))])
842
843        widths = self._get_column_widths(renderer)
844        x = self.xy[0] + numpy.array([numpy.sum(widths[:i]) for i in range(len(widths))])
845        y = self.xy[1] - numpy.arange(cells.shape[0]) - 0.5
846
847        for i in range(cells.shape[0]):
848            for j in range(cells.shape[1]):
849                cells[i, j].set_xy((x[j], y[i]))
850                cells[i, j].set_width(widths[j])
851                cells[i, j].set_height(1.0)
852
853        self._width = numpy.sum(widths)
854        self._height = cells.shape[0]
855
856        self.pchanged()
857
858    def _get_column_widths(self, renderer):
859        keys = numpy.array(self._cells.keys())
860        widths = numpy.zeros(len(keys)).reshape((numpy.max(keys[:,0]+1), numpy.max(keys[:,1]+1)))
861        fontSize = self._calc_fontsize(renderer)
862        for (row, col), cell in self._cells.items():
863            cell.set_fontsize(fontSize)
864            l, b, w, h = cell._text.get_window_extent(renderer).bounds
865            transform = self._axes.transData.inverted()
866            x1, _ = transform.transform_point((0, 0))
867            x2, _ = transform.transform_point((w + w*TableCell.PAD + 10, 0))
868            w = abs(x1 - x2)
869            widths[row, col] = w
870        return numpy.max(widths, 0)
871
872    def _calc_fontsize(self, renderer):
873        transform = self._axes.transData
874        _, y1 = transform.transform_point((0, 0))
875        _, y2 = transform.transform_point((0, 1))
876        return min(max(int(abs(y1 - y2)*0.85) ,4), self.max_fontsize)
877
878    def get_children(self):
879        return self._cells.values()
880
881    def get_bbox(self):
882        return matplotlib.transform.Bbox([self.xy[0], self.xy[1], self.xy[0] + 10, self.xy[1] + 180])
883
884class DendrogramPlotPylab(object):
885    def __init__(self, root, data=None, labels=None, dendrogram_width=None, heatmap_width=None, label_width=None, space_width=None, border_width=0.05, plot_attr_names=False, cmap=None, params={}):
886        if not matplotlib:
887            raise ImportError("Could not import matplotlib module. Please make sure matplotlib is installed on your system.")
888        import matplotlib.pyplot as plt
889        self.plt = plt
890        self.root = root
891        self.data = data
892        self.labels = labels if labels else [str(i) for i in range(len(root))]
893        self.dendrogram_width = dendrogram_width
894        self.heatmap_width = heatmap_width
895        self.label_width = label_width
896        self.space_width = space_width
897        self.border_width = border_width
898        self.params = params
899        self.plot_attr_names = plot_attr_names
900
901    def plotDendrogram(self):
902        self.text_items = []
903        def draw_tree(tree):
904            if tree.branches:
905                points = []
906                for branch in tree.branches:
907                    center = draw_tree(branch)
908                    self.plt.plot([center[0], tree.height], [center[1], center[1]], color="black")
909                    points.append(center)
910                self.plt.plot([tree.height, tree.height], [points[0][1], points[-1][1]], color="black")
911                return (tree.height, (points[0][1] + points[-1][1])/2.0)
912            else:
913                return (0.0, tree.first)
914        draw_tree(self.root)
915
916    def plotHeatMap(self):
917        import numpy.ma as ma
918        import numpy
919        dx, dy = self.root.height, 0
920        fx, fy = self.root.height/len(self.data.domain.attributes), 1.0
921        data, c, w = self.data.toNumpyMA()
922        data = (data - ma.min(data))/(ma.max(data) - ma.min(data))
923        x = numpy.arange(data.shape[1] + 1)/float(numpy.max(data.shape))
924        y = numpy.arange(data.shape[0] + 1)/float(numpy.max(data.shape))*len(self.root)
925        self.heatmap_width = numpy.max(x)
926
927        X, Y = numpy.meshgrid(x, y - 0.5)
928
929        self.meshXOffset = numpy.max(X)
930
931        self.plt.jet()
932        mesh = self.plt.pcolormesh(X, Y, data[self.root.mapping], edgecolor="b", linewidth=2)
933
934        if self.plot_attr_names:
935            names = [attr.name for attr in self.data.domain.attributes]
936            self.plt.xticks(numpy.arange(data.shape[1] + 1)/float(numpy.max(data.shape)), names)
937        self.plt.gca().xaxis.tick_top()
938        for label in self.plt.gca().xaxis.get_ticklabels():
939            label.set_rotation(45)
940
941        for tick in self.plt.gca().xaxis.get_major_ticks():
942            tick.tick1On = False
943            tick.tick2On = False
944
945    def plotLabels_(self):
946        import numpy
947##        self.plt.yticks(numpy.arange(len(self.labels) - 1, 0, -1), self.labels)
948##        for tick in self.plt.gca().yaxis.get_major_ticks():
949##            tick.tick1On = False
950##            tick.label1On = False
951##            tick.label2On = True
952##        text = TableTextLayout(xy=(self.meshXOffset+1, len(self.root)), tableText=[[label] for label in self.labels])
953        text = TableTextLayout(xy=(self.meshXOffset*1.005, len(self.root) - 1), tableText=[[label] for label in self.labels])
954        text.set_figure(self.plt.gcf())
956        self.plt.gca()._set_artist_props(text)
957
958    def plotLabels(self):
959##        table = TablePlot(xy=(self.meshXOffset*1.005, len(self.root) -1), axes=self.plt.gca())
960        table = TablePlot(xy=(0, len(self.root) -1), axes=self.plt.gca())
961        table.set_figure(self.plt.gcf())
962        for i,label in enumerate(self.labels):
963            table.add_cell(i, 0, width=1, height=1, text=label, loc="left", edgecolor="w")
964        table.set_zorder(0)
966        self.plt.gca()._set_artist_props(table)
967
968    def plot(self, filename=None, show=False):
969        self.plt.rcParams.update(self.params)
970        labelLen = max(len(label) for label in self.labels)
971        w, h = 800, 600
972        space = 0.01 if self.space_width == None else self.space_width
973        border = self.border_width
974        width = 1.0 - 2*border
975        height = 1.0 - 2*border
976        textLineHeight = min(max(h/len(self.labels), 4), self.plt.rcParams.get("font.size", 12))
977        maxTextLineWidthEstimate = textLineHeight*labelLen
978##        print maxTextLineWidthEstimate
979        textAxisWidthRatio = 2.0*maxTextLineWidthEstimate/w
980##        print textAxisWidthRatio
981        labelsAreaRatio = min(textAxisWidthRatio, 0.4) if self.label_width == None else self.label_width
982        x, y = len(self.data.domain.attributes), len(self.data)
983
984        heatmapAreaRatio = min(1.0*y/h*x/w, 0.3) if self.heatmap_width == None else self.heatmap_width
985        dendrogramAreaRatio = 1.0 - labelsAreaRatio - heatmapAreaRatio - 2*space if self.dendrogram_width == None else self.dendrogram_width
986
987        self.fig = self.plt.figure()
988        self.labels_offset = self.root.height/20.0
989        dendrogramAxes = self.plt.axes([border, border, width*dendrogramAreaRatio, height])
990        dendrogramAxes.xaxis.grid(True)
991        import matplotlib.ticker as ticker
992
993        dendrogramAxes.yaxis.set_major_locator(ticker.NullLocator())
994        dendrogramAxes.yaxis.set_minor_locator(ticker.NullLocator())
995        dendrogramAxes.invert_xaxis()
996        self.plotDendrogram()
997        heatmapAxes = self.plt.axes([border + width*dendrogramAreaRatio + space, border, width*heatmapAreaRatio, height], sharey=dendrogramAxes)
998
999        heatmapAxes.xaxis.set_major_locator(ticker.NullLocator())
1000        heatmapAxes.xaxis.set_minor_locator(ticker.NullLocator())
1001        heatmapAxes.yaxis.set_major_locator(ticker.NullLocator())
1002        heatmapAxes.yaxis.set_minor_locator(ticker.NullLocator())
1003
1004        self.plotHeatMap()
1005        labelsAxes = self.plt.axes([border + width*(dendrogramAreaRatio + heatmapAreaRatio + 2*space), border, width*labelsAreaRatio, height], sharey=dendrogramAxes)
1006        self.plotLabels()
1007        labelsAxes.set_axis_off()
1008        labelsAxes.xaxis.set_major_locator(ticker.NullLocator())
1009        labelsAxes.xaxis.set_minor_locator(ticker.NullLocator())
1010        labelsAxes.yaxis.set_major_locator(ticker.NullLocator())
1011        labelsAxes.yaxis.set_minor_locator(ticker.NullLocator())
1012        if filename:
1013            import matplotlib.backends.backend_agg
1014            canvas = matplotlib.backends.backend_agg.FigureCanvasAgg(self.fig)
1015            canvas.print_figure(filename)
1016        if show:
1017            self.plt.show()
1018
1019
1020"""
1021Dendrogram ploting using Orange.misc.render
1022"""
1023
1024from Orange.misc.render import EPSRenderer, ColorPalette
1025
1026class DendrogramPlot(object):
1027    """ A class for drawing dendrograms.
1028
1029    .. note:: ``dendrogram_draw`` function is a more convinient interface
1030        to the functionality provided by this class and.
1031
1032    Example::
1033
1034        a = DendrogramPlot(tree)
1035        a.plot("tree.png", format="png")
1036
1037    """
1038    def __init__(self, tree, attr_tree = None, labels=None, data=None, width=None, height=None, tree_height=None, heatmap_width=None, text_width=None,
1039                 spacing=2, cluster_colors={}, color_palette=ColorPalette([(255, 0, 0), (0, 255, 0)]), maxv=None, minv=None, gamma=None, renderer=EPSRenderer, **kwargs):
1040        self.tree = tree
1041        self.attr_tree = attr_tree
1042        if not labels:
1043            if data and data.domain.class_var:
1044                labels = [str(ex.getclass()) for ex in data]
1045            elif hasattr(tree.mapping, "objects"):
1046                labels = [str(obj) for obj in tree.mapping.objects]
1047            else:
1048                labels = [""] * len(tree)
1049        self.labels = labels
1050
1051#        self.attr_labels = [str(attr.name) for attr in data.domain.attributes] if not attr_labels and data else attr_labels or []
1052        self.data = data
1053        self.width, self.height = float(width) if width else None, float(height) if height else None
1054        self.tree_height = tree_height
1055        self.heatmap_width = heatmap_width
1056        self.text_width = text_width
1057        self.font_size = 10.0
1058        self.linespacing = 0.0
1059        self.cluster_colors = cluster_colors
1060        self.horizontal_margin = 10.0
1061        self.vertical_margin = 10.0
1062        self.spacing = float(spacing) if spacing else None
1063        self.color_palette = color_palette
1064        self.minv = minv
1065        self.maxv = maxv
1066        self.gamma = gamma
1067        self.set_matrix_color_schema(color_palette, minv, maxv, gamma)
1068        self.renderer = renderer
1069
1070    def set_matrix_color_schema(self, color_palette, minv, maxv, gamma=None):
1071        """ Set the matrix color scheme.
1072        """
1073        if isinstance(color_palette, ColorPalette):
1074            self.color_palette = color_palette
1075        else:
1076            self.color_palette = ColorPalette(color_palette)
1077        self.minv = minv
1078        self.maxv = maxv
1079        self.gamma = gamma
1080
1081    def color_shema(self):
1082        vals = [float(val) for ex in self.data for val in ex if not val.isSpecial() and val.variable.varType==orange.VarTypes.Continuous] or [0]
1083        avg = sum(vals)/len(vals)
1084
1085        maxVal = self.maxv if self.maxv else max(vals)
1086        minVal = self.minv if self.minv else min(vals)
1087
1088        def _colorSchema(val):
1089            if val.isSpecial():
1090                return self.color_palette(None)
1091            elif val.variable.varType==orange.VarTypes.Continuous:
1092                r, g, b = self.color_palette((float(val) - minVal) / abs(maxVal - minVal), gamma=self.gamma)
1093            elif val.variable.varType==orange.VarTypes.Discrete:
1094                r = g = b = int(255.0*float(val)/len(val.variable.values))
1095            return (r, g, b)
1096        return _colorSchema
1097
1098    def layout(self):
1099        height_final = False
1100        width_final = False
1101        tree_height = self.tree_height or 100
1102        if self.height:
1103            height, height_final = self.height, True
1104            heatmap_height = height - (tree_height + self.spacing if self.attr_tree else 0) - 2 * self.horizontal_margin
1105            font_size =  heatmap_height / len(self.labels) #self.font_size or (height - (tree_height + self.spacing if self.attr_tree else 0) - 2 * self.horizontal_margin) / len(self.labels)
1106        else:
1107            font_size = self.font_size
1108            heatmap_height = font_size * len(self.labels)
1109            height = heatmap_height + (tree_height + self.spacing if self.attr_tree else 0) + 2 * self.horizontal_margin
1110
1111        text_width = self.text_width or max([len(label) for label in self.labels] + [0]) * font_size #max([self.renderer.string_size_hint(label) for label in self.labels])
1112
1113        if self.width:
1114            width = self.width
1115            heatmap_width = width - 2 * self.vertical_margin - tree_height - (2 if self.data else 1) * self.spacing - text_width if self.data else 0
1116        else:
1117            heatmap_width = len(self.data.domain.attributes) * heatmap_height / len(self.data) if self.data else 0
1118            width = 2 * self.vertical_margin + tree_height + (heatmap_width + self.spacing if self.data else 0) + self.spacing + text_width
1119
1120        return width, height, tree_height, heatmap_width, heatmap_height, text_width, font_size
1121
1122    def plot(self, filename="graph.eps", **kwargs):
1123        width, height, tree_height, heatmap_width, heatmap_height, text_width, font_size = self.layout()
1124        heatmap_cell_height = heatmap_height / len(self.labels)
1125
1126        heatmap_cell_width = 0.0 if not self.data else heatmap_width / len(self.data.domain.attributes)
1127
1128        self.renderer = self.renderer(width, height)
1129
1130        def draw_tree(cluster, root, treeheight, treewidth, color):
1131            height = treeheight * cluster.height / root.height
1132            if cluster.branches:
1133                centers = []
1134                for branch in cluster.branches:
1135                    center = draw_tree(branch, root, treeheight, treewidth, self.cluster_colors.get(branch, color))
1136                    centers.append(center)
1137                    self.renderer.draw_line(center[0], center[1], center[0], height, stroke_color = self.cluster_colors.get(branch, color))
1138
1139                self.renderer.draw_line(centers[0][0], height, centers[-1][0], height, stroke_color = self.cluster_colors.get(cluster, color))
1140                return (centers[0][0] + centers[-1][0]) / 2.0, height
1141            else:
1142                return float(treewidth) * cluster.first / len(root), 0.0
1143        self.renderer.save_render_state()
1144        self.renderer.translate(self.vertical_margin + tree_height, self.horizontal_margin + (tree_height + self.spacing if self.attr_tree else 0) + heatmap_cell_height / 2.0)
1145        self.renderer.rotate(90)
1146#        print self.renderer.transform()
1147        draw_tree(self.tree, self.tree, tree_height, heatmap_height, self.cluster_colors.get(self.tree, (0,0,0)))
1148        self.renderer.restore_render_state()
1149        if self.attr_tree:
1150            self.renderer.save_render_state()
1151            self.renderer.translate(self.vertical_margin + tree_height + self.spacing + heatmap_cell_width / 2.0, self.horizontal_margin + tree_height)
1152            self.renderer.scale(1.0, -1.0)
1153#            print self.renderer.transform()
1154            draw_tree(self.attr_tree, self.attr_tree, tree_height, heatmap_width, self.cluster_colors.get(self.attr_tree, (0,0,0)))
1155            self.renderer.restore_render_state()
1156
1157        self.renderer.save_render_state()
1158        self.renderer.translate(self.vertical_margin + tree_height + self.spacing, self.horizontal_margin + (tree_height + self.spacing if self.attr_tree else 0))
1159#        print self.renderer.transform()
1160        if self.data:
1161            colorSchema = self.color_shema()
1162            for i, ii in enumerate(self.tree):
1163                ex = self.data[ii]
1164                for j, jj in enumerate((self.attr_tree if self.attr_tree else range(len(self.data.domain.attributes)))):
1165                    r, g, b = colorSchema(ex[jj])
1166                    self.renderer.draw_rect(j * heatmap_cell_width, i * heatmap_cell_height, heatmap_cell_width, heatmap_cell_height, fill_color=(r, g, b), stroke_color=(255, 255, 255))
1167
1168        self.renderer.translate(heatmap_width + self.spacing, heatmap_cell_height)
1169#        print self.renderer.transform()
1170        self.renderer.set_font("Times-Roman", font_size)
1171        for index in self.tree.mapping: #label in self.labels:
1172            self.renderer.draw_text(0.0, 0.0, self.labels[index])
1173            self.renderer.translate(0.0, heatmap_cell_height)
1174        self.renderer.restore_render_state()
1175        self.renderer.save(filename, **kwargs)
1176
1177
1178def dendrogram_draw(file, cluster, attr_cluster = None, labels=None, data=None,
1179                    width=None, height=None, tree_height=None,
1180                    heatmap_width=None, text_width=None,  spacing=2,
1181                    cluster_colors={}, color_palette=ColorPalette([(255, 0, 0), (0, 255, 0)]),
1182                    maxv=None, minv=None, gamma=None,
1183                    format=None):
1184    """ Plot the dendrogram to ``file``.
1185
1186    :param file: An  open file or a filename to store the image to.
1187    :type file: str or an file-like object suitable for writing.
1188
1189    :param cluster: An instance of :class:`HierarcicalCluster`
1190    :type cluster: :class:`HierarcicalCluster`
1191
1192    :param attr_cluster: An instance of :class:`HierarcicalCluster` for the attributes
1193        in ``data`` (unused if ``data`` is not supplied)
1194    :type attr_cluster: :class:`HierarcicalCluster`
1195
1196    :param labels: Labels for the ``cluster`` leaves.
1197    :type labels: list-of-strings
1198
1199    :param data: A data table for the (optional) heatmap.
1200    :type data: :class:`Orange.data.Table`
1201
1202    :param width: Image width.
1203    :type width: int
1204
1205    :param height: Image height.
1206    :type height: int
1207
1208    :param tree_height: Dendrogram tree height in the image.
1209    :type tree_height: int
1210
1211    :param heatmap_width: Heatmap width.
1212    :type heatmap_width: int
1213
1214    :param text_width: Text labels area width.
1215    :type text_width: int
1216
1217    :param spacing: Spacing between consecutive leaves.
1218    :type spacing: int
1219
1220    :param cluster_colors: A dictionary mapping :class:`HierarcicalCluster`
1221        instances in ``cluster`` to a RGB color 3-tuple.
1222    :type cluster_colors: dict
1223
1224    :param format: Output image format Currently supported formats are
1225        "png" (default), "eps" and "svg". You only need this arguement if the
1226        format cannot be deduced from the ``file`` argument.
1227
1228    :type format: str
1229
1230    """
1231    import os
1232    from Orange.misc.render import PILRenderer, EPSRenderer, SVGRenderer
1233    if isinstance(file, basestring):
1234        name, ext = os.path.splitext(file)
1235        format = ext.lower().lstrip(".") or format
1236
1237    if format is None:
1238        format = "png"
1239
1240    renderer = {"eps":EPSRenderer, "svg":SVGRenderer, "png":PILRenderer}.get(format, "png")
1241
1242    d = DendrogramPlot(cluster, attr_cluster, labels, data, width, height,
1243                       tree_height, heatmap_width, text_width, spacing,
1244                       cluster_colors, color_palette, maxv, minv, gamma,
1245                       renderer=renderer)
1246    if renderer is PILRenderer:
1247        d.plot(file, format=format)
1248    else:
1249        d.plot(file)
1250
1251def postorder(cluster):
1252    """ Return a post order list of clusters.
1253
1254    :param cluster: Cluster
1255    :type cluster: :class:`HierarchicalCluster`
1256
1257    :rtype: list of :class:`HierarchicalCluster` instances
1258
1259    """
1260    order = []
1261    visited = set()
1262    stack = [cluster]
1263    while stack:
1264        cluster = stack.pop(0)
1265
1266        if cluster.branches:
1267            if cluster in visited:
1268                order.append(cluster)
1269            else:
1270                stack = cluster.branches + [cluster] + stack
1272        else:
1273            order.append(cluster)
1275    return order
1276
1277
1278def preorder(cluster):
1279    """ Return a pre order list of clusters.
1280
1281    :param cluster: Cluster
1282    :type cluster: :class:`HierarchicalCluster`
1283
1284    :rtype: list of :class:`HierarchicalCluster` instances
1285
1286    """
1287    order = []
1288    stack = [cluster]
1289    while stack:
1290        cluster = stack.pop(0)
1291        order.append(cluster)
1292        if cluster.branches:
1293            stack = cluster.branches + stack
1294    return order
1295
1296
1297def dendrogram_layout(cluster, expand_leaves=False):
1298    """ Return a layout of the cluster dendrogram on a 2D plane. The return
1299    value if a list of (subcluster, (start, center, end)) tuples where
1300    subcluster is an instance of :class:`HierarchicalCluster` and start,
1301    end are the two end points for the cluster branch. The list is sorted
1302    in post-order.
1303
1304    :param cluster: Cluster to layout.
1305    :type cluster: :class:`HierarchicalCluster`
1306
1307    :param expand_leaves: If True leaves will span proportional to the number
1308        of items they contain, else all leaves will be the same width.
1309
1310    :rtype: list of (:class:`HierarchicalCluster`, (start, center, end)) tuples
1311
1312    """
1313    result = []
1314    cluster_geometry = {}
1315    leaf_idx = 0
1316    for cluster in postorder(cluster):
1317        if not cluster.branches:
1318            if expand_leaves:
1319                start, end = float(cluster.first), float(cluster.last)
1320            else:
1321                start = end = leaf_idx
1322                leaf_idx += 1
1323            center = (start + end) / 2.0
1324            cluster_geometry[cluster] = (start, center, end)
1325            result.append((cluster, (start, center, end)))
1326        else:
1327            left = cluster.branches[0]
1328            right = cluster.branches[1]
1329            left_center = cluster_geometry[left][1]
1330            right_center = cluster_geometry[right][1]
1331            start, end = left_center, right_center
1332            center = (start + end) / 2.0
1333            cluster_geometry[cluster] = (start, center, end)
1334            result.append((cluster, (start, center, end)))
1335
1336    return result
1337
1338def clone(cluster):
1339    """ Clone a cluster, including it's subclusters.
1340
1341    :param cluster: Cluster to clone
1342    :type cluster: :class:`HierarchicalCluster`
1343
1344    :rtype: :class:`HierarchicalCluster`
1345
1346    """
1347    import copy
1348    clones = {}
1349    mapping = copy.copy(cluster.mapping)
1350    for node in postorder(cluster):
1351        node_clone = copy.copy(node)
1352        if node.branches:
1353            node_clone.branches = [clones[b] for b in node.branches]
1354        node_clone.mapping = mapping
1355        clones[node] = node_clone
1356
1357    return clones[cluster]
1358
1359def pruned(cluster, level=None, height=None, condition=None):
1360    """ Return a new pruned clustering instance.
1361
1362    .. note:: This uses :obj:`clone` to create a copy of the `cluster`
1363        instance.
1364
1365    :param cluster: Cluster to prune.
1366    :type cluster: :class:`HierarchicalCluster`
1367
1368    :param level: If not None prune all clusters deeper then `level`.
1369    :type level: int
1370
1371    :param height: If not None prune all clusters with height lower then
1372        `height`.
1373    :type height: float
1374
1375    :param condition: If not None condition must be a function taking a
1376        single :class:`HierarchicalCluster` instance and returning a True
1377        or False value indicating if the cluster should be pruned.
1378    :type condition: function
1379
1380    :rtype: :class:`HierarchicalCluster`
1381
1382    """
1383    cluster = clone(cluster)
1384    prune(cluster, level, height, condition)
1385    return cluster
1386
1387
1388def prune(cluster, level=None, height=None, condition=None):
1389    """ Prune the clustering instance ``cluster`` in place.
1390
1391    :param cluster: Cluster to prune.
1392    :type cluster: :class:`HierarchicalCluster`
1393
1394    :param level: If not None prune all clusters deeper then `level`.
1395    :type level: int
1396
1397    :param height: If not None prune all clusters with height lower then
1398        `height`.
1399    :type height: float
1400
1401    :param condition: If not None condition must be a function taking a
1402        single :class:`HierarchicalCluster` instance and returning a True
1403        or False value indicating if the cluster should be pruned.
1404    :type condition: function
1405
1406    """
1407    if not any(arg is not None for arg in [level, height, condition]):
1408        raise ValueError("At least one pruning argument must be supplied")
1409
1410    level_check = height_check = condition_check = lambda cl: False
1411    cluster_depth = cluster_depths(cluster)
1412
1413    if level is not None:
1414        level_check = lambda cl: cluster_depth[cl] >= level
1415
1416    if height is not None:
1417        height_check = lambda cl: cl.height <= height
1418
1419    if condition is not None:
1420        condition_check = condition
1421
1422    pruned_set = set()
1423
1424    def check_all(cl):
1425        return any([check(cl) for check in [level_check, height_check,
1426                                            condition_check]])
1427
1428    for cluster in preorder(cluster):
1429        if cluster not in pruned_set:
1430            if check_all(cluster):
1431                cluster.branches = None
1432                pruned_set.update(set(preorder(cluster)))
1433            else:
1434                pass
1435
1436
1437def cluster_depths(cluster):
1438    """ Return a dictionary mapping :class:`HierarchicalCluster` instances to
1439    their depths in the `cluster` hierarchy.
1440
1441    :param cluster: Root cluster
1442    :type cluster: :class:`HierarchicalCluster`
1443
1444    :rtype: class:`dict`
1445
1446    """
1447    depths = {}
1448    depths[cluster] = 0
1449    for cluster in preorder(cluster):
1450        cl_depth = depths[cluster]
1451        if cluster.branches:
1452            depths.update(dict.fromkeys(cluster.branches, cl_depth + 1))
1453    return depths
1454
1455instance_distance_matrix = Orange.distance.distance_matrix
1456
1457def feature_distance_matrix(data, distance=None, progress_callback=None):
1458    """ A helper function that computes an :class:`Orange.misc.SymMatrix` of
1459    all pairwise distances between features in `data`.
1460
1461    :param data: A data table
1462    :type data: :class:`Orange.data.Table`
1463
1464    :param distance: a function taking two lists and returning the distance.
1465    :type distance: function
1466
1467    :param progress_callback: A function (taking one argument) to use for
1468        reporting the on the progress.
1469    :type progress_callback: function
1470
1471    :rtype: :class:`Orange.misc.SymMatrix`
1472
1473    """
1474    attributes = data.domain.attributes
1475    matrix = Orange.misc.SymMatrix(len(attributes))
1476    iter_count = matrix.dim * (matrix.dim - 1) / 2
1477    milestones = progress_bar_milestones(iter_count, 100)
1478
1479    for count, ((i, a1), (j, a2)) in enumerate(_pairs(enumerate(attributes))):
1480        matrix[i, j] = (1.0 - orange.PearsonCorrelation(a1, a2, data, 0).r) / 2.0
1481        if progress_callback and count in milestones:
1482            progress_callback(100.0 * count / iter_count)
1483
1484    return matrix
1485
1486
1487def _pairs(seq, same = False):
1488    """ Return all pairs from elements of `seq`.
1489    """
1490    seq = list(seq)
1491    same = 0 if same else 1
1492    for i in range(len(seq)):
1493        for j in range(i + same, len(seq)):
1494            yield seq[i], seq[j]
1495
1496
1497def joining_cluster(cluster, item1, item2):
1498    """ Return the cluster where `item1` and `item2` are first joined
1499
1500    :param cluster: Clustering.
1501    :type cluster: :class:`HierarchicalCluster`
1502    :param item1: An element of `cluster.mapping` or `cluster.mapping.objects`
1503    :param item2: An element of `cluster.mapping` or `cluster.mapping.objects`
1504
1505    :rtype: :class:`HierarchicalCluster`
1506
1507    """
1508    while True:
1509        if cluster.branches:
1510            for branch in cluster.branches:
1511                if item1 in branch and item2 in branch:
1512                    cluster = branch
1513                    break
1514            else:
1515                return cluster
1516        else:
1517            return cluster
1518
1519
1520def cophenetic_distances(cluster):
1521    """ Return the cophenetic distance matrix between items in clustering.
1522    Cophenetic distance is defined as the height of the cluster where the
1523    two items are first joined.
1524
1525    :param cluster: Clustering.
1526    :type cluster: :class:`HierarchicalCluster`
1527
1528    :rtype: :class:`Orange.misc.SymMatrix`
1529
1530    """
1531
1532    mapping = cluster.mapping
1533    matrix = Orange.misc.SymMatrix(len(mapping))
1534    for cluster in postorder(cluster):
1535        if cluster.branches:
1536            for branch1, branch2 in _pairs(cluster.branches):
1537                for idx1 in mapping[branch1.first: branch1.last]:
1538                    for idx2 in mapping[branch2.first: branch2.last]:
1539                        matrix[idx1, idx2] = cluster.height
1540
1541        else:
1542            for ind1, ind2 in _pairs(mapping[cluster.first: cluster.last]):
1543                matrix[ind1, ind2] = cluster.height
1544
1545    return matrix
1546
1547
1548def cophenetic_correlation(cluster, matrix):
1549    """ Return the `cophenetic correlation coefficient
1550    <http://en.wikipedia.org/wiki/Cophenetic_correlation>`_ of the given
1551    clustering.
1552
1553    :param cluster: Clustering.
1554    :type cluster: :class:`HierarchicalCluster`
1555
1556    :param matrix: The distance matrix from which the `cluster` was
1557        derived.
1558
1559    :rtype: :class:`float`
1560
1561    """
1562    import numpy
1563    cophenetic = cophenetic_distances(cluster)
1564    cophenetic = [list(row) for row in cophenetic]
1565    original = [list(row) for row in matrix]
1566    cophenetic = numpy.ravel(cophenetic)
1567    original = numpy.ravel(original)
1568    return numpy.corrcoef(cophenetic, original)[0, 1]
1569
1570
Note: See TracBrowser for help on using the repository browser.