source: orange/Orange/classification/tree.py @ 10208:1969fb5e8558

1"""
2
3.. index:: classification tree
4
5.. index::
6   single: classification; tree
7
8*******************************
9Classification trees (``tree``)
10*******************************
11
12Orange includes multiple implementations of classification tree learners:
13a very flexible :class:`TreeLearner`, a fast :class:`SimpleTreeLearner`,
14and a :class:`C45Learner`, which uses the C4.5 tree induction
15algorithm.
16
17The following code builds a :obj:`TreeClassifier` on the Iris data set
18(with the depth limited to three levels):
19
20.. literalinclude:: code/orngTree1.py
21   :lines: 1-4
22
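A minimal sketch of an equivalent call (assuming that
:class:`TreeLearner`'s ``max_depth`` parameter limits the tree depth)::

    import Orange

    iris = Orange.data.Table("iris")
    # build a classification tree of depth at most three
    tree = Orange.classification.tree.TreeLearner(iris, max_depth=3)
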
23See `Decision tree learning
24<http://en.wikipedia.org/wiki/Decision_tree_learning>`_ on Wikipedia
25for an introduction to classification trees.
26
27======================
28Learner and Classifier
29======================
30
31.. autoclass:: TreeLearner
32    :members:
33
34.. autoclass:: TreeClassifier
35    :members:
36
37.. class:: Node
38
39    Classification trees are a hierarchy of :obj:`Node` objects.
40
41    Node stores the instances belonging to the node, a branch selector,
42    a list of branches (if the node is not a leaf) with their descriptions
43    and strengths, and a classifier.
44
45    .. attribute:: distribution
46   
47        A distribution of learning instances.
48
49    .. attribute:: contingency
50
51        Complete contingency matrices for the learning instances.
52
53    .. attribute:: instances, weightID
54
55        Learning instances and the ID of the weight meta attribute. The root
56        of the tree actually stores all instances, while other nodes
57        store only references to instances in the root node.
58
59    .. attribute:: node_classifier
60
61        A classifier for instances coming to the node. If the node is a
62        leaf, it chooses the class (or class distribution) of an instance.
63
64    Internal nodes have additional attributes. The lists :obj:`branches`,
65    :obj:`branch_descriptions` and :obj:`branch_sizes` are of the
66    same length.
67
68    .. attribute:: branches
69
70        A list of subtrees. Each element is a :obj:`Node` or None.
71        If None, the corresponding branch is empty.
72
73    .. attribute:: branch_descriptions
74
75        A list with strings describing branches. They are constructed
76        by :obj:`SplitConstructor`. A string can contain anything,
77        for example 'red' or '>12.3'.
78
79    .. attribute:: branch_sizes
80
81        The (weighted) number of training instances for
82        each branch. It can be used, for instance, for modeling
83        probabilities when classifying instances with unknown values.
84
85    .. attribute:: branch_selector
86
87        A :obj:`~Orange.classification.Classifier` that returns a branch
88        for each instance (as
89        :obj:`Orange.data.Value` in ``[0, len(branches)-1]``).  When an
90        instance cannot be classified unambiguously, the selector can
91        return a discrete distribution, which proposes how to divide
92        the instance between the branches. Whether the proposition will
93        be used depends upon the :obj:`Splitter` (for learning)
94        or :obj:`Descender` (for classification).
95
96    .. method:: tree_size()
97       
98        Return the number of nodes in the subtrees (including the node,
99        excluding null-nodes).
100
101========
102Examples
103========
104
105Tree Structure
106==============
107
108This example works with the lenses data set:
109
110    >>> import Orange
111    >>> lenses = Orange.data.Table("lenses")
112    >>> tree_classifier = Orange.classification.tree.TreeLearner(lenses)
113
114The following function counts the number of nodes in a tree:
115
116    >>> def tree_size(node):
117    ...    if not node:
118    ...        return 0
119    ...    size = 1
120    ...    if node.branch_selector:
121    ...        for branch in node.branches:
122    ...            size += tree_size(branch)
123    ...    return size
124
125If the node is None, the function above returns 0. Otherwise, the size
126is 1 (this node) plus the sizes of all subtrees. The function needs to
127check whether a node is internal (it has a :obj:`~Node.branch_selector`),
128as leaves don't have the :obj:`~Node.branches` attribute.
129
130    >>> tree_size(tree_classifier.tree)
131    15
132
133Note that a :obj:`Node` already has a built-in method
134:func:`~Node.tree_size`.
135
136Trees can be printed with a simple recursive function:
137
138    >>> def print_tree0(node, level):
139    ...     if not node:
140    ...         print " "*level + "<null node>"
141    ...         return
142    ...     if node.branch_selector:
143    ...         node_desc = node.branch_selector.class_var.name
144    ...         node_cont = node.distribution
145    ...         print "\\n" + "   "*level + "%s (%s)" % (node_desc, node_cont),
146    ...         for i in range(len(node.branches)):
147    ...             print "\\n" + "   "*level + ": %s" % node.branch_descriptions[i],
148    ...             print_tree0(node.branches[i], level+1)
149    ...     else:
150    ...         node_cont = node.distribution
151    ...         major_class = node.node_classifier.default_value
152    ...         print "--> %s (%s) " % (major_class, node_cont),
153
154The crux of the example is not in the formatting (\\n's etc.);
155what matters is everything but the print statements. The code
156separately handles three node types:
157
158* For null nodes (a node to which no learning instances were classified),
159  it just prints "<null node>".
160* For internal nodes, it prints a node description:
161  the feature's name and the distribution of classes. :obj:`Node`'s
162  branch selector is an :obj:`~Orange.classification.Classifier`,
163  and its ``class_var`` is the feature whose name is printed. Class
164  distributions are printed as well (they are assumed to be stored).
165  :obj:`print_tree0` is called recursively for each branch, with the
166  level increased by 1 to indent the subtree.
167* If the node is a leaf, it prints the distribution of learning instances
168  in the node and the class to which the instances in the node would
169  be classified. We assume that the :obj:`~Node.node_classifier` is a
170  :obj:`DefaultClassifier`. A better print function should be aware of
171  possible alternatives.
172
173The wrapper function that accepts either a
174:obj:`TreeClassifier` or a :obj:`Node` can be written as follows:
175
176    >>> def print_tree(x):
177    ...     if isinstance(x, Orange.classification.tree.TreeClassifier):
178    ...         print_tree0(x.tree, 0)
179    ...     elif isinstance(x, Orange.classification.tree.Node):
180    ...         print_tree0(x, 0)
181    ...     else:
182    ...         raise TypeError, "invalid parameter"
183
184It's straightforward: if ``x`` is a
185:obj:`TreeClassifier`, it prints ``x.tree``; if it's a :obj:`Node`, it
186prints ``x``. If it's of some other type,
187an exception is raised. The output:
188
189    >>> print_tree(tree_classifier)
190    <BLANKLINE>
191    tear_rate (<15.000, 4.000, 5.000>)
192    : normal
193       astigmatic (<3.000, 4.000, 5.000>)
194       : no
195          age (<1.000, 0.000, 5.000>)
196          : pre-presbyopic --> soft (<0.000, 0.000, 2.000>) 
197          : presbyopic
198             prescription (<1.000, 0.000, 1.000>)
199             : hypermetrope --> soft (<0.000, 0.000, 1.000>) 
200             : myope --> none (<1.000, 0.000, 0.000>) 
201          : young --> soft (<0.000, 0.000, 2.000>) 
202       : yes
203          prescription (<2.000, 4.000, 0.000>)
204          : hypermetrope
205             age (<2.000, 1.000, 0.000>)
206             : pre-presbyopic --> none (<1.000, 0.000, 0.000>) 
207             : presbyopic --> none (<1.000, 0.000, 0.000>) 
208             : young --> hard (<0.000, 1.000, 0.000>) 
209          : myope --> hard (<0.000, 3.000, 0.000>) 
210    : reduced --> none (<12.000, 0.000, 0.000>)
211
212The tree structure examples conclude with a simple pruning function,
213written entirely in Python and unrelated to any :class:`Pruner`. It limits
214the tree depth (the number of internal nodes on any path down the tree).
215For example, to get a two-level tree, call ``cut_tree(root, 2)``. The function
216is recursive, with the second argument (level) decreasing at each call;
217when zero, the current node will be made a leaf:
218
219    >>> def cut_tree(node, level):
220    ...     if node and node.branch_selector:
221    ...         if level:
222    ...             for branch in node.branches:
223    ...                 cut_tree(branch, level-1)
224    ...         else:
225    ...             node.branch_selector = None
226    ...             node.branches = None
227    ...             node.branch_descriptions = None
228
229The function acts only when :obj:`node` and :obj:`node.branch_selector`
230are defined. If the level is not zero, it recursively calls the function
231for each branch. Otherwise, it clears the selector, branches and branch
232descriptions.
233
234    >>> cut_tree(tree_classifier.tree, 2)
235    >>> print_tree(tree_classifier)
236    <BLANKLINE>
237    tear_rate (<15.000, 4.000, 5.000>)
238    : normal
239       astigmatic (<3.000, 4.000, 5.000>)
240       : no --> soft (<1.000, 0.000, 5.000>) 
241       : yes --> hard (<2.000, 4.000, 0.000>) 
242    : reduced --> none (<12.000, 0.000, 0.000>)
243
244Setting learning parameters
245===========================
246
247Let us construct a :obj:`TreeLearner` to play with:
248
249    >>> import Orange
250    >>> lenses = Orange.data.Table("lenses")
251    >>> learner = Orange.classification.tree.TreeLearner()
252
253There are three crucial components in learning: the
254:obj:`~TreeLearner.split` and :obj:`~TreeLearner.stop` criteria, and the
255example :obj:`~TreeLearner.splitter`. The default ``stop`` is set with:
256
257    >>> learner.stop = Orange.classification.tree.StopCriteria_common()
258
259The default stopping parameters are:
260
261    >>> print learner.stop.max_majority, learner.stop.min_examples
262    1.0 0.0
263
264The defaults only stop splitting when no instances are left or all of
265them are in the same class.
266
267If the minimal subset that is allowed to be split further is set to five
268instances, the resulting tree is smaller.
269
270    >>> learner.stop.min_examples = 5.0
271    >>> tree = learner(lenses)
272    >>> print tree
273    tear_rate=reduced: none (100.00%)
274    tear_rate=normal
275    |    astigmatic=no
276    |    |    age=pre-presbyopic: soft (100.00%)
277    |    |    age=presbyopic: none (50.00%)
278    |    |    age=young: soft (100.00%)
279    |    astigmatic=yes
280    |    |    prescription=hypermetrope: none (66.67%)
281    |    |    prescription=myope: hard (100.00%)
282    <BLANKLINE>
283
284We can also limit the maximal proportion of the majority class.
285
286    >>> learner.stop.max_majority = 0.5
287    >>> tree = learner(lenses)
288    >>> print tree
289    none (62.50%)
290
291Redefining tree induction components
292====================================
293
294This example shows how to use a custom stop function. First,
295``def_stop`` is set to the default stop function. The first tree adds
296some randomness: besides the default criterion, the induction also
297stops in 20% of the cases when ``def_stop`` returns False. The stopping
298criterion for the second tree is completely random: it stops induction
299in 20% of cases. Note that in the second case the lambda function still
300has three parameters, even though it needs none, since that is the
301signature required for :obj:`~TreeLearner.stop`.
302
303.. literalinclude:: code/tree3.py
304   :lines: 8-23
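
A sketch of what such code may look like (the 20% probabilities and the
``def_stop`` name follow the description above)::

    import random

    import Orange

    lenses = Orange.data.Table("lenses")
    learner = Orange.classification.tree.TreeLearner()
    def_stop = Orange.classification.tree.StopCriteria()

    # stop when the default criterion says so, or in 20% of other cases
    learner.stop = lambda instances, weight, contingency: \
        def_stop(instances, weight, contingency) or random.random() < 0.2
    tree = learner(lenses)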
305
306=================================
307Learner and Classifier Components
308=================================
309
310Split constructors
311=====================
312
313.. class:: SplitConstructor
314
315    Decide how to divide learning instances, i.e., define the branching criteria.
316   
317    The :obj:`SplitConstructor` should use the domain
318    contingency when possible, both for speed and adaptability.
319    Sometimes the domain contingency does
320    not suffice, for example when the ReliefF score is used.
321
322    A :obj:`SplitConstructor` can veto further tree induction by returning
323    no classifier. This is generally related to the number of learning
324    instances that would go in each branch. If there are no splits with
325    more than :obj:`SplitConstructor.min_subset` instances in the branches
326    (null nodes are allowed), the induction is stopped.
327
328    Split constructors that cannot handle a particular feature
329    type (discrete, continuous) quietly skip them. When in doubt, use
330    :obj:`SplitConstructor_Combined`, which delegates features to
331    specialized split constructors.
332
333    The same split constructors can be used both for classification and
334    regression, if the chosen score (for :obj:`SplitConstructor_Score`
335    and derived classes) supports both.
336
337    .. attribute:: min_subset
338
339        The minimal (weighted) number of instances in non-null leaves.
340
341    .. method:: __call__(data, [weightID, contingency, apriori_distribution, candidates, clsfr])
342
343        :param data: in any acceptable form.
344        :param weightID: Optional; the default of 0 means that all
345            instances have a weight of 1.0.
346        :param contingency: a domain contingency
347        :param apriori_distribution: apriori class probabilities.
348        :type apriori_distribution: :obj:`Orange.statistics.distribution.Distribution`
349        :param candidates: only consider these
350            features (one boolean for each feature).
351        :param clsfr: a node classifier (if it was constructed, that is,
352            if :obj:`store_node_classifier` is True)
353
354        Construct a split. Return a tuple (:obj:`branch_selector`,
355        :obj:`branch_descriptions` (a list), :obj:`subset_sizes`
356        (the number of instances for each branch, may also be
357        empty), :obj:`quality` (higher numbers mean better splits),
358        :obj:`spent_feature`). If no split is constructed,
359        the :obj:`branch_selector`, :obj:`branch_descriptions` and
360        :obj:`subset_sizes` are None, while :obj:`quality` is 0.0 and
361        :obj:`spent_feature` is -1.
362
363        If the chosen feature will be useless in the future and
364        should not be considered for splitting in any of the subtrees
365        (typically, when discrete features are used as-they-are, without
366        any binarization or subsetting), then it should return the index
367        of this feature as :obj:`spent_feature`. If no features are spent,
368        :obj:`spent_feature` is -1.
369
370.. class:: SplitConstructor_Score
371
372    Bases: :class:`SplitConstructor`
373
374    An abstract base class that compares splits
375    with a :class:`Orange.feature.scoring.Score`.  All split
376    constructors except for :obj:`SplitConstructor_Combined` are derived
377    from this class.
378
379    .. attribute:: measure
380
381        A :class:`Orange.feature.scoring.Score` for split evaluation. It
382        has to handle the class type - for example, you cannot use
383        :class:`~Orange.feature.scoring.GainRatio` for regression or
384        :class:`~Orange.feature.scoring.MSE` for classification.
385
386    .. attribute:: worst_acceptable
387
388        The lowest allowed split quality.  The value strongly depends
389        on the chosen :obj:`measure` component. Default is 0.0.
390
391.. class:: SplitConstructor_Feature
392
393    Bases: :class:`SplitConstructor_Score`
394
395    Each value of a discrete feature corresponds to a branch. The feature
396    with the highest score (:obj:`~SplitConstructor_Score.measure`) is
397    selected; ties are broken randomly.
398
399    The constructed :obj:`branch_selector` is an instance of
400    :obj:`orange.ClassifierFromVarFD`, which returns the value of the selected
401    feature. :obj:`branch_descriptions` contains the feature's
402    values. The feature is marked as spent (it cannot reappear in the
403    node's subtrees).
404
405.. class:: SplitConstructor_ExhaustiveBinary
406
407    Bases: :class:`SplitConstructor_Score`
408
409    Finds the binarization with the highest score among all features. In
410    case of ties, a random feature is selected.
411
412    The constructed :obj:`branch_selector` is an instance of
413    :obj:`orange.ClassifierFromVarFD`, which returns the value of the
414    selected feature. Its :obj:`transformer` contains a ``MapIntValue``
415    that maps values of the feature into a binary feature. Branches
416    with a single feature value are described with that value and
417    branches with more than one are described with ``[<val1>, <val2>,
418    ..., <valn>]``. Only binary features are marked as spent.
419
420.. class:: SplitConstructor_Threshold
421
422    Bases: :class:`SplitConstructor_Score`
423
424    The only split constructor for continuous features. It divides the
425    range of feature values with a threshold that maximizes the split's
426    quality. The feature that yields the best binary split is returned;
427    in case of ties, a random feature is selected.
428
429    The constructed :obj:`branch_selector` is an instance of
430    :obj:`orange.ClassifierFromVarFD` with an attached :obj:`transformer`,
431    of type :obj:`Orange.feature.discretization.ThresholdDiscretizer`. The
432    branch descriptions are "<threshold" and ">=threshold". The feature
433    is not spent.
434
435.. class:: SplitConstructor_OneAgainstOthers
436   
437    Bases: :class:`SplitConstructor_Score`
438
439    Undocumented.
440
441.. class:: SplitConstructor_Combined
442
443    Bases: :class:`SplitConstructor`
444
445    Uses different split constructors for discrete and continuous
446    features. Each split constructor is called with appropriate
447    features. Both construct a candidate for a split; the better of them
448    is used.
449
450    The choice of the split is not probabilistically fair, when
451    multiple candidates have the same score. For example, if there
452    are nine discrete features with the highest score the split
453    constructor for discrete features will select one of them. Now,
454    if there is also a single continuous feature with the same score,
455    :obj:`SplitConstructor_Combined` would randomly select between the
456    proposed discrete feature and continuous feature, unaware that the
457    discrete feature has already competed with eight others.  So,
458    the probability for selecting (each) discrete feature would be
459    1/18 instead of 1/10. Although incorrect, this should not affect
460    the performance.
461
462    .. attribute:: discrete_split_constructor
463
464        Split constructor for discrete features;
465        for instance, :obj:`SplitConstructor_Feature` or
466        :obj:`SplitConstructor_ExhaustiveBinary`.
467
468    .. attribute:: continuous_split_constructor
469
470        Split constructor for continuous features; it
471        can be either :obj:`SplitConstructor_Threshold` or
472        a custom split constructor.
473
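As a sketch, the components can also be assembled by hand, using the
class and attribute names documented above::

    import Orange

    learner = Orange.classification.tree.TreeLearner()
    split = Orange.classification.tree.SplitConstructor_Combined()
    # binarized splits for discrete features, thresholds for continuous ones
    split.discrete_split_constructor = \
        Orange.classification.tree.SplitConstructor_ExhaustiveBinary()
    split.continuous_split_constructor = \
        Orange.classification.tree.SplitConstructor_Threshold()
    learner.split = split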
474
475StopCriteria and StopCriteria_common
476============================================
477
478:obj:`StopCriteria` determines when to stop the induction of subtrees.
479
480.. class:: StopCriteria
481
482    Provides the basic stopping criteria: the tree induction stops
483    when there is at most one instance left (the actual, not weighted,
484    number). The induction also stops when all instances are in the
485    same class (for discrete problems) or have the same outcome value
486    (for regression problems).
487
488    .. method:: __call__(instances[, weightID, domain contingencies])
489
490        Return True (stop) or False (continue the induction).
491        This base class does not use the contingencies for counting;
492        derived classes should use them whenever possible.
493
494.. class:: StopCriteria_common
495
496    Pre-pruning with additional criteria.
497
498    .. attribute:: max_majority
499
500        Maximum proportion of the majority class. When exceeded,
501        induction stops.
502
503    .. attribute:: min_instances
504
505        Minimum number of instances for splitting. Subsets with fewer
506        than :obj:`min_instances` instances are not split further.
507        The sample count is weighted.
508
509
510Splitters
511=================
512
513Splitters sort learning instances into branches (the branches are selected
514with a :obj:`SplitConstructor`, while a :obj:`Descender` decides the
515branch for an instance during classification).
516
517Most splitters call :obj:`Node.branch_selector` and assign
518instances correspondingly. When the value is unknown they choose a
519particular branch or skip the instance.
520
521Some splitters can also split instances: a weighted instance is
522used in more than one subset. Each branch has a weight ID (usually,
523each branch has its own ID) and all instances in that branch carry this meta attribute.
524
525An instance that
526hasn't been split has only one additional attribute (the weight
527ID corresponding to the subset it went to). An instance that is split
528between, say, three subsets has three new meta attributes, one for each
529subset. The weights are used only when needed; when there is no
530splitting, no weight IDs are returned.
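
For instance, a learner can be told to route ambiguous instances into a
separate branch (a sketch, using the :obj:`~TreeLearner.splitter`
component mentioned earlier)::

    import Orange

    learner = Orange.classification.tree.TreeLearner()
    # instances with unknown values get their own "unknown" branch
    learner.splitter = Orange.classification.tree.Splitter_UnknownsToBranch()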
531
532.. class:: Splitter
533
534    An abstract base class that splits instances
535    into subsets.
536
537    .. method:: __call__(node, instances[, weightID])
538
539        :param node: a node.
540        :type node: :obj:`Node`
541        :param instances: a set of instances
542        :param weightID: weight ID.
543       
544        Use the information in :obj:`Node` (particularly the
545        :obj:`~Node.branch_selector`) to split the given set of instances into
546        subsets.  Return a tuple with a list of instance subsets and
547        a list of weights.  The list of weights is either a
548        list of integers or None when no weights are added.
549
550.. class:: Splitter_IgnoreUnknowns
551
552    Bases: :class:`Splitter`
553
554    Ignores the instances for which no single branch can be determined.
555
556.. class:: Splitter_UnknownsToCommon
557
558    Bases: :class:`Splitter`
559
560    Places all ambiguous instances into the branch with the highest number of
561    instances. If there is more than one such branch, one is selected at
562    random and then used for all instances.
563
564.. class:: Splitter_UnknownsToAll
565
566    Bases: :class:`Splitter`
567
568    Splits instances with an unknown value of the feature into all branches.
569
570.. class:: Splitter_UnknownsToRandom
571
572    Bases: :class:`Splitter`
573
574    Selects a random branch for ambiguous instances.
575
576.. class:: Splitter_UnknownsToBranch
577
578    Bases: :class:`Splitter`
579
580    Constructs an additional branch for ambiguous instances.
581    The branch's description is "unknown".
582
583.. class:: Splitter_UnknownsAsBranchSizes
584
585    Bases: :class:`Splitter`
586
587    Splits instances with an unknown value of the feature according to
588    the proportions of instances in each branch.
589
590.. class:: Splitter_UnknownsAsSelector
591
592    Bases: :class:`Splitter`
593
594    Splits instances with an unknown value of the feature according to
595    the distribution proposed by the selector (usually the same as the
596    proportions of instances in the branches).
597
598Descenders
599=============================
600
601Descenders decide where instances that cannot be unambiguously
602put in a single branch should go during classification (the branches are
603selected with a :obj:`SplitConstructor`, while a :obj:`Splitter` sorts
604instances during learning).
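
A descender is plugged into the learner like the other components (a
sketch, assuming the learner exposes a ``descender`` attribute)::

    import Orange

    learner = Orange.classification.tree.TreeLearner()
    # let subtrees vote, weighted by branch sizes, when an instance
    # cannot be sent down a single branch
    learner.descender = \
        Orange.classification.tree.Descender_UnknownMergeAsBranchSizes()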
605
606.. class:: Descender
607
608    An abstract base tree descender. It descends
609    an instance as deep as possible, according to the values
610    of the instance's features. The :obj:`Descender` calls the node's
611    :obj:`~Node.branch_selector` to get the branch index. If it's a
612    simple index, the corresponding branch is followed. If not, the
613    descender decides what to do. A descender can choose a single
614    branch (for instance, the one most recommended by the
615    :obj:`~Node.branch_selector`) or it can let the branches vote.
616
617    There are three possible outcomes of a descent:
618
619    #. The descender reaches a leaf. This happens when
620       there were no unknown or out-of-range values, or when the
621       descender selected a single branch and continued the descent
622       despite them. The descender returns the :obj:`Node` it has reached.
623    #. Node's :obj:`~Node.branch_selector` returned a distribution and
624       :obj:`Descender` decided to stop the descent at this (internal)
625       node. It returns the current :obj:`Node`.
626    #. Node's :obj:`~Node.branch_selector` returned a distribution and the
627       :obj:`Node` wants to split the instance (i.e., to decide the class
628       by voting). It returns a :obj:`Node` and the vote-weights for
629       the branches.  The weights can correspond, for example,  to the
630       distribution returned by node's :obj:`~Node.branch_selector`, or to
631       the number of learning instances that were assigned to each branch.
632
633    .. method:: __call__(node, instance)
634
635        Descends until it reaches a leaf or a node in
636        which a vote of subtrees is required. A tuple
637        of two elements is returned. If it reached a leaf, the tuple contains
638        the leaf node and None. If not, it contains a node and
639        a list of floats (weights of votes).
640
641.. class:: Descender_UnknownToNode
642
643    Bases: :obj:`Descender`
644
645    When an instance cannot be classified into a single branch, the current
646    node is returned. Thus, the node's :obj:`~Node.node_classifier`
647    will be used to make a decision. Therefore, internal nodes
648    need to have :obj:`Node.node_classifier` defined.
649
650.. class:: Descender_UnknownToBranch
651
652    Bases: :obj:`Descender`
653
654    Classifies instances with unknown value to a special branch. This
655    makes sense only if the tree itself was constructed with
656    :obj:`Splitter_UnknownsToBranch`.
657
658.. class:: Descender_UnknownToCommonBranch
659
660    Bases: :obj:`Descender`
661
662    Classifies instances with unknown values to the branch with the
663    highest number of instances. If there is more than one such branch,
664    a random branch is chosen for each instance.
665
666.. class:: Descender_UnknownToCommonSelector
667
668    Bases: :obj:`Descender`
669
670    Classifies instances with unknown values to the branch which received
671    the highest recommendation by the selector.
672
673.. class:: Descender_UnknownMergeAsBranchSizes
674
675    Bases: :obj:`Descender`
676
677    The subtrees vote for the instance's class; the vote is weighted
678    according to the sizes of the branches.
679
680.. class:: Descender_UnknownMergeAsSelector
681
682    Bases: :obj:`Descender`
683
684    The subtrees vote for the instance's class; the vote is weighted
685    according to the selector's proposal.
686
687Pruning
688=======
689
690.. index::
691    pair: classification trees; pruning
692
693The pruners construct a shallow copy of a tree. The pruned tree's
694nodes (:obj:`Node`) contain references to the same contingency matrices,
695node classifiers, branch selectors, and so on, as the original tree.
696
697Pruners cannot construct a new :obj:`~Node.node_classifier`.  Thus, for
698pruning, internal nodes must have :obj:`~Node.node_classifier` defined
699(the default).
700
701.. class:: Pruner
702
703    An abstract base tree pruner.
704
705    .. method:: __call__(tree)
706
707        :param tree: either
708            a :obj:`Node` or the C++ version of the classifier,
709            saved in :obj:`TreeClassifier.base_classifier`.
710
711        The resulting pruned tree is of the same type as the argument.
712        The original tree remains intact.
713
714.. class:: Pruner_SameMajority
715
716    Bases: :class:`Pruner`
717
718    A tree can have subtrees in which all the leaves have
719    the same majority class. This is allowed because leaves can still
720    have different class distributions and thus predict different
721    probabilities.  The :obj:`Pruner_SameMajority` prunes the tree so
722    that there is no subtree in which all the nodes would have the same
723    majority class.
724
725    This pruner will only prune the nodes in which the node classifier
726    is a :obj:`~Orange.classification.ConstantClassifier`
727    (or a derived class).
728
729    The pruning works from the leaves to the root.
730    If siblings have (at least one) common majority class, they can be pruned.
731
732.. class:: Pruner_m
733
734    Bases: :class:`Pruner`
735
736    Prunes a tree by comparing m-estimates of static and dynamic
737    error as defined in (Bratko, 2002).
738
739    .. attribute:: m
740
741        Parameter m for m-estimation.
742
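A sketch of m-pruning (``m`` as documented above; the pruner returns a
pruned shallow copy and leaves the original tree intact)::

    import Orange

    lenses = Orange.data.Table("lenses")
    tree = Orange.classification.tree.TreeLearner(lenses)
    pruner = Orange.classification.tree.Pruner_m(m=2.0)
    pruned_root = pruner(tree.tree)  # prune from the root Node
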
743Printing the tree
744=================
745
746The tree printing functions are very flexible. They can print, for
747example, numbers of instances, proportions of majority class in nodes
748and similar, or more complex statistics like the proportion of instances
749in a particular class divided by the proportion of instances of this
750class in a parent node. Users may also pass their own functions to print
751certain elements.
752
753The easiest way to print the tree is to print the :obj:`TreeClassifier`::
754
755    >>> print tree
756    petal width<0.800: Iris-setosa (100.00%)
757    petal width>=0.800
758    |    petal width<1.750
759    |    |    petal length<5.350: Iris-versicolor (94.23%)
760    |    |    petal length>=5.350: Iris-virginica (100.00%)
761    |    petal width>=1.750
762    |    |    petal length<4.850: Iris-virginica (66.67%)
763    |    |    petal length>=4.850: Iris-virginica (100.00%)
764
765
766Format string
767-------------
768
769Format strings are printed at every leaf or internal node with certain
770format specifiers replaced by data from the tree node. Specifiers are
771generally of the form **%[^]<precision><quantity><divisor>**.
772
773**^** at the start indicates that the number should be multiplied by 100,
774which is useful for printing proportions as percentages.
775
776**<precision>** is in the same format as in Python (or C) string
777formatting. For instance, ``%N`` denotes the number of instances in
778the node, hence ``%6.2N`` would mean output to two decimal digits
779and six places altogether. If left out, a default format ``5.3`` is
780used, unless the numbers are multiplied by 100, in which case the default
781is ``.0`` (no decimals, the number is rounded to the nearest integer).
782
783**<divisor>** tells what to divide the quantity in that node with.
784``bP`` means division by the same quantity in the parent node; for instance,
785``%NbP`` gives the number of instances in the node divided by the
786number of instances in parent node. Precision formatting can be added,
787e.g. ``%6.2NbP``. ``bA`` denotes division by the same quantity over the entire
788data set, so ``%NbA`` will tell you the proportion of instances (out
789of the entire training data set) that fell into that node. If division is
790impossible since the parent node does not exist or some data is missing,
791a dot is printed out instead.
792
793**<quantity>** defines what to print and is the only required element.
794It can be:
795
796``V``
797    The predicted value at that node. Precision
798    or divisor cannot be defined here.
799
800``N``
801    The number of instances in the node.
802
803``M``
804    The number of instances in the majority class (that is, the class
805    predicted by the node).
806
807``m``
808    The proportion of instances in the majority class.
809
810``A``
811    The average class for instances in the node; this is available only for
812    regression trees.
813
814``E``
815    Standard error for the class of instances in the node; available only for
816    regression trees.
817
818``I``
819    Print out the confidence interval. The modifier is used as
820    ``%I(95)`` or (more complicated) ``%5.3I(95)bP``.
821
822``C``
823    The number of instances in the given class. For a classification
824    example, ``%5.3C="Iris-virginica"bP`` denotes the number of instances
825    of Iris-virginica divided by the number of instances of this class in
826    the parent node (instances that are *not* Iris-virginica could be
827    described with ``%5.3CbP!="Iris-virginica"``).
828
829    For regression trees, use operators =, !=, <, <=, >, and >=, as in
830    ``%C<22``, with optional precision and divisor. Intervals are also
831    possible: ``%C[20, 22]`` gives the number of instances between
832    20 and 22 (inclusive) and ``%C(20, 22)`` gives the number of such
833    instances excluding the boundaries. Mixing of parentheses is allowed,
834    e.g. ``%C(20, 22]``.  Add ``!`` for instances outside the interval,
835    like ``%C!(20, 22]``.
836
837``c``
838    Same as ``C``, except that it computes the proportion of the class
839    instead of the number of instances.
840
841``D``
842    The number of instances in each class. Precision and the divisor
843    are applied to each number in the distribution. This quantity
844    cannot be computed for regression trees.
845
846``d``
847    Same as ``D``, except that it shows proportions of instances.
848
849<user defined formats>
850    Instructions and examples for adding formats are at the end of this
851    section.
852
853.. rubric:: Examples on classification trees
854
855A tree on the iris data set with the depth limited to three
856levels is built as follows:
857   
858.. literalinclude:: code/orngTree1.py
859   :lines: 1-4
860
861Printing the predicted class at each node, together with the number
862of instances in the majority class and the total number of instances in
863the node, requires a custom format string::
864
865    >>> print tree.to_string(leaf_str="%V (%M out of %N)")
866    petal width<0.800: Iris-setosa (50.000 out of 50.000)
867    petal width>=0.800
868    |    petal width<1.750
869    |    |    petal length<5.350: Iris-versicolor (49.000 out of 52.000)
870    |    |    petal length>=5.350: Iris-virginica (2.000 out of 2.000)
871    |    petal width>=1.750
872    |    |    petal length<4.850: Iris-virginica (2.000 out of 3.000)
873    |    |    petal length>=4.850: Iris-virginica (43.000 out of 43.000)
874
875The number of instances as
876compared to the entire data set and to the parent node::
877
878    >>> print tree.to_string(leaf_str="%V (%^MbA%, %^MbP%)")
879    petal width<0.800: Iris-setosa (100%, 100%)
880    petal width>=0.800
881    |    petal width<1.750
882    |    |    petal length<5.350: Iris-versicolor (98%, 100%)
883    |    |    petal length>=5.350: Iris-virginica (4%, 40%)
884    |    petal width>=1.750
885    |    |    petal length<4.850: Iris-virginica (4%, 4%)
886    |    |    petal length>=4.850: Iris-virginica (86%, 96%)
887
888``%M`` is the number of instances in the majority class. Dividing by
889the number of all instances from this class on the entire data set
890is described with ``%MbA``. Add ``^`` in front for multiplication by
891100. The percent sign *after* that is printed out literally, just as the
892comma and parentheses. For the proportion of this class in the parent,
893``bA`` is replaced with ``bP``.
894
895To print the number of versicolors in each node, together with the
896proportion of versicolors among the instances in this particular node
897and among all versicolors, use the following::
898
899    '%C="Iris-versicolor" (%^c="Iris-versicolor"% of node, %^CbA="Iris-versicolor"% of versicolors)
900
901It gives::
902
903    petal width<0.800: 0.000 (0% of node, 0% of versicolors)
904    petal width>=0.800
905    |    petal width<1.750
906    |    |    petal length<5.350: 49.000 (94% of node, 98% of versicolors)
907    |    |    petal length>=5.350: 0.000 (0% of node, 0% of versicolors)
908    |    petal width>=1.750
909    |    |    petal length<4.850: 1.000 (33% of node, 2% of versicolors)
910    |    |    petal length>=4.850: 0.000 (0% of node, 0% of versicolors)
911
912Finally, to print the distributions using a format string ``%D``::
913
914    petal width<0.800: [50.000, 0.000, 0.000]
915    petal width>=0.800
916    |    petal width<1.750
917    |    |    petal length<5.350: [0.000, 49.000, 3.000]
918    |    |    petal length>=5.350: [0.000, 0.000, 2.000]
919    |    petal width>=1.750
920    |    |    petal length<4.850: [0.000, 1.000, 2.000]
921    |    |    petal length>=4.850: [0.000, 0.000, 43.000]
922
923As the order of classes is the same as in ``data.domain.class_var.values``
924(setosa, versicolor, virginica), there are 49 versicolors and 3 virginicas
925in the node at ``petal length<5.350``. To print the proportions within
926nodes rounded to two decimals, use ``%.2d``::
927
928    petal width<0.800: [1.00, 0.00, 0.00]
929    petal width>=0.800
930    |    petal width<1.750
931    |    |    petal length<5.350: [0.00, 0.94, 0.06]
932    |    |    petal length>=5.350: [0.00, 0.00, 1.00]
933    |    petal width>=1.750
934    |    |    petal length<4.850: [0.00, 0.33, 0.67]
935    |    |    petal length>=4.850: [0.00, 0.00, 1.00]
936
937The most trivial format string for internal nodes just prints
938node predictions. ``.`` in the following example specifies
939that ``node_str`` should be the same as ``leaf_str``.
940
941::
942
943    tree.to_string(leaf_str="%V", node_str=".")
944 
945The output::
946
947    root: Iris-setosa
948    |    petal width<0.800: Iris-setosa
949    |    petal width>=0.800: Iris-versicolor
950    |    |    petal width<1.750: Iris-versicolor
951    |    |    |    petal length<5.350: Iris-versicolor
952    |    |    |    petal length>=5.350: Iris-virginica
953    |    |    petal width>=1.750: Iris-virginica
954    |    |    |    petal length<4.850: Iris-virginica
955    |    |    |    petal length>=4.850: Iris-virginica
956
957A node *root* has appeared and the tree looks one level
958deeper. This is needed to also print the data for the tree root.
959
960To observe how the number
961of virginicas decreases down the tree try::
962
963    print tree.to_string(leaf_str='%^.1CbA="Iris-virginica"% (%^.1CbP="Iris-virginica"%)', node_str='.')
964
965Interpretation: ``CbA="Iris-virginica"`` is
966the number of virginica instances, divided by the total number
967of instances in this class. Add ``^.1`` and the result will be
968multiplied by 100 and printed with one decimal. The trailing ``%`` is
969printed out literally. In parentheses, the same quantity is divided by
970the instances in the parent node. Single quotes enclose the format
971string so that double quotes inside it can specify the class.
972
973::
974
975    root: 100.0% (.%)
976    |    petal width<0.800: 0.0% (0.0%)
977    |    petal width>=0.800: 100.0% (100.0%)
978    |    |    petal width<1.750: 10.0% (10.0%)
979    |    |    |    petal length<5.350: 6.0% (60.0%)
980    |    |    |    petal length>=5.350: 4.0% (40.0%)
981    |    |    petal width>=1.750: 90.0% (90.0%)
982    |    |    |    petal length<4.850: 4.0% (4.4%)
983    |    |    |    petal length>=4.850: 86.0% (95.6%)
984
985If :meth:`~TreeClassifier.to_string` cannot compute something, in this case
986because the root has no parent, it prints out a dot.
987
988The final example with classification trees prints the distributions in
989nodes, the distribution compared to the parent, the proportions compared
990to the parent and the predicted class in the leaves::
991
992    >>> print tree.to_string(leaf_str='"%V   %D %.2DbP %.2dbP"', node_str='"%D %.2DbP %.2dbP"')
993    root: [50.000, 50.000, 50.000] . .
994    |    petal width<0.800: [50.000, 0.000, 0.000] [1.00, 0.00, 0.00] [3.00, 0.00, 0.00]:
995    |        Iris-setosa   [50.000, 0.000, 0.000] [1.00, 0.00, 0.00] [3.00, 0.00, 0.00]
996    |    petal width>=0.800: [0.000, 50.000, 50.000] [0.00, 1.00, 1.00] [0.00, 1.50, 1.50]
997    |    |    petal width<1.750: [0.000, 49.000, 5.000] [0.00, 0.98, 0.10] [0.00, 1.81, 0.19]
998    |    |    |    petal length<5.350: [0.000, 49.000, 3.000] [0.00, 1.00, 0.60] [0.00, 1.04, 0.62]:
999    |    |    |        Iris-versicolor   [0.000, 49.000, 3.000] [0.00, 1.00, 0.60] [0.00, 1.04, 0.62]
1000    |    |    |    petal length>=5.350: [0.000, 0.000, 2.000] [0.00, 0.00, 0.40] [0.00, 0.00, 10.80]:
1001    |    |    |        Iris-virginica   [0.000, 0.000, 2.000] [0.00, 0.00, 0.40] [0.00, 0.00, 10.80]
1002    |    |    petal width>=1.750: [0.000, 1.000, 45.000] [0.00, 0.02, 0.90] [0.00, 0.04, 1.96]
1003    |    |    |    petal length<4.850: [0.000, 1.000, 2.000] [0.00, 1.00, 0.04] [0.00, 15.33, 0.68]:
1004    |    |    |        Iris-virginica   [0.000, 1.000, 2.000] [0.00, 1.00, 0.04] [0.00, 15.33, 0.68]
1005    |    |    |    petal length>=4.850: [0.000, 0.000, 43.000] [0.00, 0.00, 0.96] [0.00, 0.00, 1.02]:
1006    |    |    |        Iris-virginica   [0.000, 0.000, 43.000] [0.00, 0.00, 0.96] [0.00, 0.00, 1.02]
1007
1008
1009.. rubric:: Examples on regression trees
1010
1011The regression tree examples use a tree induced from the housing data
1012set. Without other arguments, :meth:`TreeClassifier.to_string` prints the
1013following::
1014
1015    RM<6.941
1016    |    LSTAT<14.400
1017    |    |    DIS<1.385: 45.6
1018    |    |    DIS>=1.385: 22.9
1019    |    LSTAT>=14.400
1020    |    |    CRIM<6.992: 17.1
1021    |    |    CRIM>=6.992: 12.0
1022    RM>=6.941
1023    |    RM<7.437
1024    |    |    CRIM<7.393: 33.3
1025    |    |    CRIM>=7.393: 14.4
1026    |    RM>=7.437
1027    |    |    TAX<534.500: 45.9
1028    |    |    TAX>=534.500: 21.9
1029
1030To add the standard error in both internal nodes and leaves, and
1031the 90% confidence intervals in the leaves, use::
1032
1033    >>> print tree.to_string(leaf_str="[SE: %E]\t %V %I(90)", node_str="[SE: %E]")
1034    root: [SE: 0.409]
1035    |    RM<6.941: [SE: 0.306]
1036    |    |    LSTAT<14.400: [SE: 0.320]
1037    |    |    |    DIS<1.385: [SE: 4.420]:
1038    |    |    |        [SE: 4.420]   45.6 [38.331-52.829]
1039    |    |    |    DIS>=1.385: [SE: 0.244]:
1040    |    |    |        [SE: 0.244]   22.9 [22.504-23.306]
1041    |    |    LSTAT>=14.400: [SE: 0.333]
1042    |    |    |    CRIM<6.992: [SE: 0.338]:
1043    |    |    |        [SE: 0.338]   17.1 [16.584-17.691]
1044    |    |    |    CRIM>=6.992: [SE: 0.448]:
1045    |    |    |        [SE: 0.448]   12.0 [11.243-12.714]
1046    |    RM>=6.941: [SE: 1.031]
1047    |    |    RM<7.437: [SE: 0.958]
1048    |    |    |    CRIM<7.393: [SE: 0.692]:
1049    |    |    |        [SE: 0.692]   33.3 [32.214-34.484]
1050    |    |    |    CRIM>=7.393: [SE: 2.157]:
1051    |    |    |        [SE: 2.157]   14.4 [10.862-17.938]
1052    |    |    RM>=7.437: [SE: 1.124]
1053    |    |    |    TAX<534.500: [SE: 0.817]:
1054    |    |    |        [SE: 0.817]   45.9 [44.556-47.237]
1055    |    |    |    TAX>=534.500: [SE: 0.000]:
1056    |    |    |        [SE: 0.000]   21.9 [21.900-21.900]
1057
1058The predicted value (``%V``) and the average (``%A``) may differ because
1059a regression tree does not always predict the leaf average, but whatever
1060the :obj:`~Node.node_classifier` in a leaf returns. As ``%V`` uses
1061:obj:`Orange.feature.Continuous`'s function for printing the
1062value, the number has the same number of decimals as in the data file.
1063
1064Regression trees cannot print the distributions in the same way
1065as classification trees. They instead offer a set of operators for
1066observing the number of instances within a certain range. For instance,
1067to print the number of instances with values below 22 and compare
1068it with values in the parent nodes, use::
1069
1070    >>> print tree.to_string(leaf_str="%C<22 (%cbP<22)", node_str=".")
1071    root: 277.000 (.)
1072    |    RM<6.941: 273.000 (1.160)
1073    |    |    LSTAT<14.400: 107.000 (0.661)
1074    |    |    |    DIS<1.385: 0.000 (0.000)
1075    |    |    |    DIS>=1.385: 107.000 (1.020)
1076    |    |    LSTAT>=14.400: 166.000 (1.494)
1077    |    |    |    CRIM<6.992: 93.000 (0.971)
1078    |    |    |    CRIM>=6.992: 73.000 (1.040)
1079    |    RM>=6.941: 4.000 (0.096)
1080    |    |    RM<7.437: 3.000 (1.239)
1081    |    |    |    CRIM<7.393: 0.000 (0.000)
1082    |    |    |    CRIM>=7.393: 3.000 (15.333)
1083    |    |    RM>=7.437: 1.000 (0.633)
1084    |    |    |    TAX<534.500: 0.000 (0.000)
1085    |    |    |    TAX>=534.500: 1.000 (30.000)
1086
1087The last line, for instance, says that among the instances with tax
1088above 534, the number of instances with the class below 22 is 30 times
1089higher than the number of such instances in its parent node.
1090
1091To count the same for all instances *outside* the
1092interval [20, 22] and print out the proportions as percentages, use::
1093
1094    >>> print tree.to_string(leaf_str="%C![20,22] (%^cbP![20,22]%)", node_str=".")
1095
1096The format string ``%c![20,22]`` denotes the proportion of instances
1097(within the node) whose values are below 20 or above 22.
1098``%cbP![20,22]`` divides it by the same statistic computed on the
1099parent. A ``^`` is added to print percentages.
1100
1101::
1102
1103    root: 439.000 (.%)
1104    |    RM<6.941: 364.000 (98%)
1105    |    |    LSTAT<14.400: 200.000 (93%)
1106    |    |    |    DIS<1.385: 5.000 (127%)
1107    |    |    |    DIS>=1.385: 195.000 (99%)
1108    |    |    LSTAT>=14.400: 164.000 (111%)
1109    |    |    |    CRIM<6.992: 91.000 (96%)
1110    |    |    |    CRIM>=6.992: 73.000 (105%)
1111    |    RM>=6.941: 75.000 (114%)
1112    |    |    RM<7.437: 46.000 (101%)
1113    |    |    |    CRIM<7.393: 43.000 (100%)
1114    |    |    |    CRIM>=7.393: 3.000 (100%)
1115    |    |    RM>=7.437: 29.000 (98%)
1116    |    |    |    TAX<534.500: 29.000 (103%)
1117    |    |    |    TAX>=534.500: 0.000 (0%)
1118
1119
1120Defining custom printouts
1121-------------------------
1122
1123:meth:`TreeClassifier.to_string`'s argument :obj:`user_formats` can be used to
1124print other information. :obj:`user_formats` should
1125contain a list of tuples with a regular expression and a function to be
1126called when that expression is found in the format string. Expressions
1127from :obj:`user_formats` are checked before the built-in expressions
1128discussed above.
1129
1130The regular expression should describe a string like those used above,
1131for instance ``%.2DbP``. When a leaf or internal node
1132is printed, the format string (:obj:`leaf_str` or :obj:`node_str`)
1133is checked for these regular expressions and when the match is found,
1134the corresponding callback function is called.
1135
1136The passed function will get five arguments: the format string
1137(:obj:`leaf_str` or :obj:`node_str`), the match object, the node which is
1138being printed, its parent (can be None) and the tree classifier.
1139The function should return the format string in which the part described
1140by the match object (that is, the part that is matched by the regular
1141expression) is replaced by whatever information your callback function
1142is supposed to give.
1143
1144The function can use several utility functions provided in the module.
1145
1146.. autofunction:: insert_str
1147
1148.. autofunction:: insert_dot
1149
1150.. autofunction:: insert_num
1151
1152.. autofunction:: by_whom
1153
1154The module also includes reusable regular expressions:
1155
1156.. autodata:: fs
1157
1158.. autodata:: by
1159
1160For a trivial example, ``%V`` is implemented with the
1161following tuple::
1162
1163    (re.compile("%V"), replaceV)
1164
1165And ``replaceV`` is defined by::
1166
1167    def replaceV(strg, mo, node, parent, tree):
1168        return insert_str(strg, mo, str(node.node_classifier.default_value))
1169
1170``replaceV`` takes the value predicted at the node
1171(``node.node_classifier.default_value``), converts it to a string
1172and passes it to :func:`insert_str`.
1173
1174A more complex regular expression is the one for the proportion of
1175majority class, defined as ``"%"+fs+"M"+by``. It uses the two partial
1176expressions defined above (:obj:`fs` and :obj:`by`).
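
As another sketch, a hypothetical ``%B`` specifier that prints the
number of branches of a node could be registered like this (using
:func:`insert_str` from this module)::

    import re

    from Orange.classification.tree import insert_str

    def replace_branches(strg, mo, node, parent, tree):
        # hypothetical "%B": the number of branches (0 at leaves)
        n = len(node.branches) if node.branch_selector else 0
        return insert_str(strg, mo, str(n))

    my_format = [(re.compile("%B"), replace_branches)]
    print tree.to_string(node_str="%V (%B branches)",
                         user_formats=my_format)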
1177
1178The following code prints the classification margin for each node,
1179that is, the difference between the proportion of the largest and the
1180second largest class in the node:
1181
1182.. literalinclude:: code/orngTree2.py
1183   :lines: 7-31
1184
1185``get_margin`` computes the margin from the distribution. The replacing
1186function, ``replaceB``, computes the margin for the node. If the :data:`by`
1187group is present, we call :func:`by_whom` to get the node by whose
1188margin this node's margin is to be divided. If that node (usually the
1189parent) does not exist or if its margin is zero, :func:`insert_dot`
1190inserts a dot; otherwise :func:`insert_num` is called, which inserts the
1191number in the user-specified format. ``my_format`` contains the regular
1192expression and the callback function.
1193
1194Printing the tree with
1195
1196.. literalinclude:: code/orngTree2.py
1197    :lines: 33
1198
1199yields::
1200
1201    petal width<0.800: Iris-setosa 100% (100.00%)
1202    petal width>=0.800
1203    |    petal width<1.750
1204    |    |    petal length<5.350: Iris-versicolor 88% (108.57%)
1205    |    |    petal length>=5.350: Iris-virginica 100% (122.73%)
1206    |    petal width>=1.750
1207    |    |    petal length<4.850: Iris-virginica 33% (34.85%)
1208    |    |    petal length>=4.850: Iris-virginica 100% (104.55%)
1209
1210Plotting with Dot
1211---------------------------
1212
1213To produce images of trees, first create a .dot file with
1214:meth:`TreeClassifier.dot`. If it was saved to "tree5.dot", plot a gif
1215with the following command::
1216
1217    dot -Tgif tree5.dot -otree5.gif
1218
1219Check GraphViz's dot documentation for more options and
1220output formats.
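
A sketch of the first step (assuming :meth:`TreeClassifier.dot` accepts
a file name and the same format strings as ``to_string``)::

    tree.dot("tree5.dot", leaf_str="%V", node_str=".")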
1221
1222
1223===========================
1224C4.5 Classifier and Learner
1225===========================
1226
1227C4.5 is a standard benchmark in machine learning and is therefore
1228incorporated in Orange. The implementation uses the original C4.5 code,
1229so the resulting tree is exactly like the one that would be built by
1230standalone C4.5; the built tree is then made accessible from Python.
1231
1232:class:`C45Learner` and :class:`C45Classifier` behave
1233like any other Orange learner and classifier. Unlike most Orange
1234learning algorithms, C4.5 does not accept weighted instances.
1235
1236Building the C4.5 plug-in
1237=========================
1238
1239C4.5 is not distributed with Orange, but it can be added as a
1240plug-in. A C compiler is needed for the procedure: on Windows MS Visual C
1241(CL.EXE and LINK.EXE must be on the PATH), on Linux and OS X gcc (OS X
1242users can download it from Apple).
1243
1244Orange must be installed prior to building C4.5.
1245
1246#. Download
1247   `C4.5 (Release 8) sources <http://www.rulequest.com/Personal/c4.5r8.tar.gz>`_
1248   from `RuleQuest's site <http://www.rulequest.com/>`_ and extract
1249   them. The files will be modified during the
1250   build process.
1251#. Download
1252   `buildC45.zip <http://orange.biolab.si/orange/download/buildC45.zip>`_
1253   and unzip its contents into the directory R8/Src of the C4.5 sources
1254   (this directory contains, for instance, the file average.c).
1255#. Run buildC45.py, which will build the plug-in and put it next to
1256   orange.pyd (or orange.so on Linux/Mac).
1257#. Run python, type ``import Orange`` and
1258   create ``Orange.classification.tree.C45Learner()``. This should
1259   succeed.
1260#. Finally, you can remove the C4.5 sources.
1261
1262The buildC45.py script creates .h files that wrap Quinlan's .i files and
1263ensure that they are not included twice. It modifies C4.5 sources to
1264include .h's instead of .i's (this step can hardly fail). Then it compiles
1265ensemble.c into c45.dll or c45.so and puts it next to Orange. In the end
1266it checks if the built C4.5 gives the same results as the original.
1267
1268.. autoclass:: C45Learner
1269    :members:
1270
1271.. autoclass:: C45Classifier
1272    :members:
1273
1274.. class:: C45Node
1275
1276    This class is a reimplementation of the corresponding *struct* from
1277    Quinlan's C4.5 code.
1278
1279    .. attribute:: node_type
1280
1281        Type of the node:  :obj:`C45Node.Leaf` (0),
1282        :obj:`C45Node.Branch` (1), :obj:`C45Node.Cut` (2),
1283        :obj:`C45Node.Subset` (3). "Leaves" are leaves, "branches"
1284        split instances based on values of a discrete attribute,
1285        "cuts" cut them according to a threshold value of a continuous
1286        attribute and "subsets" use discrete attributes but with subsetting
1287        so that several values can go into the same branch.
1288
1289    .. attribute:: leaf
1290
1291        Value returned by that leaf. The field is defined for internal
1292        nodes as well.
1293
1294    .. attribute:: items
1295
1296        Number of (learning) instances in the node.
1297
1298    .. attribute:: class_dist
1299
1300        Class distribution for the node (of type
1301        :obj:`Orange.statistics.distribution.Discrete`).
1302
1303    .. attribute:: tested
1304       
1305        The attribute used in the node's test. If the node is a leaf,
1306        :obj:`tested` is None; if the node is of type :obj:`Branch` or
1307        :obj:`Subset`, :obj:`tested` is a discrete attribute, and if it
1308        is of type :obj:`Cut`, :obj:`tested` is a continuous attribute.
1309
1310    .. attribute:: cut
1311
1312        A threshold for continuous attributes, if node is of type :obj:`Cut`.
1313        Undefined otherwise.
1314
1315    .. attribute:: mapping
1316
1317        Mapping for nodes of type :obj:`Subset`. Element ``mapping[i]``
1318        gives the branch index for an instance whose value of :obj:`tested` is *i*.
1319        Here, *i* denotes an index of value, not a :class:`Orange.data.Value`.
1320
1321    .. attribute:: branch
1322       
1323        A list of branches stemming from this node.
1324
1325Examples
1326========
1327
1328This
1329script constructs the same learner as you would get by calling
1330the usual C4.5:
1331
1332.. literalinclude:: code/tree_c45.py
1333   :lines: 7-14
1334
1335Both C4.5 command-line symbols and variable names can be used. The
1336following lines produce the same result::
1337
1338    tree = Orange.classification.tree.C45Learner(data, m=100)
1339    tree = Orange.classification.tree.C45Learner(data, min_objs=100)
1340
1341A veteran C4.5 user might prefer :func:`C45Learner.commandline`::
1342
1343    lrn = Orange.classification.tree.C45Learner()
1344    lrn.commandline("-m 1 -s")
1345    tree = lrn(data)
1346
1347The following script prints out the tree in the same format as C4.5 does.
1348
1349.. literalinclude:: code/tree_c45_printtree.py
1350
1351For the leaves, just the value in ``node.leaf`` is printed. Since
1352:obj:`C45Node` does not know to which attribute it belongs, we need to
1353convert it to a string through ``classvar``, which is passed as an extra
1354argument to the recursive part of ``print_tree``.
1355
1356For discrete splits without subsetting, we print out all attribute values
1357and recursively call the function for all branches. Continuous splits
1358are equally easy to handle.
1359
1360For discrete splits with subsetting, we iterate through the branches,
1361retrieve the corresponding values that go into each branch,
1362turn the values into strings and print them out, separately treating
1363the case when only a single value goes into the branch.
1364
1365=================
1366SimpleTreeLearner
1367=================
1368
1369.. include:: /SimpleTreeLearner.txt
1370       
1371Examples
1372========
1373
1374:obj:`SimpleTreeLearner` is used in much the same way as :obj:`TreeLearner`.
1375A typical example of using :obj:`SimpleTreeLearner` would be to build a random
1376forest:
1377
1378.. literalinclude:: code/simple_tree_random_forest.py
1379
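A minimal sketch of direct use (``max_depth`` is assumed to be among
:obj:`SimpleTreeLearner`'s parameters)::

    import Orange

    iris = Orange.data.Table("iris")
    learner = Orange.classification.tree.SimpleTreeLearner(max_depth=3)
    classifier = learner(iris)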
1380
1381References
1382==========
1383
1384Bratko, I. (2002). `Prolog Programming for Artificial Intelligence`,
1385Addison Wesley.
1386
1387Koutsofios, E., North, S.C. (1993). Drawing Graphs with dot. AT&T Bell
1388Laboratories, Murray Hill, NJ, USA.
1389
1390`Graphviz - open source graph drawing software <http://www.research.att.com/sw/tools/graphviz/>`_:
1391the home page of AT&T's dot and similar software packages.
1392
1393"""
1394
1395"""
1396TODO C++ aliases
1397
1398SplitConstructor.discrete/continuous_split_constructor -> SplitConstructor.discrete
1399Node.examples -> Node.instances
1400"""
1401
1402from Orange.core import \
1403     TreeLearner as _TreeLearner, \
1404         TreeClassifier as _TreeClassifier, \
1405         SimpleTreeLearner, \
1406         SimpleTreeClassifier, \
1407         C45Learner as _C45Learner, \
1408         C45Classifier as _C45Classifier, \
1409         C45TreeNode as C45Node, \
1410         C45TreeNodeList as C45NodeList, \
1411         TreeDescender as Descender, \
1412              TreeDescender_UnknownMergeAsBranchSizes as Descender_UnknownMergeAsBranchSizes, \
1413              TreeDescender_UnknownMergeAsSelector as Descender_UnknownMergeAsSelector, \
1414              TreeDescender_UnknownToBranch as Descender_UnknownToBranch, \
1415              TreeDescender_UnknownToCommonBranch as Descender_UnknownToCommonBranch, \
1416              TreeDescender_UnknownToCommonSelector as Descender_UnknownToCommonSelector, \
1417         TreeExampleSplitter as Splitter, \
1418              TreeExampleSplitter_IgnoreUnknowns as Splitter_IgnoreUnknowns, \
1419              TreeExampleSplitter_UnknownsAsBranchSizes as Splitter_UnknownsAsBranchSizes, \
1420              TreeExampleSplitter_UnknownsAsSelector as Splitter_UnknownsAsSelector, \
1421              TreeExampleSplitter_UnknownsToAll as Splitter_UnknownsToAll, \
1422              TreeExampleSplitter_UnknownsToBranch as Splitter_UnknownsToBranch, \
1423              TreeExampleSplitter_UnknownsToCommon as Splitter_UnknownsToCommon, \
1424              TreeExampleSplitter_UnknownsToRandom as Splitter_UnknownsToRandom, \
1425         TreeNode as Node, \
1426         TreeNodeList as NodeList, \
1427         TreePruner as Pruner, \
1428              TreePruner_SameMajority as Pruner_SameMajority, \
1429              TreePruner_m as Pruner_m, \
1430         TreeSplitConstructor as SplitConstructor, \
1431              TreeSplitConstructor_Combined as SplitConstructor_Combined, \
1432              TreeSplitConstructor_Measure as SplitConstructor_Score, \
1433                   TreeSplitConstructor_Attribute as SplitConstructor_Feature, \
1434                   TreeSplitConstructor_ExhaustiveBinary as SplitConstructor_ExhaustiveBinary, \
1435                   TreeSplitConstructor_OneAgainstOthers as SplitConstructor_OneAgainstOthers, \
1436                   TreeSplitConstructor_Threshold as SplitConstructor_Threshold, \
1437         TreeStopCriteria as StopCriteria, \
1438              TreeStopCriteria_Python as StopCriteria_Python, \
1439              TreeStopCriteria_common as StopCriteria_common
1440
1441import Orange.core
1442import operator
1443import base64
1444import re
1445import Orange.data
1446import Orange.feature.scoring
1447import warnings
1448
1449class C45Learner(Orange.classification.Learner):
1450    """
    :class:`C45Learner`'s attributes have double names: the names known
    from the C4.5 command line and the corresponding names of C4.5's
    internal variables. All defaults are set as in C4.5; if you change
    nothing, you are running C4.5.
1455
1456    Constructs a :obj:`C45Classifier` when given data.
1457
1458    .. attribute:: gain_ratio (g)
1459       
1460        Determines whether to use information gain (false, default)
1461        or gain ratio for selection of attributes (true).
1462
1463    .. attribute:: batch (b)
1464
1465        Turn on batch mode (no windows, no iterations); this option is
1466        not documented in C4.5 manuals. It conflicts with "window",
1467        "increment" and "trials".
1468
1469    .. attribute:: subset (s)
1470       
        Enables subsetting (default: false, no subsetting).
1472 
1473    .. attribute:: prob_thresh (p)
1474
1475        Probabilistic threshold for continuous attributes (default: false).
1476
1477    .. attribute:: min_objs (m)
1478       
1479        Minimal number of objects (instances) in leaves (default: 2).
1480
1481    .. attribute:: window (w)
1482
        Initial window size (default: the maximum of 20% and twice the
        square root of the number of data objects).
1485
1486    .. attribute:: increment (i)
1487
1488        The maximum number of objects that can be added to the window
1489        at each iteration (default: 20% of the initial window size).
1490
1491    .. attribute:: cf (c)
1492
        Pruning confidence level (default: 25%).
1494
1495    .. attribute:: trials (t)
1496
1497        Set the number of trials in iterative (i.e. non-batch) mode (default: 10).
1498
1499    .. attribute:: prune
1500       
1501        Return pruned tree (not an original C4.5 option) (default: true)
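
    For instance, the following sketch (assuming the ``voting`` data
    set is available) builds an unpruned tree with at least ten
    instances in each leaf::

        import Orange
        data = Orange.data.Table("voting")
        c45 = Orange.classification.tree.C45Learner(data, m=10, prune=False)
        print c45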
1502    """
1503
    _rename_new_old = { "min_objs": "minObjs", "prob_thresh": "probThresh",
            "gain_ratio": "gainRatio" }
1506    #_rename_new_old = {}
1507    _rename_old_new = dict((a, b) for b, a in _rename_new_old.items())
1508
1509    @classmethod
1510    def _rename_dict(cls, dic):
1511        return dict((cls._rename_arg(a), b) for a, b in dic.items())
1512
1513    @classmethod
1514    def _rename_arg(cls, a):
1515        if a in cls._rename_old_new:
1516            Orange.misc.deprecation_warning(a, cls._rename_old_new[a], stacklevel=4)
1517        return cls._rename_new_old.get(a, a)
1518
1519    def __new__(cls, instances=None, weightID=0, **argkw):
1520        self = Orange.classification.Learner.__new__(cls, **cls._rename_dict(argkw))
1521        if instances:
1522            self.__init__(**cls._rename_dict(argkw))
1523            return self.__call__(instances, weightID)
1524        else:
1525            return self
1526
1527    def __init__(self, **kwargs):
1528        self.base = _C45Learner(**self._rename_dict(kwargs))
1529
1530    def __setattr__(self, name, value):
1531        nn = self._rename_arg(name)
1532        if name != "base" and nn in self.base.__dict__:
1533            self.base.__dict__[nn] = value
1534        else:
1535            self.__dict__[nn] = value
1536
1537    def __getattr__(self, name):
1538        nn = self._rename_arg(name)
        if name != "base" and nn in self.base.__dict__:
1540            return self.base.__dict__[nn]
1541        else:
1542            return self.__dict__[nn]
1543
1544    def __call__(self, *args, **kwargs):
1545        return C45Classifier(self.base(*args, **self._rename_dict(kwargs)))
1546
1547    def commandline(self, ln):
1548        """
1549        Set the arguments with a C4.5 command line.
1550        """
1551        self.base.commandline(ln)
1552
1553
1554class C45Classifier(Orange.classification.Classifier):
1555    """
    A faithful reimplementation of Quinlan's C4.5 that stores the
    induced tree in a structure composed of :class:`C45Node` objects
    instead of C4.5's original tree structure.
1559
1560    .. attribute:: tree
1561
1562        C4.5 tree stored as :obj:`C45Node`.
1563    """
1564
1565    def __init__(self, base_classifier):
1566        self.nativeClassifier = base_classifier
1567        for k, v in self.nativeClassifier.__dict__.items():
1568            self.__dict__[k] = v
1569
1570    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue,
1571                 *args, **kwdargs):
1572        """Classify a new instance.
1573       
1574        :param instance: instance to be classified.
1575        :type instance: :class:`Orange.data.Instance`
1576        :param result_type:
1577              :class:`Orange.classification.Classifier.GetValue` or \
1578              :class:`Orange.classification.Classifier.GetProbabilities` or
1579              :class:`Orange.classification.Classifier.GetBoth`
1580       
1581        :rtype: :class:`Orange.data.Value`,
1582              :class:`Orange.statistics.Distribution` or a tuple with both
1583        """
1584        return self.nativeClassifier(instance, result_type, *args, **kwdargs)
1585
1586    def __setattr__(self, name, value):
1587        if name == "nativeClassifier":
1588            self.__dict__[name] = value
1589            return
1590        if name in self.nativeClassifier.__dict__:
1591            self.nativeClassifier.__dict__[name] = value
1592        self.__dict__[name] = value
1593
1594    def __str__(self):
1595        return self.dump()
1596
1597
1598    def dump(self):
1599        """
        Return a string with the tree in the same form as Ross
        Quinlan's C4.5 program prints it.
1602
1603        ::
1604
1605            import Orange
1606
1607            data = Orange.data.Table("voting")
1608            c45 = Orange.classification.tree.C45Learner(data)
1609            print c45
1610
1611        prints
1612
1613        ::
1614
1615            physician-fee-freeze = n: democrat (253.4)
1616            physician-fee-freeze = y:
1617            |   synfuels-corporation-cutback = n: republican (145.7)
1618            |   synfuels-corporation-cutback = y:
1619            |   |   mx-missile = y: democrat (6.0)
1620            |   |   mx-missile = n:
1621            |   |   |   adoption-of-the-budget-resolution = n: republican (22.6)
1622            |   |   |   adoption-of-the-budget-resolution = y:
1623            |   |   |   |   anti-satellite-test-ban = n: democrat (5.0)
1624            |   |   |   |   anti-satellite-test-ban = y: republican (2.2)
1625
1626
1627        The standalone C4.5 would print::
1628
1629            physician-fee-freeze = n: democrat (253.4/5.9)
1630            physician-fee-freeze = y:
1631            |   synfuels-corporation-cutback = n: republican (145.7/6.2)
1632            |   synfuels-corporation-cutback = y:
1633            |   |   mx-missile = y: democrat (6.0/2.4)
1634            |   |   mx-missile = n:
1635            |   |   |   adoption-of-the-budget-resolution = n: republican (22.6/5.2)
1636            |   |   |   adoption-of-the-budget-resolution = y:
1637            |   |   |   |   anti-satellite-test-ban = n: democrat (5.0/1.2)
1638            |   |   |   |   anti-satellite-test-ban = y: republican (2.2/1.0)
1639
1640        C4.5 also prints out the number of errors on learning data in
1641        each node.
1642        """
1643        return  _c45_printTree0(self.tree, self.class_var, 0)
1644
1645    to_string = dump
1646
1647def _c45_showBranch(node, classvar, lev, i):
1648    var = node.tested
1649    str_ = ""
1650    if node.node_type == 1:
1651        str_ += "\n" + "|   "*lev + "%s = %s:" % (var.name, var.values[i])
1652        str_ += _c45_printTree0(node.branch[i], classvar, lev + 1)
1653    elif node.node_type == 2:
1654        str_ += "\n" + "|   "*lev + "%s %s %.1f:" % (var.name, ["<=", ">"][i], node.cut)
1655        str_ += _c45_printTree0(node.branch[i], classvar, lev + 1)
1656    else:
1657        inset = filter(lambda a:a[1] == i, enumerate(node.mapping))
1658        inset = [var.values[j[0]] for j in inset]
1659        if len(inset) == 1:
1660            str_ += "\n" + "|   "*lev + "%s = %s:" % (var.name, inset[0])
1661        else:
1662            str_ += "\n" + "|   "*lev + "%s in {%s}:" % (var.name, ", ".join(inset))
1663        str_ += _c45_printTree0(node.branch[i], classvar, lev + 1)
1664    return str_
1665
1666
1667def _c45_printTree0(node, classvar, lev):
1668    var = node.tested
1669    str_ = ""
1670    if node.node_type == 0:
1671        str_ += "%s (%.1f)" % (classvar.values[int(node.leaf)], node.items)
1672    else:
1673        for i, branch in enumerate(node.branch):
1674            if not branch.node_type:
1675                str_ += _c45_showBranch(node, classvar, lev, i)
1676        for i, branch in enumerate(node.branch):
1677            if branch.node_type:
1678                str_ += _c45_showBranch(node, classvar, lev, i)
1679    return str_
1680
1681def _printTreeC45(tree):
1682    print _c45_printTree0(tree.tree, tree.class_var, 0)
1683
1684
1685import Orange.feature.scoring as fscoring
1686
1687class TreeLearner(Orange.core.Learner):
1688    """
1689    A classification or regression tree learner. If a set of instances
1690    is given on initialization, a :class:`TreeClassifier` is built and
1691    returned instead. All attributes can also be set on initialization.
1692
1693    **The tree induction process**
1694
    #. The learning instances are copied, unless
       :obj:`store_instances` is `False` and the instances are
       already stored in a :obj:`~Orange.data.Table`.
    #. A priori class probabilities are computed. A list of
1699       candidate features for the split is compiled; in the beginning,
1700       all features are candidates.
1701    #. The recursive part. The contingency matrix is computed by
1702       :obj:`contingency_computer`. Contingencies are used by :obj:`split`,
1703       :obj:`stop` and :obj:`splitter`.
1704    #. If the induction should :obj:`stop`, a :obj:`~Node.node_classifier`
1705       is built by calling :obj:`node_learner` with the given instances,
1706       weight ID and the contingency matrix. As the learner uses
1707       contingencies whenever possible, the :obj:`contingency_computer`
1708       will affect the :obj:`~Node.node_classifier`. The node is returned.
    #. If the induction continues, :obj:`split` is called.
       If :obj:`split` fails to return a branch selector, induction stops
       and the :obj:`Node` is returned.
    #. The feature used by the split (if any) is removed from the
       candidate list.
1713    #. Instances are divided into child nodes with :obj:`splitter`.
1714       The process recursively continues with step 3 for
1715       each of the non-empty subsets. If the splitter returned weights,
1716       they are used for each branch.
1717
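    A short end-to-end sketch exercising the process above (the ``iris``
    data set and the attributes documented below are assumed)::

        import Orange
        data = Orange.data.Table("iris")
        learner = Orange.classification.tree.TreeLearner(
            max_depth=3, same_majority_pruning=True)
        tree = learner(data)
        print tree.to_string()
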
1718    **Attributes**
1719
1720    .. attribute:: node_learner
1721
1722        Induces a classifier from instances in a node. It is used both
1723        for internal nodes and leaves. The default is
1724        :obj:`Orange.classification.majority.MajorityLearner`.
1725
1726    .. attribute:: descender
1727
1728        The descender that the induced :obj:`TreeClassifier` will
1729        use. The default is :obj:`Descender_UnknownMergeAsSelector`.
        It uses the :obj:`branch_selector`'s distribution as
        vote weights.
1732
1733    .. attribute:: contingency_computer
1734
1735        Defines the computation of contingency matrices (used by
1736        :obj:`split`, :obj:`stop`, :obj:`splitter`). It can be used,
1737        for example, to change the treatment of unknown values. By
1738        default ordinary contingency matrices are computed.
1739
1740    **Split construction**
1741
1742    .. attribute:: split
1743       
1744        A :obj:`SplitConstructor` or a function with the same signature as
1745        :obj:`SplitConstructor.__call__`. It is useful for prototyping
1746        new tree induction algorithms. If :obj:`split` is defined, other
1747        arguments that affect split construction are ignored. These include
1748        :obj:`binarization`, :obj:`measure`, :obj:`worst_acceptable` and
1749        :obj:`min_subset`. Default: :class:`SplitConstructor_Combined`
1750        with separate constructors for discrete and continuous
        features. Discrete features are used as they are, while
        continuous ones are binarized. Features are scored with gain
        ratio. At least two instances in a leaf are required for
        discrete and five for continuous features.
1755
1756    .. attribute:: binarization
1757
1758        If 1, :class:`SplitConstructor_ExhaustiveBinary` is used.
1759        If 2, use :class:`SplitConstructor_OneAgainstOthers`. If
1760        0, do not use binarization (use :class:`SplitConstructor_Feature`).
1761        Default: 0.
1762
1763    .. attribute:: measure
1764   
1765        A score to evaluate features for splitting instances in a
1766        node.  A subclass of :class:`Orange.feature.scoring.Score`
1767        (perhaps :class:`~Orange.feature.scoring.InfoGain`,
1768        :class:`~Orange.feature.scoring.GainRatio`,
1769        :class:`~Orange.feature.scoring.Gini`,
1770        :class:`~Orange.feature.scoring.Relief`, or
1771        :class:`~Orange.feature.scoring.MSE`). Default:
1772        :class:`Orange.feature.scoring.GainRatio`.
1773
1774    .. attribute:: relief_m, relief_k
1775
1776        Set `m` and `k` for :class:`~Orange.feature.scoring.Relief`,
1777        if chosen.
1778
1779    .. attribute:: splitter
1780
1781        :class:`Splitter` or a function with the same
1782        signature as :obj:`Splitter.__call__`. The default is
1783        :class:`Splitter_UnknownsAsSelector` that splits the
1784        learning instances according to distributions given by the
1785        selector.
1786
1787    **Pruning**
1788
1789    .. attribute:: worst_acceptable
1790
1791        The lowest required feature score. If the score of the best
1792        feature is below this margin, the tree is not grown further
1793        (default: 0).
1794
1795    .. attribute:: min_subset
1796
1797        The lowest required number of instances in non-null leaves (default: 0).
1798
1799    .. attribute:: min_instances
1800
        Data subsets with fewer than :obj:`min_instances`
1802        instances are not split any further. Therefore, all leaves in the tree
1803        will contain at least that many instances (default: 0).
1804
1805    .. attribute:: max_depth
1806
        Maximal tree depth. If 0, only the root node is generated
        (default: 100).
1809
1810    .. attribute:: max_majority
1811
1812        Induction stops when the proportion of majority class in the
1813        node exceeds the value set by this parameter (default: 1.0).
1814 
1815    .. attribute:: stop
1816
        :class:`StopCriteria` or a function with the same signature as
        :obj:`StopCriteria.__call__`. Useful for prototyping new tree
        induction algorithms. When used, the parameters :obj:`max_majority`
        and :obj:`min_instances` will not be considered. The default
1821        stopping criterion stops induction when all instances in a node
1822        belong to the same class.
1823
1824    .. attribute:: m_pruning
1825
1826        If non-zero, invokes an error-based bottom-up post-pruning,
1827        where m-estimate is used to estimate class probabilities
1828        (default: 0).
1829
1830    .. attribute:: same_majority_pruning
1831
1832        If true, invokes a bottom-up post-pruning by removing the
1833        subtrees of which all leaves classify to the same class
1834        (default: False).
1835
1836    **Record keeping**
1837
1838    .. attribute:: store_distributions
1839   
1840    .. attribute:: store_contingencies
1841   
1842    .. attribute:: store_instances
1843   
1844    .. attribute:: store_node_classifier
1845
        Determines whether to store class distributions, contingencies
        and instances in :class:`Node`, and whether the
        :obj:`Node.node_classifier` should be built for internal nodes
        as well (it is needed by the :obj:`Descender` or for post-pruning).
        Not storing distributions but storing contingencies does not
        save any memory, since the distribution actually points to the
        same distribution that is stored in :obj:`contingency.classes`.
        By default everything except :obj:`store_instances` is enabled.
1854
1855    """
1856    def __new__(cls, data=None, weightID=0, **argkw):
1857        self = Orange.core.Learner.__new__(cls, **argkw)
1858        if data:
1859            self.__init__(**argkw)
1860            return self.__call__(data, weightID)
1861        else:
1862            return self
1863
1864    def __init__(self, **kw):
1865
        #name, build function, parameters
        #build functions are not saved as function references
        #because that would cause problems when copying objects
1869        for n, (fn, _) in self._built_fn.items():
1870            self.__dict__["_handset_" + n] = False
1871
1872        #measure has to be before split
1873        self.measure = None
1874        self.split = None
1875        self.stop = None
1876        self.splitter = None
1877
1878        for n, (fn, _) in self._built_fn.items():
1879            self.__dict__[n] = fn(self)
1880
1881        for k, v in kw.items():
1882            self.__setattr__(k, v)
1883
1884    def __call__(self, instances, weight=0):
1885        """
1886        Return a classifier from the given instances.
1887        """
1888        bl = self._base_learner()
1889
1890        #set the scoring criteria for regression if it was not
1891        #set by the user
1892        if not self._handset_split and not self.measure:
1893            measure = fscoring.GainRatio() \
1894                if instances.domain.class_var.var_type == Orange.feature.Type.Discrete \
1895                else fscoring.MSE()
1896            bl.split.continuous_split_constructor.measure = measure
1897            bl.split.discrete_split_constructor.measure = measure
1898
        if self.splitter is not None:
1900            bl.example_splitter = self.splitter
1901
1902        #post pruning
1903        tree = bl(instances, weight)
1904        if getattr(self, "same_majority_pruning", 0):
1905            tree = Pruner_SameMajority(tree)
1906        if getattr(self, "m_pruning", 0):
1907            tree = Pruner_m(tree, m=self.m_pruning)
1908
1909        return TreeClassifier(base_classifier=tree)
1910
1911    def __setattr__(self, name, value):
1912        self.__dict__[name] = value
1913        for n, (fn, v) in self._built_fn.items():
1914            if name in v:
1915                if not self.__dict__["_handset_" + n]:
1916                    self.__dict__[n] = fn(self)
1917                else:
1918                    warnings.warn("Changing \"" + name + "\" does not have any effect as \"" + n + "\" was already set", UserWarning, 2)
1919            elif n == name:
                if value is None:
1921                    self.__dict__[n] = fn(self)
1922                    self.__dict__["_handset_" + n] = False
1923                    #print n, "was now disabled by hand"
1924                else:
1925                    self.__dict__["_handset_" + n] = True
1926                    #print n, "is now handset"
1927        #print self.__dict__
1928
1929    def __delattr__(self, name):
1930        self.__setattr__(name, None) #use the __setattr__
1931        del self.__dict__[name]
1932
1933    def instance(self):
1934        """
1935        DEPRECATED. Return a base learner - an object
1936        of :class:`_TreeLearner`.
1937        This method is left for backwards compatibility.
1938        """
1939        return self._base_learner()
1940
1941    def _build_split(self):
1942        """
1943        Return the split constructor built according to object attributes.
1944        """
1945        split = SplitConstructor_Combined()
1946        split.continuous_split_constructor = \
1947            SplitConstructor_Threshold()
1948        binarization = getattr(self, "binarization", 0)
1949        if binarization == 1:
1950            split.discrete_split_constructor = \
1951                SplitConstructor_ExhaustiveBinary()
1952        elif binarization == 2:
1953            split.discrete_split_constructor = \
1954                SplitConstructor_OneAgainstOthers()
1955        else:
1956            split.discrete_split_constructor = \
1957                SplitConstructor_Feature()
1958
1959        measures = {"infoGain": fscoring.InfoGain,
1960            "gainRatio": fscoring.GainRatio,
1961            "gini": fscoring.Gini,
1962            "relief": fscoring.Relief,
1963            "retis": fscoring.MSE
1964            }
1965
1966        measure = self.measure
1967        if isinstance(measure, str):
1968            measure = measures[measure]()
1969        if not measure:
1970            measure = fscoring.GainRatio()
1971
1972        measureIsRelief = isinstance(measure, fscoring.Relief)
1973        relM = getattr(self, "relief_m", None)
1974        if relM and measureIsRelief:
1975            measure.m = relM
1976
1977        relK = getattr(self, "relief_k", None)
1978        if relK and measureIsRelief:
1979            measure.k = relK
1980
1981        split.continuous_split_constructor.measure = measure
1982        split.discrete_split_constructor.measure = measure
1983
1984        wa = getattr(self, "worst_acceptable", 0)
1985        if wa:
1986            split.continuous_split_constructor.worst_acceptable = wa
1987            split.discrete_split_constructor.worst_acceptable = wa
1988
1989        ms = getattr(self, "min_subset", 0)
1990        if ms:
1991            split.continuous_split_constructor.min_subset = ms
1992            split.discrete_split_constructor.min_subset = ms
1993
1994        return split
1995
1996    def _build_stop(self):
1997        """
1998        Return the stop criteria built according to object's attributes.
1999        """
2000        stop = Orange.classification.tree.StopCriteria_common()
2001        mm = getattr(self, "max_majority", 1.0)
2002        if mm < 1.0:
2003            stop.max_majority = self.max_majority
2004        me = getattr(self, "min_instances", 0)
2005        if me:
2006            stop.min_instances = self.min_instances
2007        return stop
2008
2009    def _base_learner(self):
2010        learner = _TreeLearner()
2011
2012        learner.split = self.split
2013        learner.stop = self.stop
2014
2015        for a in ["store_distributions", "store_contingencies",
2016            "store_node_classifier", "node_learner", "max_depth", "contingency_computer", "descender" ]:
2017            if hasattr(self, a):
2018                setattr(learner, a, getattr(self, a))
2019
2020        if hasattr(self, "store_instances"):
2021            learner.store_examples = self.store_instances
2022
2023        return learner
2024
2025    _built_fn = {
2026            "split": [ _build_split, [ "binarization", "measure", "relief_m", "relief_k", "worst_acceptable", "min_subset" ] ], \
2027            "stop": [ _build_stop, ["max_majority", "min_instances" ] ]
2028        }
2029
2030
2031
2032TreeLearner = Orange.misc.deprecated_members({
2033          "mForPruning": "m_pruning",
2034          "sameMajorityPruning": "same_majority_pruning",
2035          "reliefM": "relief_m",
2036          "reliefK": "relief_k",
2037          "storeDistributions": "store_distributions",
2038          "storeContingencies": "store_contingencies",
2039          "storeExamples": "store_instances",
2040          "store_examples": "store_instances",
2041          "storeNodeClassifier": "store_node_classifier",
2042          "worstAcceptable": "worst_acceptable",
2043          "minSubset": "min_subset",
2044          "maxMajority": "max_majority",
2045          "minExamples": "min_instances",
2046          "maxDepth": "max_depth",
2047          "nodeLearner": "node_learner",
2048          "min_examples": "min_instances"
2049}, wrap_methods=[])(TreeLearner)
2050
2051#
2052# the following is for the output
2053#
2054
2055fs = r"(?P<m100>\^?)(?P<fs>(\d*\.?\d*)?)"
2056""" Defines the multiplier by 100 (``^``) and the format
2057for the number of decimals (e.g. ``5.3``). The corresponding
2058groups are named ``m100`` and ``fs``. """
2059
2060by = r"(?P<by>(b(P|A)))?"
2061""" Defines bP or bA or nothing; the result is in groups by. """
2062
2063bysub = r"((?P<bysub>b|s)(?P<by>P|A))?"
2064opc = r"(?P<op>=|<|>|(<=)|(>=)|(!=))(?P<num>\d*\.?\d+)"
2065opd = r'(?P<op>=|(!=))"(?P<cls>[^"]*)"'
2066intrvl = r'((\((?P<intp>\d+)%?\))|(\(0?\.(?P<intv>\d+)\))|)'
2067fromto = r"(?P<out>!?)(?P<lowin>\(|\[)(?P<lower>\d*\.?\d+)\s*,\s*(?P<upper>\d*\.?\d+)(?P<upin>\]|\))"
2068re_V = re.compile("%V")
2069re_N = re.compile("%" + fs + "N" + by)
2070re_M = re.compile("%" + fs + "M" + by)
2071re_m = re.compile("%" + fs + "m" + by)
2072re_Ccont = re.compile("%" + fs + "C" + by + opc)
2073re_Cdisc = re.compile("%" + fs + "C" + by + opd)
2074re_ccont = re.compile("%" + fs + "c" + by + opc)
2075re_cdisc = re.compile("%" + fs + "c" + by + opd)
2076re_Cconti = re.compile("%" + fs + "C" + by + fromto)
2077re_cconti = re.compile("%" + fs + "c" + by + fromto)
2078re_D = re.compile("%" + fs + "D" + by)
2079re_d = re.compile("%" + fs + "d" + by)
2080re_AE = re.compile("%" + fs + "(?P<AorE>A|E)" + bysub)
2081re_I = re.compile("%" + fs + "I" + intrvl)
2082
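# For illustration (an assumed call, with a `tree` classifier built as in
# the docstring examples): "%6.2NbP" matches re_N with fs="6.2" and
# by="bP", so replaceN prints the node's instance count divided by the
# parent's count:
#
#     print tree.to_string(leaf_str="%V (%6.2NbP)")
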
2083def insert_str(s, mo, sub):
2084    """ Replace the part of s which is covered by mo
2085    with the string sub. """
2086    return s[:mo.start()] + sub + s[mo.end():]
2087
2088def insert_dot(s, mo):
2089    """ Replace the part of s which is covered by mo
2090    with a dot.  You should use this when the
2091    function cannot compute the desired quantity; it is called, for instance,
2092    when it needs to divide by something in the parent, but the parent
2093    doesn't exist.
2094    """
2095    return s[:mo.start()] + "." + s[mo.end():]
2096
2097def insert_num(s, mo, n):
    """ Replace the part of s matched by mo with the number n,
    formatted as specified by the user: multiplied by 100 if the format
    requests it, and printed with the right number of places and
    decimals. It does so by checking the mo for a group named m100
    (representing the ``^`` in the format string) and a group named fs
    representing the part giving the number of decimals (e.g. ``5.3``).
    """
2106    grps = mo.groupdict()
2107    m100 = grps.get("m100", None)
2108    if m100:
2109        n *= 100
2110    fs = grps.get("fs") or (m100 and ".0" or "5.3")
2111    return s[:mo.start()] + ("%%%sf" % fs % n) + s[mo.end():]
2112
2113def by_whom(by, parent, tree):
    """ If by equals "bP", return parent, else return
2115    ``tree.tree``. This is used to find what to divide the quantity
2116    with, when division is required.
2117    """
2118    if by == "bP":
2119        return parent
2120    else:
2121        return tree.tree
2122
2123def replaceV(strg, mo, node, parent, tree):
2124    return insert_str(strg, mo, str(node.node_classifier.default_value))
2125
2126def replaceN(strg, mo, node, parent, tree):
2127    by = mo.group("by")
2128    N = node.distribution.abs
2129    if by:
2130        whom = by_whom(by, parent, tree)
2131        if whom and whom.distribution:
2132            if whom.distribution.abs > 1e-30:
2133                N /= whom.distribution.abs
2134        else:
2135            return insert_dot(strg, mo)
2136    return insert_num(strg, mo, N)
2137
2138
2139def replaceM(strg, mo, node, parent, tree):
2140    by = mo.group("by")
2141    maj = int(node.node_classifier.default_value)
2142    N = node.distribution[maj]
2143    if by:
2144        whom = by_whom(by, parent, tree)
2145        if whom and whom.distribution:
2146            if whom.distribution[maj] > 1e-30:
2147                N /= whom.distribution[maj]
2148        else:
2149            return insert_dot(strg, mo)
2150    return insert_num(strg, mo, N)
2151
2152
def replacem(strg, mo, node, parent, tree):
    by = mo.group("by")
    maj = int(node.node_classifier.default_value)
    if node.distribution.abs > 1e-30:
        N = node.distribution[maj] / node.distribution.abs
        if by:
            whom = by_whom(by, parent, tree)  # the node to divide by (parent or root)
            if whom and whom.distribution:
                byN = whom.distribution[maj] / whom.distribution.abs
                if byN > 1e-30:
                    N /= byN
            else:
                return insert_dot(strg, mo)
    else:
        N = 0.
    return insert_num(strg, mo, N)
2168
2169
2170def replaceCdisc(strg, mo, node, parent, tree):
2171    if tree.class_var.var_type != Orange.feature.Type.Discrete:
2172        return insert_dot(strg, mo)
2173
2174    by, op, cls = mo.group("by", "op", "cls")
2175    N = node.distribution[cls]
2176    if op == "!=":
2177        N = node.distribution.abs - N
2178    if by:
2179        whom = by_whom(by, parent, tree)
2180        if whom and whom.distribution:
2181            if whom.distribution[cls] > 1e-30:
2182                N /= whom.distribution[cls]
2183        else:
2184            return insert_dot(strg, mo)
2185    return insert_num(strg, mo, N)
2186
2187
2188def replacecdisc(strg, mo, node, parent, tree):
2189    if tree.class_var.var_type != Orange.feature.Type.Discrete:
2190        return insert_dot(strg, mo)
2191
2192    op, by, cls = mo.group("op", "by", "cls")
2193    N = node.distribution[cls]
2194    if node.distribution.abs > 1e-30:
2195        N /= node.distribution.abs
2196        if op == "!=":
2197            N = 1 - N
2198    if by:
2199        whom = by_whom(by, parent, tree)
2200        if whom and whom.distribution:
2201            if whom.distribution[cls] > 1e-30:
2202                N /= whom.distribution[cls] / whom.distribution.abs
2203        else:
2204            return insert_dot(strg, mo)
2205    return insert_num(strg, mo, N)
2206
2207
2208__opdict = {"<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge, "=": operator.eq, "!=": operator.ne}
2209
2210def replaceCcont(strg, mo, node, parent, tree):
2211    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2212        return insert_dot(strg, mo)
2213
2214    by, op, num = mo.group("by", "op", "num")
2215    op = __opdict[op]
2216    num = float(num)
2217    N = sum([x[1] for x in node.distribution.items() if op(x[0], num)], 0.)
2218    if by:
2219        whom = by_whom(by, parent, tree)
2220        if whom and whom.distribution:
2221            byN = sum([x[1] for x in whom.distribution.items() if op(x[0], num)], 0.)
2222            if byN > 1e-30:
2223                N /= byN
2224        else:
2225            return insert_dot(strg, mo)
2226
2227    return insert_num(strg, mo, N)
2228
2229
2230def replaceccont(strg, mo, node, parent, tree):
2231    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2232        return insert_dot(strg, mo)
2233
2234    by, op, num = mo.group("by", "op", "num")
2235    op = __opdict[op]
2236    num = float(num)
2237    N = sum([x[1] for x in node.distribution.items() if op(x[0], num)], 0.)
2238    if node.distribution.abs > 1e-30:
2239        N /= node.distribution.abs
2240    if by:
2241        whom = by_whom(by, parent, tree)
2242        if whom and whom.distribution:
2243            byN = sum([x[1] for x in whom.distribution.items() if op(x[0], num)], 0.)
2244            if byN > 1e-30:
2245                N /= byN / whom.distribution.abs # abs > byN, so byN>1e-30 => abs>1e-30
2246        else:
2247            return insert_dot(strg, mo)
2248    return insert_num(strg, mo, N)
2249
2250
2251def extractInterval(mo, dist):
2252    out, lowin, lower, upper, upin = mo.group("out", "lowin", "lower", "upper", "upin")
2253    lower, upper = float(lower), float(upper)
2254    if out:
2255        lop = lowin == "(" and operator.le or operator.lt
        hop = upin == ")" and operator.ge or operator.gt
2257        return filter(lambda x:lop(x[0], lower) or hop(x[0], upper), dist.items())
2258    else:
2259        lop = lowin == "(" and operator.gt or operator.ge
2260        hop = upin == ")" and operator.lt or operator.le
2261        return filter(lambda x:lop(x[0], lower) and hop(x[0], upper), dist.items())
2262
2263
2264def replaceCconti(strg, mo, node, parent, tree):
2265    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2266        return insert_dot(strg, mo)
2267
2268    by = mo.group("by")
2269    N = sum([x[1] for x in extractInterval(mo, node.distribution)])
2270    if by:
2271        whom = by_whom(by, parent, tree)
2272        if whom and whom.distribution:
2273            byN = sum([x[1] for x in extractInterval(mo, whom.distribution)])
2274            if byN > 1e-30:
2275                N /= byN
2276        else:
2277            return insert_dot(strg, mo)
2278
2279    return insert_num(strg, mo, N)
2280
2281
2282def replacecconti(strg, mo, node, parent, tree):
2283    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2284        return insert_dot(strg, mo)
2285
2286    N = sum([x[1] for x in extractInterval(mo, node.distribution)])
2287    ab = node.distribution.abs
2288    if ab > 1e-30:
2289        N /= ab
2290
2291    by = mo.group("by")
2292    if by:
2293        whom = by_whom(by, parent, tree)
2294        if whom and whom.distribution:
2295            byN = sum([x[1] for x in extractInterval(mo, whom.distribution)])
2296            if byN > 1e-30:
2297                N /= byN / whom.distribution.abs
2298        else:
2299            return insert_dot(strg, mo)
2300
2301    return insert_num(strg, mo, N)
2302
2303
2304def replaceD(strg, mo, node, parent, tree):
2305    if tree.class_var.var_type != Orange.feature.Type.Discrete:
2306        return insert_dot(strg, mo)
2307
2308    fs, by, m100 = mo.group("fs", "by", "m100")
2309    dist = list(node.distribution)
2310    if by:
2311        whom = by_whom(by, parent, tree)
2312        if whom:
2313            for i, d in enumerate(whom.distribution):
2314                if d > 1e-30:
2315                    dist[i] /= d
2316        else:
2317            return insert_dot(strg, mo)
2318    mul = m100 and 100 or 1
2319    fs = fs or (m100 and ".0" or "5.3")
2320    return insert_str(strg, mo, "[" + ", ".join(["%%%sf" % fs % (N * mul) for N in dist]) + "]")
2321
2322
2323def replaced(strg, mo, node, parent, tree):
2324    if tree.class_var.var_type != Orange.feature.Type.Discrete:
2325        return insert_dot(strg, mo)
2326
2327    fs, by, m100 = mo.group("fs", "by", "m100")
2328    dist = list(node.distribution)
2329    ab = node.distribution.abs
2330    if ab > 1e-30:
2331        dist = [d / ab for d in dist]
2332    if by:
2333        whom = by_whom(by, parent, tree)
2334        if whom:
2335            for i, d in enumerate(whom.distribution):
2336                if d > 1e-30:
2337                    dist[i] /= d / whom.distribution.abs # abs > d => d>1e-30 => abs>1e-30
2338        else:
2339            return insert_dot(strg, mo)
2340    mul = m100 and 100 or 1
2341    fs = fs or (m100 and ".0" or "5.3")
2342    return insert_str(strg, mo, "[" + ", ".join(["%%%sf" % fs % (N * mul) for N in dist]) + "]")
2343
2344
2345def replaceAE(strg, mo, node, parent, tree):
2346    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2347        return insert_dot(strg, mo)
2348
2349    AorE, bysub, by = mo.group("AorE", "bysub", "by")
2350
2351    if AorE == "A":
2352        A = node.distribution.average()
2353    else:
2354        A = node.distribution.error()
2355    if by:
2356        whom = by_whom("b" + by, parent, tree)
2357        if whom:
2358            if AorE == "A":
2359                avg = whom.distribution.average()
2360            else:
2361                avg = whom.distribution.error()
2362            if bysub == "b":
2363                if avg > 1e-30:
2364                    A /= avg
2365            else:
2366                A -= avg
2367        else:
2368            return insert_dot(strg, mo)
2369    return insert_num(strg, mo, A)
2370
2371
2372Z = { 0.75:1.15, 0.80:1.28, 0.85:1.44, 0.90:1.64, 0.95:1.96, 0.99:2.58 }
2373
2374def replaceI(strg, mo, node, parent, tree):
2375    if tree.class_var.var_type != Orange.feature.Type.Continuous:
2376        return insert_dot(strg, mo)
2377
2378    fs = mo.group("fs") or "5.3"
2379    intrvl = float(mo.group("intp") or mo.group("intv") or "95") / 100.
2380    mul = mo.group("m100") and 100 or 1
2381
    if intrvl not in Z:
        raise SystemError, "Cannot compute %5.3f%% confidence intervals" % intrvl
2384
2385    av = node.distribution.average()
2386    il = node.distribution.error() * Z[intrvl]
2387    return insert_str(strg, mo, "[%%%sf-%%%sf]" % (fs, fs) % ((av - il) * mul, (av + il) * mul))
2388
2389
# This class is more a collection of functions, merged into a class so
# that they don't need to pass around too many arguments. It is
# constructed, used and discarded; it is not meant to store any information.
2393class _TreeDumper:
2394    defaultStringFormats = [(re_V, replaceV), (re_N, replaceN),
2395         (re_M, replaceM), (re_m, replacem),
2396         (re_Cdisc, replaceCdisc), (re_cdisc, replacecdisc),
2397         (re_Ccont, replaceCcont), (re_ccont, replaceccont),
2398         (re_Cconti, replaceCconti), (re_cconti, replacecconti),
2399         (re_D, replaceD), (re_d, replaced), (re_AE, replaceAE),
2400         (re_I, replaceI) ]
2401
2402    def node(self):
2403        return self.tree.tree if "tree" in self.tree.__dict__ else self.tree
2404
2405    def __init__(self, leafStr, nodeStr, stringFormats, minExamples,
2406        maxDepth, simpleFirst, tree, **kw):
2407        self.stringFormats = stringFormats
2408        self.minExamples = minExamples
2409        self.maxDepth = maxDepth
2410        self.simpleFirst = simpleFirst
2411        self.tree = tree
2412        self.__dict__.update(kw)
2413
2414        if leafStr:
2415            self.leafStr = leafStr
2416        else:
2417            if self.node().node_classifier.class_var.var_type == \
2418                    Orange.feature.Type.Discrete:
2419                self.leafStr = "%V (%^.2m%)"
2420            else:
2421                self.leafStr = "%V"
2422
2423        if nodeStr == ".":
2424            self.nodeStr = self.leafStr
2425        else:
2426            self.nodeStr = nodeStr
2427
2428
2429    def formatString(self, strg, node, parent):
2430        if hasattr(strg, "__call__"):
2431            return strg(node, parent, self.tree)
2432
2433        if not node:
2434            return "<null node>"
2435
2436        for rgx, replacer in self.stringFormats:
2437            if not node.distribution:
2438                strg = rgx.sub(".", strg)
2439            else:
2440                strt = 0
2441                while True:
2442                    mo = rgx.search(strg, strt)
2443                    if not mo:
2444                        break
2445                    strg = replacer(strg, mo, node, parent, self.tree)
2446                    strt = mo.start() + 1
2447
2448        return strg
2449
2450
2451    def showBranch(self, node, parent, lev, i):
2452        bdes = node.branch_descriptions[i]
2453        bdes = node.branch_selector.class_var.name + \
2454            (bdes[0] not in "<=>" and "=" or "") + bdes
2455        if node.branches[i]:
2456            nodedes = self.nodeStr and ": " + \
2457                self.formatString(self.nodeStr, node.branches[i], node) or ""
2458        else:
2459            nodedes = "<null node>"
2460        return "|    "*lev + bdes + nodedes
2461
2462
2463    def dumpTree0(self, node, parent, lev):
2464        if node.branches:
2465            if node.distribution.abs < self.minExamples or \
2466                lev > self.maxDepth:
2467                return "|    "*lev + ". . .\n"
2468
2469            res = ""
2470            if self.leafStr and self.nodeStr and self.leafStr != self.nodeStr:
2471                leafsep = "\n" + ("|    "*lev) + "    "
2472            else:
2473                leafsep = ""
2474            if self.simpleFirst:
2475                for i, branch in enumerate(node.branches):
2476                    if not branch or not branch.branches:
2477                        if self.leafStr == self.nodeStr:
2478                            res += "%s\n" % \
2479                                self.showBranch(node, parent, lev, i)
2480                        else:
2481                            res += "%s: %s\n" % \
2482                                (self.showBranch(node, parent, lev, i),
2483                                 leafsep +
2484                                 self.formatString(self.leafStr, branch, node))
2485            for i, branch in enumerate(node.branches):
2486                if branch and branch.branches:
2487                    res += "%s\n%s" % (self.showBranch(node, parent, lev, i),
2488                                       self.dumpTree0(branch, node, lev + 1))
2489                elif not self.simpleFirst:
2490                    if self.leafStr == self.nodeStr:
2491                        res += "%s\n" % self.showBranch(node, parent, lev, i)
2492                    else:
2493                        res += "%s: %s\n" % \
2494                            (self.showBranch(node, parent, lev, i),
2495                             leafsep +
2496                             self.formatString(self.leafStr, branch, node))
2497            return res
2498        else:
2499            return self.formatString(self.leafStr, node, parent)
2500
2501
2502    def dumpTree(self):
2503        node = self.node()
2504        if self.nodeStr:
2505            lev, res = 1, "root: %s\n" % \
2506                self.formatString(self.nodeStr, node, None)
2507            self.maxDepth += 1
2508        else:
2509            lev, res = 0, ""
2510        return res + self.dumpTree0(node, None, lev)
2511
2512
2513    def dotTree0(self, node, parent, internalName):
2514        if node.branches:
2515            if node.distribution.abs < self.minExamples or \
2516                len(internalName) - 1 > self.maxDepth:
2517                self.fle.write('%s [ shape="plaintext" label="..." ]\n' % \
2518                    _quoteName(internalName))
2519                return
2520
2521            label = node.branch_selector.class_var.name
2522            if self.nodeStr:
2523                label += "\\n" + self.formatString(self.nodeStr, node, parent)
2524            self.fle.write('%s [ shape=%s label="%s"]\n' % \
2525                (_quoteName(internalName), self.nodeShape, label))
2526
2527            for i, branch in enumerate(node.branches):
2528                if branch:
2529                    internalBranchName = "%s-%d" % (internalName, i)
2530                    self.fle.write('%s -> %s [ label="%s" ]\n' % \
2531                        (_quoteName(internalName),
2532                         _quoteName(internalBranchName),
2533                         node.branch_descriptions[i]))
2534                    self.dotTree0(branch, node, internalBranchName)
2535
2536        else:
2537            self.fle.write('%s [ shape=%s label="%s"]\n' % \
2538                (_quoteName(internalName), self.leafShape,
2539                self.formatString(self.leafStr, node, parent)))
2540
2541
2542    def dotTree(self, internalName="n"):
2543        self.fle.write("digraph G {\n")
2544        self.dotTree0(self.node(), None, internalName)
2545        self.fle.write("}\n")
2546
2547def _quoteName(x):
2548    return '"%s"' % (base64.b64encode(x))
2549
2550class TreeClassifier(Orange.classification.Classifier):
2551    """
2552
2553    Classifies instances according to the tree stored in :obj:`tree`.
2554
2555    **The classification process**
2556
2557    :obj:`TreeClassifier` uses the :obj:`descender` to descend the
2558    instance from the root. If the :obj:`descender` returns only a
    :obj:`Node` and no distribution, the descent stops, as the node
2560    was unambiguously selected. The node's :obj:`~Node.node_classifier`
2561    decides the class.
2562
    If the descender returns a :obj:`Node` and a distribution, as
    happens, for example, when the instance's value for the :obj:`Node`'s
2565    feature is unknown, the same process repeats for all subtrees and
2566    their predictions are combined.
2567
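    A short usage sketch (the ``voting`` data set is assumed)::

        import Orange
        data = Orange.data.Table("voting")
        tree = Orange.classification.tree.TreeLearner(data)
        value, dist = tree(data[0], Orange.classification.Classifier.GetBoth)
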
2568    **Attributes**
2569
2570    .. attribute:: tree
2571
2572        The root of the tree, as a :class:`Node`.
2573
2574    .. attribute:: descender
2575
2576        A :obj:`Descender` used to descend an instance from the root as
2577        deeply as possible according to the instance's feature values.
2578    """
2579
2580    def __init__(self, base_classifier=None):
2581        if not base_classifier: base_classifier = _TreeClassifier()
2582        self.nativeClassifier = base_classifier
2583        for k, v in self.nativeClassifier.__dict__.items():
2584            self.__dict__[k] = v
2585
2586    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue,
2587                 *args, **kwdargs):
2588        """Classify a new instance.
2589
2591        :param instance: instance to be classified.
2592        :type instance: :class:`Orange.data.Instance`
2593        :param result_type:
2594              :class:`Orange.classification.Classifier.GetValue` or \
2595              :class:`Orange.classification.Classifier.GetProbabilities` or
2596              :class:`Orange.classification.Classifier.GetBoth`
2597       
2598        :rtype: :class:`Orange.data.Value`,
2599              :class:`Orange.statistics.Distribution` or a tuple with both
2600        """
2601        return self.nativeClassifier(instance, result_type, *args, **kwdargs)
2602
2603    def __setattr__(self, name, value):
2604        if name == "nativeClassifier":
2605            self.__dict__[name] = value
2606            return
2607        if name in self.nativeClassifier.__dict__:
2608            self.nativeClassifier.__dict__[name] = value
2609        self.__dict__[name] = value
2610
2611    def __str__(self):
2612        return self.to_string()
2613
2614    @Orange.misc.deprecated_keywords({"fileName": "file_name", \
2615        "leafStr": "leaf_str", "nodeStr": "node_str", \
2616        "userFormats": "user_formats", "minExamples": "min_instances", \
2617        "min_examples": "min_instances", \
2618        "maxDepth": "max_depth", "simpleFirst": "simple_first"})
2619    def to_string(self, leaf_str="", node_str="", \
2620            user_formats=[], min_instances=0, max_depth=1e10, \
2621            simple_first=True):
2622        """
2623        Return a string representation of a tree.
2624
2625        :arg leaf_str: The format string for the tree leaves. If
2626          left empty, ``"%V (%^.2m%)"`` will be used for classification trees
2627          and ``"%V"`` for regression trees.
2628        :type leaf_str: string
2629        :arg node_str: The format string for the internal nodes.
2630          If left empty (as it is by default), no data is printed out for
2631          internal nodes. If set to ``"."``, the same string is
2632          used as for leaves.
2633        :type node_str: string
2634        :arg max_depth: If set, it limits the depth to which the tree is
2635          printed out.
2636        :type max_depth: integer
        :arg min_instances: If set, the subtrees with fewer than the given
          number of instances are not printed.
2639        :type min_instances: integer
2640        :arg simple_first: If True (default), the branches with a single
2641          node are printed before the branches with larger subtrees.
2642          If False, the branches are printed in order of
2643          appearance.
2644        :type simple_first: boolean
        :arg user_formats: A list of regular expressions and callback
          functions through which the user can print out additional
          information in the nodes.
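
        For instance (a usage sketch combining the format strings
        described in this module)::

            print tree.to_string(leaf_str="%V (%^.2m%)", node_str=".")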
2648        """
2649        return _TreeDumper(leaf_str, node_str, user_formats +
2650            _TreeDumper.defaultStringFormats, min_instances,
2651            max_depth, simple_first, self).dumpTree()
2652
2653    dump = to_string
2654
2655    @Orange.misc.deprecated_keywords({"fileName": "file_name", \
2656        "leafStr": "leaf_str", "nodeStr": "node_str", \
2657        "leafShape": "leaf_shape", "nodeShape": "node_shape", \
2658        "userFormats": "user_formats", \
2659        "minExamples": "min_instances", \
2660        "min_examples": "min_instances", \
2661        "maxDepth": "max_depth", "simpleFirst": "simple_first"})
2662    def dot(self, file_name, leaf_str="", node_str="", \
2663            leaf_shape="plaintext", node_shape="plaintext", \
2664            user_formats=[], min_instances=0, max_depth=1e10, \
2665            simple_first=True):
2666        """ Print the tree to a file in a format used by `GraphViz
2667        <http://www.research.att.com/sw/tools/graphviz>`_.  Uses the
2668        same parameters as :meth:`to_string` plus two which define the shape
2669        of internal nodes and leaves of the tree:
2670
2671        :param leaf_shape: Shape of the outline around leaves of the tree.
2672            If "plaintext", no outline is used (default: "plaintext").
2673        :type leaf_shape: string
2674        :param node_shape: Shape of the outline around internal nodes
2675            of the tree. If "plaintext", no outline is used (default: "plaintext")
2676        :type node_shape: string
2677
2678        Check `Polygon-based Nodes <http://www.graphviz.org/doc/info/shapes.html>`_
2679        for various outlines supported by GraphViz.
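
        A usage sketch (the output file name is illustrative)::

            tree.dot("tree.dot", leaf_shape="box", node_shape="ellipse")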
2680        """
2681        fle = type(file_name) == str and open(file_name, "wt") or file_name
2682
2683        _TreeDumper(leaf_str, node_str, user_formats +
2684            _TreeDumper.defaultStringFormats, min_instances,
2685            max_depth, simple_first, self,
2686            leafShape=leaf_shape, nodeShape=node_shape, fle=fle).dotTree()
2687
2688    def count_nodes(self):
2689        """
2690        Return the number of nodes.
2691        """
2692        return _countNodes(self.tree)
2693
2694    def count_leaves(self):
2695        """
2696        Return the number of leaves.
2697        """
2698        return _countLeaves(self.tree)
2699
2700    def to_network(self):
2701        net = Orange.network.DiGraph()
2702        if self.class_var.var_type == Orange.feature.Type.Discrete:
2703            domain = Orange.data.Domain([self.class_var] +
2704                [Orange.feature.Continuous(name) for name in
2705                 ["instances", "majority proportion"] + list(self.class_var.values)], None)
2706        else:
2707            domain = Orange.data.Domain([self.class_var] +
2708                [Orange.feature.Continuous(name) for name in
2709                 ["error", "instances"]], None)
2710        domain = Orange.data.Domain(domain)
2711        data = Orange.data.Table(domain)
2712        self.to_network0(self.tree, net, data)
2713        net.set_items(data)
2714        return net
2715
2716    def to_network0(self, node, net, table):
2717        node_id = len(table)
2718        net.add_node(node_id)
2719        d = node.distribution
2720        maj = node.node_classifier.default_value
2721        if self.class_var.var_type == Orange.feature.Type.Discrete:
2722            if d.abs > 1e-6:
2723                table.append([maj, d.abs, d[maj]] + [x / d.abs for x in d])
2724            else:
2725                table.append([maj] + [0] * (2 + len(d)))
2726        else:
2727            table.append([maj, d.error(), d.abs])
2728        if node.branches:
2729            for branch in node.branches:
2730                if branch:
2731                    child_id = self.to_network0(branch, net, table)
2732                    net.add_edge(node_id, child_id)
2733        return node_id
2734
2735def _countNodes(node):
2736    count = 0
2737    if node:
2738        count += 1
2739        if node.branches:
2740            for node in node.branches:
2741                count += _countNodes(node)
2742    return count
2743
2744def _countLeaves(node):
2745    count = 0
2746    if node:
2747        if node.branches: # internal node
2748            for node in node.branches:
2749                count += _countLeaves(node)
2750        else:
2751            count += 1
2752    return count
2753