source: orange/Orange/classification/tree.py @ 10467:e9e98c6d4d6c

Revision 10467:e9e98c6d4d6c, 100.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Unicode filename support for saving a tree to .dot format.

Line 
1"""
2
3.. index:: classification tree
4
5.. index::
6   single: classification; tree
7
8*******************************
9Classification trees (``tree``)
10*******************************
11
12Orange includes multiple implementations of classification tree learners:
13a very flexible :class:`TreeLearner`, a fast :class:`SimpleTreeLearner`,
14and a :class:`C45Learner`, which uses the C4.5 tree induction
15algorithm.
16
17The following code builds a :obj:`TreeClassifier` on the Iris data set
18(with the depth limited to three levels):
19
20.. literalinclude:: code/orngTree1.py
21   :lines: 1-4
22
23See `Decision tree learning
24<http://en.wikipedia.org/wiki/Decision_tree_learning>`_ on Wikipedia
25for introduction to classification trees.
26
27======================
28Learner and Classifier
29======================
30
31.. autoclass:: TreeLearner
32    :members:
33
34.. autoclass:: TreeClassifier
35    :members:
36
37.. class:: Node
38
39    Classification trees are a hierarchy of :obj:`Node` classes.
40
41    Node stores the instances belonging to the node, a branch selector,
42    a list of branches (if the node is not a leaf) with their descriptions
43    and strengths, and a classifier.
44
45    .. attribute:: distribution
46   
47        A distribution of learning instances.
48
49    .. attribute:: contingency
50
51        Complete contingency matrices for the learning instances.
52
53    .. attribute:: instances, weightID
54
55        Learning instances and the ID of weight meta attribute. The root
56        of the tree actually stores all instances, while other nodes
57        store only reference to instances in the root node.
58
59    .. attribute:: node_classifier
60
61        A classifier for instances coming to the node. If the node is a
62        leaf, it chooses the class (or class distribution) of an instance.
63
64    Internal nodes have additional attributes. The lists :obj:`branches`,
65    :obj:`branch_descriptions` and :obj:`branch_sizes` are of the
66    same length.
67
68    .. attribute:: branches
69
70        A list of subtrees. Each element is a :obj:`Node` or None.
71        If None, the node is empty.
72
73    .. attribute:: branch_descriptions
74
75        A list with strings describing branches. They are constructed
76        by :obj:`SplitConstructor`. A string can contain anything,
77        for example 'red' or '>12.3'.
78
79    .. attribute:: branch_sizes
80
81        A (weighted) number of training instances for
82        each branch. It can be used, for instance, for modeling
83        probabilities when classifying instances with unknown values.
84
85    .. attribute:: branch_selector
86
87        A :obj:`~Orange.classification.Classifier` that returns a branch
88        for each instance (as
89        :obj:`Orange.data.Value` in ``[0, len(branches)-1]``).  When an
90        instance cannot be classified unambiguously, the selector can
91        return a discrete distribution, which proposes how to divide
92        the instance between the branches. Whether the proposition will
93        be used depends upon the :obj:`Splitter` (for learning)
94        or :obj:`Descender` (for classification).
95
96    .. method:: tree_size()
97       
98        Return the number of nodes in the subtrees (including the node,
99        excluding null-nodes).
100
101========
102Examples
103========
104
105Tree Structure
106==============
107
108This example works with the lenses data set:
109
110    >>> import Orange
111    >>> lenses = Orange.data.Table("lenses")
112    >>> tree_classifier = Orange.classification.tree.TreeLearner(lenses)
113
114The following function counts the number of nodes in a tree:
115
116    >>> def tree_size(node):
117    ...    if not node:
118    ...        return 0
119    ...    size = 1
120    ...    if node.branch_selector:
121    ...        for branch in node.branches:
122    ...            size += tree_size(branch)
123    ...    return size
124
125If node is None, the function above returns 0. Otherwise, the size is 1
126(this node) plus the sizes of all subtrees. The algorithm needs to check
127if a node is internal (it has a :obj:`~Node.branch_selector`), as leaves
128don't have the :obj:`~Node.branches` attribute.
129
130    >>> tree_size(tree_classifier.tree)
131    15
132
133Note that a :obj:`Node` already has a built-in method
134:func:`~Node.tree_size`.
135
136Trees can be printed with a simple recursive function:
137
138    >>> def print_tree0(node, level):
139    ...     if not node:
140    ...         print " "*level + "<null node>"
141    ...         return
142    ...     if node.branch_selector:
143    ...         node_desc = node.branch_selector.class_var.name
144    ...         node_cont = node.distribution
145    ...         print "\\n" + "   "*level + "%s (%s)" % (node_desc, node_cont),
146    ...         for i in range(len(node.branches)):
147    ...             print "\\n" + "   "*level + ": %s" % node.branch_descriptions[i],
148    ...             print_tree0(node.branches[i], level+1)
149    ...     else:
150    ...         node_cont = node.distribution
151    ...         major_class = node.node_classifier.default_value
152    ...         print "--> %s (%s) " % (major_class, node_cont),
153
154The crux of the example is not in the formatting (\\n's etc.);
155what matters is everything but the print statements. The code
156separately handles three node types:
157
158* For null nodes (a node to which no learning instances were classified),
159  it just prints "<null node>".
160* For internal nodes, it prints a node description:
161  the feature's name and distribution of classes. :obj:`Node`'s
162  branch selector is an :obj:`~Orange.classification.Classifier`,
163  and its ``class_var`` is the feature whose name is printed.  Class
164  distributions are printed as well (they are assumed to be stored).
165  The :obj:`print_tree0` with a level increased by 1 to increase the
166  indent is recursively called for each branch.
167* If the node is a leaf, it prints the distribution of learning instances
168  in the node and the class to which the instances in the node would
169  be classified. We assume that the :obj:`~Node.node_classifier` is a
170  :obj:`DefaultClassifier`. A better print function should be aware of
171  possible alternatives.
172
173The wrapper function that accepts either a
174:obj:`TreeClassifier` or a :obj:`Node` can be written as follows:
175
176    >>> def print_tree(x):
177    ...     if isinstance(x, Orange.classification.tree.TreeClassifier):
178    ...         print_tree0(x.tree, 0)
179    ...     elif isinstance(x, Orange.classification.tree.Node):
180    ...         print_tree0(x, 0)
181    ...     else:
182    ...         raise TypeError, "invalid parameter"
183
184It's straightforward: if ``x`` is a
185:obj:`TreeClassifier`, it prints ``x.tree``; if it's :obj:`Node` it
186prints ``x``. If it's of some other type,
187an exception is raised. The output:
188
189    >>> print_tree(tree_classifier)
190    <BLANKLINE>
191    tear_rate (<15.000, 4.000, 5.000>)
192    : normal
193       astigmatic (<3.000, 4.000, 5.000>)
194       : no
195          age (<1.000, 0.000, 5.000>)
196          : pre-presbyopic --> soft (<0.000, 0.000, 2.000>) 
197          : presbyopic
198             prescription (<1.000, 0.000, 1.000>)
199             : hypermetrope --> soft (<0.000, 0.000, 1.000>) 
200             : myope --> none (<1.000, 0.000, 0.000>) 
201          : young --> soft (<0.000, 0.000, 2.000>) 
202       : yes
203          prescription (<2.000, 4.000, 0.000>)
204          : hypermetrope
205             age (<2.000, 1.000, 0.000>)
206             : pre-presbyopic --> none (<1.000, 0.000, 0.000>) 
207             : presbyopic --> none (<1.000, 0.000, 0.000>) 
208             : young --> hard (<0.000, 1.000, 0.000>) 
209          : myope --> hard (<0.000, 3.000, 0.000>) 
210    : reduced --> none (<12.000, 0.000, 0.000>)
211
212The tree structure examples conclude with a simple pruning function,
213written entirely in Python and unrelated to any :class:`Pruner`. It limits
214the tree depth (the number of internal nodes on any path down the tree).
215For example, to get a two-level tree, call cut_tree(root, 2). The function
216is recursive, with the second argument (level) decreasing at each call;
217when zero, the current node will be made a leaf:
218
219    >>> def cut_tree(node, level):
220    ...     if node and node.branch_selector:
221    ...         if level:
222    ...             for branch in node.branches:
223    ...                 cut_tree(branch, level-1)
224    ...         else:
225    ...             node.branch_selector = None
226    ...             node.branches = None
227    ...             node.branch_descriptions = None
228
229The function acts only when :obj:`node` and :obj:`node.branch_selector`
230are defined. If the level is not zero, it recursively calls the function
231for each branch. Otherwise, it clears the selector, branches and branch
232descriptions.
233
234    >>> cut_tree(tree_classifier.tree, 2)
235    >>> print_tree(tree_classifier)
236    <BLANKLINE>
237    tear_rate (<15.000, 4.000, 5.000>)
238    : normal
239       astigmatic (<3.000, 4.000, 5.000>)
240       : no --> soft (<1.000, 0.000, 5.000>) 
241       : yes --> hard (<2.000, 4.000, 0.000>) 
242    : reduced --> none (<12.000, 0.000, 0.000>)
243
244Setting learning parameters
245===========================
246
247Let us construct a :obj:`TreeLearner` to play with:
248
249    >>> import Orange
250    >>> lenses = Orange.data.Table("lenses")
251    >>> learner = Orange.classification.tree.TreeLearner()
252
253There are three crucial components in learning: the
254:obj:`~TreeLearner.split` and :obj:`~TreeLearner.stop` criteria, and the
255example :obj:`~TreeLearner.splitter`. The default ``stop`` is set with:
256
257    >>> learner.stop = Orange.classification.tree.StopCriteria_common()
258
259The default stopping parameters are:
260
261    >>> print learner.stop.max_majority, learner.stop.min_examples
262    1.0 0.0
263
264The defaults only stop splitting when no instances are left or all of
265them are in the same class.
266
267If the minimal subset that is allowed to be split further is set to five
268instances, the resulting tree is smaller.
269
270    >>> learner.stop.min_examples = 5.0
271    >>> tree = learner(lenses)
272    >>> print tree
273    tear_rate=reduced: none (100.00%)
274    tear_rate=normal
275    |    astigmatic=no
276    |    |    age=pre-presbyopic: soft (100.00%)
277    |    |    age=presbyopic: none (50.00%)
278    |    |    age=young: soft (100.00%)
279    |    astigmatic=yes
280    |    |    prescription=hypermetrope: none (66.67%)
281    |    |    prescription=myope: hard (100.00%)
282    <BLANKLINE>
283
284We can also limit the maximal proportion of majority class.
285
286    >>> learner.stop.max_majority = 0.5
287    >>> tree = learner(lenses)
288    >>> print tree
289    none (62.50%)
290
291Redefining tree induction components
292====================================
293
294This example shows how to use a custom stop function.  First, the
295``def_stop`` function defines the default stop function. The first tree
296has some added randomness; the induction also stops in 20% of the
297cases when ``def_stop`` returns False. The stopping criteria for the
298second tree is completely random: it stops induction in 20% of cases.
299Note that in the second case the lambda function still has three parameters,
300even though it does not need any, since so many are necessary
301for :obj:`~TreeLearner.stop`.
302
303.. literalinclude:: code/tree3.py
304   :lines: 8-23
305
306=================================
307Learner and Classifier Components
308=================================
309
310Split constructors
311=====================
312
313.. class:: SplitConstructor
314
315    Decide how to divide learning instances, i.e. define branching criteria.
316   
317    The :obj:`SplitConstructor` should use the domain
318    contingency when possible, both for speed and adaptability.
319    Sometimes domain contingency does
320    not suffice, for example if ReliefF score is used.
321
322    A :obj:`SplitConstructor` can veto further tree induction by returning
323    no classifier. This is generally related to the number of learning
324    instances that would go in each branch. If there are no splits with
325    more than :obj:`SplitConstructor.min_subset` instances in the branches
326    (null nodes are allowed), the induction is stopped.
327
328    Split constructors that cannot handle a particular feature
329    type (discrete, continuous) quietly skip them. When in doubt, use
330    :obj:`SplitConstructor_Combined`, which delegates features to
331    specialized split constructors.
332
333    The same split constructors can be used both for classification and
334    regression, if the chosen score (for :obj:`SplitConstructor_Score`
335    and derived classes) supports both.
336
337    .. attribute:: min_subset
338
339        The minimal (weighted) number of instances in non-null leaves.
340
341    .. method:: __call__(data, [ weightID, contingency, apriori_distribution, candidates, clsfr])
342
343        :param data: in any acceptable form.
344        :param weightID: Optional; the default of 0 means that all
345            instances have a weight of 1.0.
346        :param contingency: a domain contingency
347        :param apriori_distribution: apriori class probabilities.
348        :type apriori_distribution: :obj:`Orange.statistics.distribution.Distribution`
349        :param candidates: only consider these
350            features (one boolean for each feature).
351        :param clsfr: a node classifier (if it was constructed, that is,
352            if :obj:`store_node_classifier` is True)
353
354        Construct a split. Return a tuple (:obj:`branch_selector`,
355        :obj:`branch_descriptions` (a list), :obj:`subset_sizes`
356        (the number of instances for each branch, may also be
357        empty), :obj:`quality` (higher numbers mean better splits),
358        :obj:`spent_feature`). If no split is constructed,
359        the :obj:`selector`, :obj:`branch_descriptions` and
360        :obj:`subset_sizes` are None, while :obj:`quality` is 0.0 and
361        :obj:`spent_feature` is -1.
362
363        If the chosen feature will be useless in the future and
364        should not be considered for splitting in any of the subtrees
365        (typically, when discrete features are used as-they-are, without
366        any binarization or subsetting), then it should return the index
367        of this feature as :obj:`spent_feature`. If no features are spent,
368        :obj:`spent_feature` is -1.
369
370.. class:: SplitConstructor_Score
371
372    Bases: :class:`SplitConstructor`
373
374    An abstract base class that compares splits
375    with a :class:`Orange.feature.scoring.Score`.  All split
376    constructors except for :obj:`SplitConstructor_Combined` are derived
377    from this class.
378
379    .. attribute:: measure
380
381        A :class:`Orange.feature.scoring.Score` for split evaluation. It
382        has to handle the class type - for example, you cannot use
383        :class:`~Orange.feature.scoring.GainRatio` for regression or
384        :class:`~Orange.feature.scoring.MSE` for classification.
385
386    .. attribute:: worst_acceptable
387
388        The lowest allowed split quality.  The value strongly depends
389        on chosen :obj:`measure` component. Default is 0.0.
390
391.. class:: SplitConstructor_Feature
392
393    Bases: :class:`SplitConstructor_Score`
394
395    Each value of a discrete feature corresponds to a branch.  The feature
396    with the highest score (:obj:`~Measure.measure`) is selected. When
397    tied, a random feature is selected.
398
399    The constructed :obj:`branch_selector` is an instance of
400    :obj:`orange.ClassifierFromVarFD`, that returns a value of the selected
401    feature. :obj:`branch_description` contains the feature's
402    values. The feature is marked as spent (it cannot reappear in the
403    node's subtrees).
404
405.. class:: SplitConstructor_ExhaustiveBinary
406
407    Bases: :class:`SplitConstructor_Score`
408
409    Finds the binarization with the highest score among all features. In
410    case of ties, a random feature is selected.
411
412    The constructed :obj:`branch_selector` is an instance
413    :obj:`orange.ClassifierFromVarFD`, that returns a value of the
414    selected feature. Its :obj:`transformer` contains a ``MapIntValue``
415    that maps values of the feature into a binary feature. Branches
416    with a single feature value are described with that value and
417    branches with more than one are described with ``[<val1>, <val2>,
418    ..., <valn>]``. Only binary features are marked as spent.
419
420.. class:: SplitConstructor_Threshold
421
422    Bases: :class:`SplitConstructor_Score`
423
424    The only split constructor for continuous features. It divides the
425    range of feature values with a threshold that maximizes the split's
426    quality. In case of ties, a random feature is selected.  The feature
427    that yields the best binary split is returned.
428
429    The constructed :obj:`branch_selector` is an instance of
430    :obj:`orange.ClassifierFromVarFD` with an attached :obj:`transformer`,
431    of type :obj:`Orange.feature.discretization.ThresholdDiscretizer`. The
432    branch descriptions are "<threshold" and ">=threshold". The feature
433    is not spent.
434
435.. class:: SplitConstructor_OneAgainstOthers
436   
437    Bases: :class:`SplitConstructor_Score`
438
439    Undocumented.
440
441.. class:: SplitConstructor_Combined
442
443    Bases: :class:`SplitConstructor`
444
445    Uses different split constructors for discrete and continuous
446    features. Each split constructor is called with appropriate
447    features. Both construct a candidate for a split; the better of them
448    is used.
449
450    The choice of the split is not probabilistically fair, when
451    multiple candidates have the same score. For example, if there
452    are nine discrete features with the highest score the split
453    constructor for discrete features will select one of them. Now,
454    if there is also a single continuous feature with the same score,
455    :obj:`SplitConstructor_Combined` would randomly select between the
456    proposed discrete feature and continuous feature, unaware that the
457    discrete feature  has already competed with eight others.  So,
458    the probability for selecting (each) discrete feature would be
459    1/18 instead of 1/10. Although incorrect, this should not affect
460    the performance.
461
462    .. attribute: discrete_split_constructor
463
464        Split constructor for discrete features;
465        for instance, :obj:`SplitConstructor_Feature` or
466        :obj:`SplitConstructor_ExhaustiveBinary`.
467
468    .. attribute: continuous_split_constructor
469
470        Split constructor for continuous features; it
471        can be either :obj:`SplitConstructor_Threshold` or
472        a custom split constructor.
473
474
475StopCriteria and StopCriteria_common
476============================================
477
478:obj:`StopCriteria` determines when to stop the induction of subtrees.
479
480.. class:: StopCriteria
481
482    Provides the basic stopping criteria: the tree induction stops
483    when there is at most one instance left (the actual, not weighted,
484    number). The induction also stops when all instances are in the
485    same class (for discrete problems) or have the same outcome value
486    (for regression problems).
487
488    .. method:: __call__(instances[, weightID, domain contingencies])
489
490        Return True (stop) or False (continue the induction).
491        Contingencies are not used for counting. Derived classes should
492        use the contingencies whenever possible.
493
494.. class:: StopCriteria_common
495
496    Pre-pruning with additional criteria.
497
498    .. attribute:: max_majority
499
500        Maximum proportion of majority class. When exceeded,
501        induction stops.
502
503    .. attribute:: min_instances
504
505        Minimum number of instances for splitting. Subsets with less
506        than :obj:`min_instances` instances are not split further.
507        The sample count is weighted.
508
509
510Splitters
511=================
512
513Splitters sort learning instances into branches (the branches are selected
514with a :obj:`SplitConstructor`, while a :obj:`Descender` decides the
515branch for an instance during classification).
516
517Most splitters call :obj:`Node.branch_selector` and assign
518instances correspondingly. When the value is unknown they choose a
519particular branch or skip the instance.
520
521Some splitters can also split instances: a weighted instance is
522used in more than one subset. Each branch has a weight ID (usually,
523each its own ID) and all instances in that branch should have this meta attribute.
524
525An instance that
526hasn't been split has only one additional attribute (weight
527ID corresponding to the subset to which it went). Instance that is split
528between, say, three subsets, has three new meta attributes, one for each
529subset. The weights are used only when needed; when there is no
530splitting - no weight IDs are returned.
531
532.. class:: Splitter
533
534    An abstract base class that splits instances
535    into subsets.
536
537    .. method:: __call__(node, instances[, weightID])
538
539        :param node: a node.
540        :type node: :obj:`Node`
541        :param instances: a set of instances
542        :param weightID: weight ID.
543       
544        Use the information in :obj:`Node` (particularly the
545        :obj:`~Node.branch_selector`) to split the given set of instances into
546        subsets.  Return a tuple with a list of instance subsets and
547        a list of weights.  The list of weights is either a
548        list of integers or None when no weights are added.
549
550.. class:: Splitter_IgnoreUnknowns
551
552    Bases: :class:`Splitter`
553
554    Ignores the instances for which no single branch can be determined.
555
556.. class:: Splitter_UnknownsToCommon
557
558    Bases: :class:`Splitter`
559
560    Places all ambiguous instances to a branch with the highest number of
561    instances. If there is more than one such branch, one is selected at
562    random and then used for all instances.
563
564.. class:: Splitter_UnknownsToAll
565
566    Bases: :class:`Splitter`
567
568    Splits instances with an unknown value of the feature into all branches.
569
570.. class:: Splitter_UnknownsToRandom
571
572    Bases: :class:`Splitter`
573
574    Selects a random branch for ambiguous instances.
575
576.. class:: Splitter_UnknownsToBranch
577
578    Bases: :class:`Splitter`
579
580    Constructs an additional branch for ambiguous instances.
581    The branch's description is "unknown".
582
583.. class:: Splitter_UnknownsAsBranchSizes
584
585    Bases: :class:`Splitter`
586
587    Splits instances with unknown value of the feature according to
588    proportions of instances in each branch.
589
590.. class:: Splitter_UnknownsAsSelector
591
592    Bases: :class:`Splitter`
593
594    Splits instances with unknown value of the feature according to
595    distribution proposed by selector (usually the same as proportions
596    of instances in branches).
597
598Descenders
599=============================
600
601Descenders decide where should the instances that cannot be unambiguously
602put in a single branch go during classification (the branches are selected
603with a :obj:`SplitConstructor`, while a :obj:`Splitter` sorts instances
604during learning).
605
606.. class:: Descender
607
608    An abstract base tree descender. It descends
609    an instance as deep as possible, according to the values
610    of the instance's features. The :obj:`Descender` calls the node's
611    :obj:`~Node.branch_selector` to get the branch index. If it's a
612    simple index, the corresponding branch is followed. If not, the
613    descender decides what to do. A descender can choose a single
614    branch (for instance, the one that is the most recommended by the
615    :obj:`~Node.branch_selector`) or it can let the branches vote.
616
617    There are three possible outcomes of a descent:
618
619    #. The descender reaches a leaf. This happens when
620       there were no unknown or out-of-range values, or when the
621       descender selected a single branch and continued the descent
622       despite them. The descender returns the :obj:`Node` it has reached.
623    #. Node's :obj:`~Node.branch_selector` returned a distribution and
624       :obj:`Descender` decided to stop the descent at this (internal)
625       node. It returns the current :obj:`Node`.
626    #. Node's :obj:`~Node.branch_selector` returned a distribution and the
627       :obj:`Node` wants to split the instance (i.e., to decide the class
628       by voting). It returns a :obj:`Node` and the vote-weights for
629       the branches.  The weights can correspond, for example,  to the
630       distribution returned by node's :obj:`~Node.branch_selector`, or to
631       the number of learning instances that were assigned to each branch.
632
633    .. method:: __call__(node, instance)
634
635        Descends until it reaches a leaf or a node in
636        which a vote of subtrees is required. A tuple
637        of two elements is returned. If it reached a leaf, the tuple contains
638        the leaf node and None. If not, it contains a node and
639        a list of floats (weights of votes).
640
641.. class:: Descender_UnknownToNode
642
643    Bases: :obj:`Descender`
644
645    When instance cannot be classified into a single branch, the current
646    node is returned. Thus, the node's :obj:`~Node.node_classifier`
647    will be used to make a decision. Therefore, internal nodes
648    need to have :obj:`Node.node_classifier` defined.
649
650.. class:: Descender_UnknownToBranch
651
652    Bases: :obj:`Descender`
653
654    Classifies instances with unknown value to a special branch. This
655    makes sense only if the tree itself was constructed with
656    :obj:`Splitter_UnknownsToBranch`.
657
658.. class:: Descender_UnknownToCommonBranch
659
660    Bases: :obj:`Descender`
661
662    Classifies instances with unknown values to the branch with the
663    highest number of instances. If there is more than one such branch,
664    random branch is chosen for each instance.
665
666.. class:: Descender_UnknownToCommonSelector
667
668    Bases: :obj:`Descender`
669
670    Classifies instances with unknown values to the branch which received
671    the highest recommendation by the selector.
672
673.. class:: Descender_UnknownMergeAsBranchSizes
674
675    Bases: :obj:`Descender`
676
677    The subtrees vote for the instance's class; the vote is weighted
678    according to the sizes of the branches.
679
680.. class:: Descender_UnknownMergeAsSelector
681
682    Bases: :obj:`Descender`
683
684    The subtrees vote for the instance's class; the vote is weighted
685    according to the selectors proposal.
686
687Pruning
688=======
689
690.. index::
691    pair: classification trees; pruning
692
693The pruners construct a shallow copy of a tree. The pruned tree's
694:obj:`Node` contain references to the same contingency matrices,
695node classifiers, branch selectors, ...  as the original tree.
696
697Pruners cannot construct a new :obj:`~Node.node_classifier`.  Thus, for
698pruning, internal nodes must have :obj:`~Node.node_classifier` defined
699(the default).
700
701.. class:: Pruner
702
703    An abstract base tree pruner.
704
705    .. method:: __call__(tree)
706
707        :param tree: either
708            a :obj:`Node` or (the C++ version of the classifier,
709            saved in :obj:`TreeClassfier.base_classifier`).
710
711        The resulting pruned tree is of the same type as the argument.
712        The original tree remains intact.
713
714.. class:: Pruner_SameMajority
715
716    Bases: :class:`Pruner`
717
718    A tree can have a subtrees where all the leaves have
719    the same majority class. This is allowed because leaves can still
720    have different class distributions and thus predict different
721    probabilities.  The :obj:`Pruner_SameMajority` prunes the tree so
722    that there is no subtree in which all the nodes would have the same
723    majority class.
724
725    This pruner will only prune the nodes in which the node classifier
726    is a :obj:`~Orange.classification.ConstantClassifier`
727    (or a derived class).
728
729    The pruning works from leaves to the root.
730    If siblings have (at least one) common majority class, they can be pruned.
731
732.. class:: Pruner_m
733
734    Bases: :class:`Pruner`
735
736    Prunes a tree by comparing m-estimates of static and dynamic
737    error as defined in (Bratko, 2002).
738
739    .. attribute:: m
740
741        Parameter m for m-estimation.
742
743Printing the tree
744=================
745
746The tree printing functions are very flexible. They can print, for
747example, numbers of instances, proportions of majority class in nodes
748and similar, or more complex statistics like the proportion of instances
749in a particular class divided by the proportion of instances of this
750class in a parent node. Users may also pass their own functions to print
751certain elements.
752
753The easiest way to print the tree is to print :func:`TreeClassifier`::
754
755    >>> print tree
756    petal width<0.800: Iris-setosa (100.00%)
757    petal width>=0.800
758    |    petal width<1.750
759    |    |    petal length<5.350: Iris-versicolor (94.23%)
760    |    |    petal length>=5.350: Iris-virginica (100.00%)
761    |    petal width>=1.750
762    |    |    petal length<4.850: Iris-virginica (66.67%)
763    |    |    petal length>=4.850: Iris-virginica (100.00%)
764
765
766Format string
767-------------
768
769Format strings are printed at every leaf or internal node with the certain
770format specifiers replaced by data from the tree node. Specifiers are
771generally of form **%[^]<precision><quantity><divisor>**.
772
773**^** at the start tells that the number should be multiplied by 100,
774which is useful for proportions like percentages.
775
776**<precision>** is in the same format as in Python (or C) string
777formatting. For instance, ``%N`` denotes the number of instances in
778the node, hence ``%6.2N`` would mean output to two decimal digits
779and six places altogether. If left out, a default format ``5.3`` is
780used, unless the numbers are multiplied by 100, in which case the default
781is ``.0`` (no decimals, the number is rounded to the nearest integer).
782
783**<divisor>** tells what to divide the quantity in that node with.
784``bP`` means division by the same quantity in the parent node; for instance,
785``%NbP`` gives the number of instances in the node divided by the
786number of instances in parent node. Precision formatting can be added,
787e.g. ``%6.2NbP``. ``bA`` denotes division by the same quantity over the entire
788data set, so ``%NbA`` will tell you the proportion of instances (out
789of the entire training data set) that fell into that node. If division is
790impossible since the parent node does not exist or some data is missing,
791a dot is printed out instead.
792
793**<quantity>** defines what to print and is the only required element.
794It can be:
795
796``V``
797    The predicted value at that node. Precision
798    or divisor can not be defined here.
799
800``N``
801    The number of instances in the node.
802
803``M``
804    The number of instances in the majority class (that is, the class
805    predicted by the node).
806
807``m``
808    The proportion of instances in the majority class.
809
810``A``
    The average class for instances in the node; this is available only for
812    regression trees.
813
814``E``
815    Standard error for class of instances in the node; available only for
816    regression trees.
817
818``I``
    Print out the confidence interval. The modifier is used as
    ``%I(95)`` or (more complicated) ``%5.3I(95)bP``.
821
822``C``
    The number of instances in the given class.  For a classification
    example, ``%5.3C="Iris-virginica"bP`` denotes the number of instances
    of Iris-virginica divided by the number of instances of this class in
    the parent node (instances that are *not* Iris-virginica could be
    described with ``%5.3CbP!="Iris-virginica"``).
828
829    For regression trees, use operators =, !=, <, <=, >, and >=, as in
830    ``%C<22``, with optional precision and divisor. Intervals are also
831    possible: ``%C[20, 22]`` gives the number of instances between
832    20 and 22 (inclusive) and ``%C(20, 22)`` gives the number of such
833    instances excluding the boundaries. Mixing of parentheses is allowed,
834    e.g. ``%C(20, 22]``.  Add ``!`` for instances outside the interval,
835    like ``%C!(20, 22]``.
836
837``c``
838    Same as ``C``, except that it computes the proportion of the class
839    instead of the number of instances.
840
841``D``
842    The number of instances in each class. Precision and the divisor
843    are applied to each number in the distribution.  This quantity can
844    not be computed for regression trees.
845
846``d``
847    Same as ``D``, except that it shows proportions of instances.
848
849<user defined formats>
850    Instructions and examples of added formats are at the end of this
851    section.
852
853.. rubric:: Examples on classification trees
854
855A tree on the iris data set with the depth limited to three
856levels is built as follows:
857   
858.. literalinclude:: code/orngTree1.py
859   :lines: 1-4
860
861Printing the predicted class at each node, the number
862of instances in the majority class with the total number of instances in
863the node requires a custom format string::
864
865    >>> print tree.to_string(leaf_str="%V (%M out of %N)")
866    petal width<0.800: Iris-setosa (50.000 out of 50.000)
867    petal width>=0.800
868    |    petal width<1.750
869    |    |    petal length<5.350: Iris-versicolor (49.000 out of 52.000)
870    |    |    petal length>=5.350: Iris-virginica (2.000 out of 2.000)
871    |    petal width>=1.750
872    |    |    petal length<4.850: Iris-virginica (2.000 out of 3.000)
873    |    |    petal length>=4.850: Iris-virginica (43.000 out of 43.000)
874
875The number of instances as
876compared to the entire data set and to the parent node::
877
878    >>> print tree.to_string(leaf_str="%V (%^MbA%, %^MbP%)")
879    petal width<0.800: Iris-setosa (100%, 100%)
880    petal width>=0.800
881    |    petal width<1.750
882    |    |    petal length<5.350: Iris-versicolor (98%, 100%)
883    |    |    petal length>=5.350: Iris-virginica (4%, 40%)
884    |    petal width>=1.750
885    |    |    petal length<4.850: Iris-virginica (4%, 4%)
886    |    |    petal length>=4.850: Iris-virginica (86%, 96%)
887
``%M`` is the number of instances in the majority class. Dividing by
the number of all instances from this class on the entire data set
is described with ``%MbA``. Add ``^`` in front for multiplication by
100. The percent sign *after* that is printed out literally, just as the
comma and parentheses. For the proportion of this class in the parent node
the ``bA`` is replaced with ``bP``.
894
895To print the number of versicolors in each node, together with the
896proportion of versicolors among the instances in this particular node
897and among all versicolors, use the following::
898
899    '%C="Iris-versicolor" (%^c="Iris-versicolor"% of node, %^CbA="Iris-versicolor"% of versicolors)
900
901It gives::
902
903    petal width<0.800: 0.000 (0% of node, 0% of versicolors)
904    petal width>=0.800
905    |    petal width<1.750
906    |    |    petal length<5.350: 49.000 (94% of node, 98% of versicolors)
907    |    |    petal length>=5.350: 0.000 (0% of node, 0% of versicolors)
908    |    petal width>=1.750
909    |    |    petal length<4.850: 1.000 (33% of node, 2% of versicolors)
910    |    |    petal length>=4.850: 0.000 (0% of node, 0% of versicolors)
911
912Finally, to print the distributions using a format string ``%D``::
913
914    petal width<0.800: [50.000, 0.000, 0.000]
915    petal width>=0.800
916    |    petal width<1.750
917    |    |    petal length<5.350: [0.000, 49.000, 3.000]
918    |    |    petal length>=5.350: [0.000, 0.000, 2.000]
919    |    petal width>=1.750
920    |    |    petal length<4.850: [0.000, 1.000, 2.000]
921    |    |    petal length>=4.850: [0.000, 0.000, 43.000]
922
923As the order of classes is the same as in ``data.domain.class_var.values``
924(setosa, versicolor, virginica), there are 49 versicolors and 3 virginicae
925in the node at ``petal length<5.350``. To print the proportions within
926nodes rounded to two decimals use ``%.2d``::
927
928    petal width<0.800: [1.00, 0.00, 0.00]
929    petal width>=0.800
930    |    petal width<1.750
931    |    |    petal length<5.350: [0.00, 0.94, 0.06]
932    |    |    petal length>=5.350: [0.00, 0.00, 1.00]
933    |    petal width>=1.750
934    |    |    petal length<4.850: [0.00, 0.33, 0.67]
935    |    |    petal length>=4.850: [0.00, 0.00, 1.00]
936
937The most trivial format string for internal nodes is for printing
938node predictions. ``.`` in the following example specifies
939that the node_str should be the same as leaf_str.
940
941::
942
943    tree.to_string(leaf_str="%V", node_str=".")
944 
945The output::
946
947    root: Iris-setosa
948    |    petal width<0.800: Iris-setosa
949    |    petal width>=0.800: Iris-versicolor
950    |    |    petal width<1.750: Iris-versicolor
951    |    |    |    petal length<5.350: Iris-versicolor
952    |    |    |    petal length>=5.350: Iris-virginica
953    |    |    petal width>=1.750: Iris-virginica
954    |    |    |    petal length<4.850: Iris-virginica
955    |    |    |    petal length>=4.850: Iris-virginica
956
957A node *root* has appeared and the tree looks one level
958deeper. This is needed to also print the data for tree root.
959
960To observe how the number
961of virginicas decreases down the tree try::
962
963    print tree.to_string(leaf_str='%^.1CbA="Iris-virginica"% (%^.1CbP="Iris-virginica"%)', node_str='.')
964
965Interpretation: ``CbA="Iris-virginica"`` is
966the number of instances from virginica, divided by the total number
967of instances in this class. Add ``^.1`` and the result will be
968multiplied and printed with one decimal. The trailing ``%`` is printed
969out. In parentheses the same thing was divided by
970the instances in the parent node. The single quotes were used for strings, so
971that double quotes inside the string can specify the class.
972
973::
974
975    root: 100.0% (.%)
976    |    petal width<0.800: 0.0% (0.0%)
977    |    petal width>=0.800: 100.0% (100.0%)
978    |    |    petal width<1.750: 10.0% (10.0%)
979    |    |    |    petal length<5.350: 6.0% (60.0%)
980    |    |    |    petal length>=5.350: 4.0% (40.0%)
981    |    |    petal width>=1.750: 90.0% (90.0%)
982    |    |    |    petal length<4.850: 4.0% (4.4%)
983    |    |    |    petal length>=4.850: 86.0% (95.6%)
984
985If :meth:`~TreeClassifier.to_string` cannot compute something, in this case
986because the root has no parent, it prints out a dot.
987
988The final example with classification trees prints the distributions in
989nodes, the distribution compared to the parent, the proportions compared
990to the parent and the predicted class in the leaves::
991
992    >>> print tree.to_string(leaf_str='"%V   %D %.2DbP %.2dbP"', node_str='"%D %.2DbP %.2dbP"')
993    root: [50.000, 50.000, 50.000] . .
994    |    petal width<0.800: [50.000, 0.000, 0.000] [1.00, 0.00, 0.00] [3.00, 0.00, 0.00]:
995    |        Iris-setosa   [50.000, 0.000, 0.000] [1.00, 0.00, 0.00] [3.00, 0.00, 0.00]
996    |    petal width>=0.800: [0.000, 50.000, 50.000] [0.00, 1.00, 1.00] [0.00, 1.50, 1.50]
997    |    |    petal width<1.750: [0.000, 49.000, 5.000] [0.00, 0.98, 0.10] [0.00, 1.81, 0.19]
998    |    |    |    petal length<5.350: [0.000, 49.000, 3.000] [0.00, 1.00, 0.60] [0.00, 1.04, 0.62]:
999    |    |    |        Iris-versicolor   [0.000, 49.000, 3.000] [0.00, 1.00, 0.60] [0.00, 1.04, 0.62]
1000    |    |    |    petal length>=5.350: [0.000, 0.000, 2.000] [0.00, 0.00, 0.40] [0.00, 0.00, 10.80]:
1001    |    |    |        Iris-virginica   [0.000, 0.000, 2.000] [0.00, 0.00, 0.40] [0.00, 0.00, 10.80]
1002    |    |    petal width>=1.750: [0.000, 1.000, 45.000] [0.00, 0.02, 0.90] [0.00, 0.04, 1.96]
1003    |    |    |    petal length<4.850: [0.000, 1.000, 2.000] [0.00, 1.00, 0.04] [0.00, 15.33, 0.68]:
1004    |    |    |        Iris-virginica   [0.000, 1.000, 2.000] [0.00, 1.00, 0.04] [0.00, 15.33, 0.68]
1005    |    |    |    petal length>=4.850: [0.000, 0.000, 43.000] [0.00, 0.00, 0.96] [0.00, 0.00, 1.02]:
1006    |    |    |        Iris-virginica   [0.000, 0.000, 43.000] [0.00, 0.00, 0.96] [0.00, 0.00, 1.02]
1007
1008
1009.. rubric:: Examples on regression trees
1010
1011The regression trees examples use a tree induced from the housing data
1012set. Without other argumets, :meth:`TreeClassifier.to_string` prints the
1013following::
1014
1015    RM<6.941
1016    |    LSTAT<14.400
1017    |    |    DIS<1.385: 45.6
1018    |    |    DIS>=1.385: 22.9
1019    |    LSTAT>=14.400
1020    |    |    CRIM<6.992: 17.1
1021    |    |    CRIM>=6.992: 12.0
1022    RM>=6.941
1023    |    RM<7.437
1024    |    |    CRIM<7.393: 33.3
1025    |    |    CRIM>=7.393: 14.4
1026    |    RM>=7.437
1027    |    |    TAX<534.500: 45.9
1028    |    |    TAX>=534.500: 21.9
1029
1030To add the standard error in both internal nodes and leaves, and
1031the 90% confidence intervals in the leaves, use::
1032
1033    >>> print tree.to_string(leaf_str="[SE: %E]\t %V %I(90)", node_str="[SE: %E]")
1034    root: [SE: 0.409]
1035    |    RM<6.941: [SE: 0.306]
1036    |    |    LSTAT<14.400: [SE: 0.320]
1037    |    |    |    DIS<1.385: [SE: 4.420]:
1038    |    |    |        [SE: 4.420]   45.6 [38.331-52.829]
1039    |    |    |    DIS>=1.385: [SE: 0.244]:
1040    |    |    |        [SE: 0.244]   22.9 [22.504-23.306]
1041    |    |    LSTAT>=14.400: [SE: 0.333]
1042    |    |    |    CRIM<6.992: [SE: 0.338]:
1043    |    |    |        [SE: 0.338]   17.1 [16.584-17.691]
1044    |    |    |    CRIM>=6.992: [SE: 0.448]:
1045    |    |    |        [SE: 0.448]   12.0 [11.243-12.714]
1046    |    RM>=6.941: [SE: 1.031]
1047    |    |    RM<7.437: [SE: 0.958]
1048    |    |    |    CRIM<7.393: [SE: 0.692]:
1049    |    |    |        [SE: 0.692]   33.3 [32.214-34.484]
1050    |    |    |    CRIM>=7.393: [SE: 2.157]:
1051    |    |    |        [SE: 2.157]   14.4 [10.862-17.938]
1052    |    |    RM>=7.437: [SE: 1.124]
1053    |    |    |    TAX<534.500: [SE: 0.817]:
1054    |    |    |        [SE: 0.817]   45.9 [44.556-47.237]
1055    |    |    |    TAX>=534.500: [SE: 0.000]:
1056    |    |    |        [SE: 0.000]   21.9 [21.900-21.900]
1057
1058The predicted value (``%V``) and the average (``%A``) may differ because
1059a regression tree does not always predict the leaf average, but whatever
1060the :obj:`~Node.node_classifier` in a leaf returns.  As ``%V`` uses the
1061:obj:`Orange.feature.Continuous`' function for printing the
1062value, the number has the same number of decimals as in the data file.
1063
1064Regression trees cannot print the distributions in the same way
1065as classification trees. They instead offer a set of operators for
1066observing the number of instances within a certain range. For instance,
1067to print the number of instances with values below 22 and compare
1068it with values in the parent nodes use::
1069
1070    >>> print tree.to_string(leaf_str="%C<22 (%cbP<22)", node_str=".")
1071    root: 277.000 (.)
1072    |    RM<6.941: 273.000 (1.160)
1073    |    |    LSTAT<14.400: 107.000 (0.661)
1074    |    |    |    DIS<1.385: 0.000 (0.000)
1075    |    |    |    DIS>=1.385: 107.000 (1.020)
1076    |    |    LSTAT>=14.400: 166.000 (1.494)
1077    |    |    |    CRIM<6.992: 93.000 (0.971)
1078    |    |    |    CRIM>=6.992: 73.000 (1.040)
1079    |    RM>=6.941: 4.000 (0.096)
1080    |    |    RM<7.437: 3.000 (1.239)
1081    |    |    |    CRIM<7.393: 0.000 (0.000)
1082    |    |    |    CRIM>=7.393: 3.000 (15.333)
1083    |    |    RM>=7.437: 1.000 (0.633)
1084    |    |    |    TAX<534.500: 0.000 (0.000)
    |    |    |    TAX>=534.500: 1.000 (30.000)
1086
The last line, for instance, says that among the instances with tax above
534, the proportion of instances with the class below 22 is 30 times as
high as the proportion of such instances in its parent node.
1090
1091To count the same for all instances *outside*
1092interval [20, 22] and print out the proportions as percents use::
1093
1094    >>> print tree.to_string(leaf_str="%C![20,22] (%^cbP![20,22]%)", node_str=".")
1095
The format string ``%c![20, 22]`` denotes the proportion of instances
(within the node) whose values are below 20 or above 22. ``%cbP![20,
22]`` is the same statistic computed relative to the parent. A ``^`` is
added for percentages.
1100
1101::
1102
1103    root: 439.000 (.%)
1104    |    RM<6.941: 364.000 (98%)
1105    |    |    LSTAT<14.400: 200.000 (93%)
1106    |    |    |    DIS<1.385: 5.000 (127%)
1107    |    |    |    DIS>=1.385: 195.000 (99%)
1108    |    |    LSTAT>=14.400: 164.000 (111%)
1109    |    |    |    CRIM<6.992: 91.000 (96%)
1110    |    |    |    CRIM>=6.992: 73.000 (105%)
1111    |    RM>=6.941: 75.000 (114%)
1112    |    |    RM<7.437: 46.000 (101%)
1113    |    |    |    CRIM<7.393: 43.000 (100%)
1114    |    |    |    CRIM>=7.393: 3.000 (100%)
1115    |    |    RM>=7.437: 29.000 (98%)
1116    |    |    |    TAX<534.500: 29.000 (103%)
1117    |    |    |    TAX>=534.500: 0.000 (0%)
1118
1119
1120Defining custom printouts
1121-------------------------
1122
1123:meth:`TreeClassifier.to_string`'s argument :obj:`user_formats` can be used to
1124print other information.  :obj:`~TreeClassifier.format.user_formats` should
1125contain a list of tuples with a regular expression and a function to be
1126called when that expression is found in the format string. Expressions
1127from :obj:`user_formats` are checked before the built-in expressions
1128discussed above.
1129
1130The regular expression should describe a string like used above,
1131for instance ``%.2DbP``. When a leaf or internal node
1132is printed, the format string (:obj:`leaf_str` or :obj:`node_str`)
1133is checked for these regular expressions and when the match is found,
1134the corresponding callback function is called.
1135
1136The passed function will get five arguments: the format string
1137(:obj:`leaf_str` or :obj:`node_str`), the match object, the node which is
1138being printed, its parent (can be None) and the tree classifier.
1139The function should return the format string in which the part described
1140by the match object (that is, the part that is matched by the regular
1141expression) is replaced by whatever information your callback function
1142is supposed to give.
1143
The function can use several utility functions provided in the module.
1145
1146.. autofunction:: insert_str
1147
1148.. autofunction:: insert_dot
1149
1150.. autofunction:: insert_num
1151
1152.. autofunction:: by_whom
1153
1154The module also includes reusable regular expressions:
1155
1156.. autodata:: fs
1157
1158.. autodata:: by
1159
1160For a trivial example, ``%V`` is implemented with the
1161following tuple::
1162
1163    (re.compile("%V"), replaceV)
1164
1165And ``replaceV`` is defined by::
1166
1167    def replaceV(strg, mo, node, parent, tree):
1168        return insert_str(strg, mo, str(node.node_classifier.default_value))
1169
1170``replaceV`` takes the value predicted at the node
1171(``node.node_classifier.default_value`` ), converts it to a string
1172and passes it to :func:`insert_str`.
1173
1174A more complex regular expression is the one for the proportion of
1175majority class, defined as ``"%"+fs+"M"+by``. It uses the two partial
1176expressions defined above (:obj:`fs` and :obj:`by`).
1177
1178The following code prints the classification margin for each node,
1179that is, the difference between the proportion of the largest and the
1180second largest class in the node:
1181
1182.. literalinclude:: code/orngTree2.py
1183   :lines: 7-31
1184
1185``get_margin`` computes the margin from the distribution. The replacing
1186function, ``replaceB``, computes the margin for the node.  If :data:`by`
1187group is present, we call :func:`by_whom` to get the node with whose
1188margin this node's margin is to be divided. If this node (usually the
parent) does not exist or if its margin is zero, :func:`insert_dot`
1190inserts dot, otherwise :func:`insert_num` is called which inserts the
1191number in the user-specified format.  ``my_format`` contains the regular
1192expression and the callback function.
1193
1194Printing the tree with
1195
1196.. literalinclude:: code/orngTree2.py
1197    :lines: 33
1198
1199yields::
1200
1201    petal width<0.800: Iris-setosa 100% (100.00%)
1202    petal width>=0.800
1203    |    petal width<1.750
1204    |    |    petal length<5.350: Iris-versicolor 88% (108.57%)
1205    |    |    petal length>=5.350: Iris-virginica 100% (122.73%)
1206    |    petal width>=1.750
1207    |    |    petal length<4.850: Iris-virginica 33% (34.85%)
1208    |    |    petal length>=4.850: Iris-virginica 100% (104.55%)
1209
1210Plotting with Dot
1211---------------------------
1212
1213To produce images of trees, first create a .dot file with
1214:meth:`TreeClassifier.dot`. If it was saved to "tree5.dot", plot a gif
1215with the following command::
1216
1217    dot -Tgif tree5.dot -otree5.gif
1218
1219Check GraphViz's dot documentation for more options and
1220output formats.
1221
1222
1223===========================
1224C4.5 Classifier and Learner
1225===========================
1226
C4.5 is, as a standard benchmark in machine learning, incorporated in
Orange. The implementation uses the original C4.5 code, so the resulting
tree is exactly like the one that would be built by the standalone
C4.5. The built tree is made accessible in Python.
1231
1232:class:`C45Learner` and :class:`C45Classifier` behave
1233like any other Orange learner and classifier. Unlike most of Orange
learning algorithms, C4.5 does not accept weighted instances.
1235
1236Building the C4.5 plug-in
1237=========================
1238
1239C4.5 is not distributed with Orange, but it can be added as a
1240plug-in. A C compiler is needed for the procedure: on Windows MS Visual C
1241(CL.EXE and LINK.EXE must be on the PATH), on Linux and OS X gcc (OS X
1242users can download it from Apple).
1243
1244Orange must be installed prior to building C4.5.
1245
1246#. Download
1247   `C4.5 (Release 8) sources <http://www.rulequest.com/Personal/c4.5r8.tar.gz>`_
1248   from the `Rule Quest's site <http://www.rulequest.com/>`_ and extract
1249   them. The files will be modified in the
1250   further process.
1251#. Download
1252   `buildC45.zip <http://orange.biolab.si/orange/download/buildC45.zip>`_
1253   and unzip its contents into the directory R8/Src of the C4.5 sources
1254   (this directory contains, for instance, the file average.c).
1255#. Run buildC45.py, which will build the plug-in and put it next to
1256   orange.pyd (or orange.so on Linux/Mac).
1257#. Run python, type ``import Orange`` and
1258   create ``Orange.classification.tree.C45Learner()``. This should
1259   succeed.
1260#. Finally, you can remove C4.5 sources.
1261
1262The buildC45.py creates .h files that wrap Quinlan's .i files and
1263ensure that they are not included twice. It modifies C4.5 sources to
1264include .h's instead of .i's (this step can hardly fail). Then it compiles
1265ensemble.c into c45.dll or c45.so and puts it next to Orange. In the end
1266it checks if the built C4.5 gives the same results as the original.
1267
1268.. autoclass:: C45Learner
1269    :members:
1270
1271.. autoclass:: C45Classifier
1272    :members:
1273
1274.. class:: C45Node
1275
1276    This class is a reimplementation of the corresponding *struct* from
1277    Quinlan's C4.5 code.
1278
1279    .. attribute:: node_type
1280
1281        Type of the node:  :obj:`C45Node.Leaf` (0),
1282        :obj:`C45Node.Branch` (1), :obj:`C45Node.Cut` (2),
1283        :obj:`C45Node.Subset` (3). "Leaves" are leaves, "branches"
1284        split instances based on values of a discrete attribute,
1285        "cuts" cut them according to a threshold value of a continuous
1286        attributes and "subsets" use discrete attributes but with subsetting
1287        so that several values can go into the same branch.
1288
1289    .. attribute:: leaf
1290
1291        Value returned by that leaf. The field is defined for internal
1292        nodes as well.
1293
1294    .. attribute:: items
1295
1296        Number of (learning) instances in the node.
1297
1298    .. attribute:: class_dist
1299
1300        Class distribution for the node (of type
1301        :obj:`Orange.statistics.distribution.Discrete`).
1302
1303    .. attribute:: tested
1304       
        The attribute used in the node's test. If node is a leaf,
        :obj:`tested` is None, if node is of type :obj:`Branch` or
        :obj:`Subset` then :obj:`tested` is a discrete attribute, and if
        node is of type :obj:`Cut` then :obj:`tested` is a continuous
        attribute.
1309
1310    .. attribute:: cut
1311
1312        A threshold for continuous attributes, if node is of type :obj:`Cut`.
1313        Undefined otherwise.
1314
1315    .. attribute:: mapping
1316
1317        Mapping for nodes of type :obj:`Subset`. Element ``mapping[i]``
1318        gives the index for an instance whose value of :obj:`tested` is *i*.
1319        Here, *i* denotes an index of value, not a :class:`Orange.data.Value`.
1320
1321    .. attribute:: branch
1322       
1323        A list of branches stemming from this node.
1324
1325Examples
1326========
1327
1328This
1329script constructs the same learner as you would get by calling
1330the usual C4.5:
1331
1332.. literalinclude:: code/tree_c45.py
1333   :lines: 7-14
1334
1335Both C4.5 command-line symbols and variable names can be used. The
1336following lines produce the same result::
1337
1338    tree = Orange.classification.tree.C45Learner(data, m=100)
1339    tree = Orange.classification.tree.C45Learner(data, min_objs=100)
1340
A veteran C4.5 user might prefer :func:`C45Learner.commandline`::
1342
1343    lrn = Orange.classification.tree.C45Learner()
1344    lrn.commandline("-m 1 -s")
1345    tree = lrn(data)
1346
The following script prints out the tree in the same format as C4.5 does.
1348
1349.. literalinclude:: code/tree_c45_printtree.py
1350
For the leaves just the value in ``node.leaf`` is printed. Since
1352:obj:`C45Node` does not know to which attribute it belongs, we need to
1353convert it to a string through ``classvar``, which is passed as an extra
1354argument to the recursive part of print_tree.
1355
1356For discrete splits without subsetting, we print out all attribute values
1357and recursively call the function for all branches. Continuous splits
1358are equally easy to handle.
1359
For discrete splits with subsetting, we iterate through the branches,
retrieve the corresponding values that go into each branch,
turn the values into strings and print them out, separately treating
the case when only a single value goes into the branch.
1364
1365=================
1366SimpleTreeLearner
1367=================
1368
1369.. include:: /SimpleTreeLearner.txt
1370       
1371Examples
1372========
1373
1374:obj:`SimpleTreeLearner` is used in much the same way as :obj:`TreeLearner`.
1375A typical example of using :obj:`SimpleTreeLearner` would be to build a random
1376forest:
1377
1378.. literalinclude:: code/simple_tree_random_forest.py
1379
1380
1381References
1382==========
1383
1384Bratko, I. (2002). `Prolog Programming for Artificial Intelligence`, Addison
1385Wesley, 2002.
1386
1387E Koutsofios, SC North. Drawing Graphs with dot. AT&T Bell Laboratories,
1388Murray Hill NJ, U.S.A., October 1993.
1389
1390`Graphviz - open source graph drawing software <http://www.research.att.com/sw/tools/graphviz/>`_
1391A home page of AT&T's dot and similar software packages.
1392
1393"""
1394
1395"""
1396TODO C++ aliases
1397
1398SplitConstructor.discrete/continuous_split_constructor -> SplitConstructor.discrete
1399Node.examples -> Node.instances
1400"""
1401
1402from Orange.core import \
1403     TreeLearner as _TreeLearner, \
1404         TreeClassifier as _TreeClassifier, \
1405         SimpleTreeLearner, \
1406         SimpleTreeClassifier, \
1407         C45Learner as _C45Learner, \
1408         C45Classifier as _C45Classifier, \
1409         C45TreeNode as C45Node, \
1410         C45TreeNodeList as C45NodeList, \
1411         TreeDescender as Descender, \
1412              TreeDescender_UnknownMergeAsBranchSizes as Descender_UnknownMergeAsBranchSizes, \
1413              TreeDescender_UnknownMergeAsSelector as Descender_UnknownMergeAsSelector, \
1414              TreeDescender_UnknownToBranch as Descender_UnknownToBranch, \
1415              TreeDescender_UnknownToCommonBranch as Descender_UnknownToCommonBranch, \
1416              TreeDescender_UnknownToCommonSelector as Descender_UnknownToCommonSelector, \
1417         TreeExampleSplitter as Splitter, \
1418              TreeExampleSplitter_IgnoreUnknowns as Splitter_IgnoreUnknowns, \
1419              TreeExampleSplitter_UnknownsAsBranchSizes as Splitter_UnknownsAsBranchSizes, \
1420              TreeExampleSplitter_UnknownsAsSelector as Splitter_UnknownsAsSelector, \
1421              TreeExampleSplitter_UnknownsToAll as Splitter_UnknownsToAll, \
1422              TreeExampleSplitter_UnknownsToBranch as Splitter_UnknownsToBranch, \
1423              TreeExampleSplitter_UnknownsToCommon as Splitter_UnknownsToCommon, \
1424              TreeExampleSplitter_UnknownsToRandom as Splitter_UnknownsToRandom, \
1425         TreeNode as Node, \
1426         TreeNodeList as NodeList, \
1427         TreePruner as Pruner, \
1428              TreePruner_SameMajority as Pruner_SameMajority, \
1429              TreePruner_m as Pruner_m, \
1430         TreeSplitConstructor as SplitConstructor, \
1431              TreeSplitConstructor_Combined as SplitConstructor_Combined, \
1432              TreeSplitConstructor_Measure as SplitConstructor_Score, \
1433                   TreeSplitConstructor_Attribute as SplitConstructor_Feature, \
1434                   TreeSplitConstructor_ExhaustiveBinary as SplitConstructor_ExhaustiveBinary, \
1435                   TreeSplitConstructor_OneAgainstOthers as SplitConstructor_OneAgainstOthers, \
1436                   TreeSplitConstructor_Threshold as SplitConstructor_Threshold, \
1437         TreeStopCriteria as StopCriteria, \
1438              TreeStopCriteria_Python as StopCriteria_Python, \
1439              TreeStopCriteria_common as StopCriteria_common
1440
1441import Orange.core
1442import operator
1443import base64
1444import re
1445import Orange.data
1446import Orange.feature.scoring
1447import warnings
1448
class C45Learner(Orange.classification.Learner):
    """
    :class:`C45Learner`'s attributes have double names - those that
    you know from C4.5 command lines and the corresponding names of C4.5's
    internal variables. All defaults are set as in C4.5; if you change
    nothing, you are running C4.5.

    Constructs a :obj:`C45Classifier` when given data.

    .. attribute:: gain_ratio (g)
       
        Determines whether to use information gain (false, default)
        or gain ratio for selection of attributes (true).

    .. attribute:: batch (b)

        Turn on batch mode (no windows, no iterations); this option is
        not documented in C4.5 manuals. It conflicts with "window",
        "increment" and "trials".

    .. attribute:: subset (s)
       
        Enables subsetting (default: false, no subsetting),
 
    .. attribute:: prob_thresh (p)

        Probabilistic threshold for continuous attributes (default: false).

    .. attribute:: min_objs (m)
       
        Minimal number of objects (instances) in leaves (default: 2).

    .. attribute:: window (w)

        Initial window size (default: maximum of 20% and twice the
        square root of the number of data objects).

    .. attribute:: increment (i)

        The maximum number of objects that can be added to the window
        at each iteration (default: 20% of the initial window size).

    .. attribute:: cf (c)

        Pruning confidence level (default: 25%).

    .. attribute:: trials (t)

        Set the number of trials in iterative (i.e. non-batch) mode (default: 10).

    .. attribute:: prune
       
        Return pruned tree (not an original C4.5 option) (default: true)
    """

    # Maps new-style (snake_case) attribute names to the names of the
    # corresponding attributes of the wrapped _C45Learner.
    # NOTE(fix): the pair for the probabilistic threshold was originally
    # written in the wrong direction ("probTresh": "prob_tresh"), which
    # broke the rename in both directions.
    # NOTE(review): the docstring above spells this attribute
    # "prob_thresh"; the internal name used here is "probTresh" - confirm
    # against the C++ attribute name.
    _rename_new_old = {"min_objs": "minObjs", "prob_tresh": "probTresh",
            "gain_ratio": "gainRatio"}
    # Inverse mapping: old (camelCase) name -> new (snake_case) name,
    # used only to issue deprecation warnings for old spellings.
    _rename_old_new = dict((a, b) for b, a in _rename_new_old.items())

    @classmethod
    def _rename_dict(cls, dic):
        # Translate every keyword-argument name to the internal spelling.
        return dict((cls._rename_arg(a), b) for a, b in dic.items())

    @classmethod
    def _rename_arg(cls, a):
        # Warn when a deprecated (old camelCase) name is used; map
        # new-style names to the internal spelling, pass others through.
        if a in cls._rename_old_new:
            Orange.misc.deprecation_warning(a, cls._rename_old_new[a], stacklevel=4)
        return cls._rename_new_old.get(a, a)

    def __new__(cls, instances=None, weightID=0, **argkw):
        # Allow the learner to be called directly with data, in which case
        # it trains immediately and returns a C45Classifier.
        self = Orange.classification.Learner.__new__(cls, **cls._rename_dict(argkw))
        if instances:
            self.__init__(**cls._rename_dict(argkw))
            return self.__call__(instances, weightID)
        else:
            return self

    def __init__(self, **kwargs):
        # The wrapped C4.5 learner holds all algorithm parameters.
        self.base = _C45Learner(**self._rename_dict(kwargs))

    def __setattr__(self, name, value):
        # Attributes known to the wrapped learner are stored there;
        # everything else (including "base" itself) lands on the wrapper.
        nn = self._rename_arg(name)
        if name != "base" and nn in self.base.__dict__:
            self.base.__dict__[nn] = value
        else:
            self.__dict__[nn] = value

    def __getattr__(self, name):
        # Only invoked when normal lookup fails; delegate to the wrapped
        # learner. NOTE(fix): the guard used to compare against " base"
        # (stray leading space), so a lookup for a missing "base"
        # recursed infinitely instead of failing cleanly.
        nn = self._rename_arg(name)
        if name != "base" and nn in self.base.__dict__:
            return self.base.__dict__[nn]
        try:
            return self.__dict__[nn]
        except KeyError:
            # Attribute access is expected to raise AttributeError (so
            # that e.g. hasattr() works), not KeyError.
            raise AttributeError(name)

    def __call__(self, *args, **kwargs):
        """Train on the given data and return a :obj:`C45Classifier`."""
        return C45Classifier(self.base(*args, **self._rename_dict(kwargs)))

    def commandline(self, ln):
        """
        Set the arguments with a C4.5 command line.
        """
        self.base.commandline(ln)
1553
class C45Classifier(Orange.classification.Classifier):
    """
    A faithful reimplementation of Quinlan's C4.5, but
    uses a tree composed of :class:`C45Node` instead of C4.5's original
    tree structure.

    .. attribute:: tree

        C4.5 tree stored as :obj:`C45Node`.
    """

    def __init__(self, base_classifier):
        # Keep a reference to the wrapped (native) classifier and mirror
        # every one of its attributes (e.g. tree, class_var) onto this
        # wrapper, so they can be read directly off the wrapper.
        self.nativeClassifier = base_classifier
        for k, v in self.nativeClassifier.__dict__.items():
            self.__dict__[k] = v

    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue,
                 *args, **kwdargs):
        """Classify a new instance.

        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type:
              :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`

        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        # Classification is delegated entirely to the native classifier.
        return self.nativeClassifier(instance, result_type, *args, **kwdargs)

    def __setattr__(self, name, value):
        # Write-through: when the native classifier owns the attribute,
        # update it there as well, keeping the mirrored copy in this
        # wrapper's __dict__ in sync with the wrapped object.
        if name == "nativeClassifier":
            self.__dict__[name] = value
            return
        if name in self.nativeClassifier.__dict__:
            self.nativeClassifier.__dict__[name] = value
        self.__dict__[name] = value

    def __str__(self):
        # str(classifier) renders the tree in C4.5's textual format.
        return self.dump()


    def dump(self):
        """
        Print the tree in the same form as Ross Quinlan's
        C4.5 program.

        ::

            import Orange

            data = Orange.data.Table("voting")
            c45 = Orange.classification.tree.C45Learner(data)
            print c45

        prints

        ::

            physician-fee-freeze = n: democrat (253.4)
            physician-fee-freeze = y:
            |   synfuels-corporation-cutback = n: republican (145.7)
            |   synfuels-corporation-cutback = y:
            |   |   mx-missile = y: democrat (6.0)
            |   |   mx-missile = n:
            |   |   |   adoption-of-the-budget-resolution = n: republican (22.6)
            |   |   |   adoption-of-the-budget-resolution = y:
            |   |   |   |   anti-satellite-test-ban = n: democrat (5.0)
            |   |   |   |   anti-satellite-test-ban = y: republican (2.2)


        The standalone C4.5 would print::

            physician-fee-freeze = n: democrat (253.4/5.9)
            physician-fee-freeze = y:
            |   synfuels-corporation-cutback = n: republican (145.7/6.2)
            |   synfuels-corporation-cutback = y:
            |   |   mx-missile = y: democrat (6.0/2.4)
            |   |   mx-missile = n:
            |   |   |   adoption-of-the-budget-resolution = n: republican (22.6/5.2)
            |   |   |   adoption-of-the-budget-resolution = y:
            |   |   |   |   anti-satellite-test-ban = n: democrat (5.0/1.2)
            |   |   |   |   anti-satellite-test-ban = y: republican (2.2/1.0)

        C4.5 also prints out the number of errors on learning data in
        each node.
        """
        return  _c45_printTree0(self.tree, self.class_var, 0)

    # Backwards-compatible alias for dump().
    to_string = dump
1646
def _c45_showBranch(node, classvar, lev, i):
    """Render branch *i* of *node* — one "|   "-indented header line plus
    the recursively formatted subtree — in C4.5's output style."""
    var = node.tested
    indent = "\n" + "|   " * lev
    if node.node_type == 1:
        # branch on a single discrete value
        head = "%s = %s:" % (var.name, var.values[i])
    elif node.node_type == 2:
        # threshold branch on a continuous attribute
        head = "%s %s %.1f:" % (var.name, ["<=", ">"][i], node.cut)
    else:
        # subset branch: gather all values mapped to this branch
        inset = [var.values[j] for j, b in enumerate(node.mapping) if b == i]
        if len(inset) == 1:
            head = "%s = %s:" % (var.name, inset[0])
        else:
            head = "%s in {%s}:" % (var.name, ", ".join(inset))
    return indent + head + _c45_printTree0(node.branch[i], classvar, lev + 1)
1665
1666
1667def _c45_printTree0(node, classvar, lev):
1668    var = node.tested
1669    str_ = ""
1670    if node.node_type == 0:
1671        str_ += "%s (%.1f)" % (classvar.values[int(node.leaf)], node.items)
1672    else:
1673        for i, branch in enumerate(node.branch):
1674            if not branch.node_type:
1675                str_ += _c45_showBranch(node, classvar, lev, i)
1676        for i, branch in enumerate(node.branch):
1677            if branch.node_type:
1678                str_ += _c45_showBranch(node, classvar, lev, i)
1679    return str_
1680
def _printTreeC45(tree):
    """Print *tree* (a :class:`C45Classifier`) to stdout in C4.5 format."""
    # Parenthesized print behaves identically under Python 2 (it is an
    # ordinary parenthesized expression) and is valid Python 3 syntax.
    print(_c45_printTree0(tree.tree, tree.class_var, 0))
1683
1684
1685import Orange.feature.scoring as fscoring
1686
class TreeLearner(Orange.core.Learner):
    """
    A classification or regression tree learner. If a set of instances
    is given on initialization, a :class:`TreeClassifier` is built and
    returned instead.

    The learning algorithm has a large number of parameters. The class
    provides reasonable defaults; they can be modified either as attributes
    or as arguments given to the constructor.

    The algorithm is very flexible, yet slower than the other two
    implementations that are more suitable for large scale
    experiments.

    **The tree induction process**

    #. The learning instances are copied, unless
       :obj:`store_instances` is `False` and the instance
       already are stored in a :obj:`~Orange.data.Table`.
    #. Apriori class probabilities are computed. A list of
       candidate features for the split is compiled; in the beginning,
       all features are candidates.
    #. The recursive part. The contingency matrix is computed by
       :obj:`contingency_computer`. Contingencies are used by :obj:`split`,
       :obj:`stop` and :obj:`splitter`.
    #. If the induction should :obj:`stop`, a :obj:`~Node.node_classifier`
       is built by calling :obj:`node_learner` with the given instances,
       weight ID and the contingency matrix. As the learner uses
       contingencies whenever possible, the :obj:`contingency_computer`
       will affect the :obj:`~Node.node_classifier`. The node is returned.
    #. If the induction continues, a :obj:`split` is called.
       If :obj:`split` fails to return a branch selector, induction stops
       and the :obj:`Node` is returned.
    #. The feature spent (if any) is removed from the candidate list.
    #. Instances are divided into child nodes with :obj:`splitter`.
       The process recursively continues with step 3 for
       each of the non-empty subsets. If the splitter returned weights,
       they are used for each branch.

    **Attributes**

    .. attribute:: node_learner

        Induces a classifier from instances in a node. It is used both
        for internal nodes and leaves. The default is
        :obj:`Orange.classification.majority.MajorityLearner`.

    .. attribute:: descender

        The descender that the induced :obj:`TreeClassifier` will
        use. The default is :obj:`Descender_UnknownMergeAsSelector`.
        It votes with the :obj:`branch_selector`'s distribution for
        vote weights.

    .. attribute:: contingency_computer

        Defines the computation of contingency matrices (used by
        :obj:`split`, :obj:`stop`, :obj:`splitter`). It can be used,
        for example, to change the treatment of unknown values. By
        default ordinary contingency matrices are computed.

    **Split construction**

    .. attribute:: split

        A :obj:`SplitConstructor` or a function with the same signature as
        :obj:`SplitConstructor.__call__`. It is useful for prototyping
        new tree induction algorithms. If :obj:`split` is defined, other
        arguments that affect split construction are ignored. These include
        :obj:`binarization`, :obj:`measure`, :obj:`worst_acceptable` and
        :obj:`min_subset`. Default: :class:`SplitConstructor_Combined`
        with separate constructors for discrete and continuous
        features. Discrete features are used as they are, while
        continuous are binarized. Features are scored with gain ratio.
        At least two instances in a leaf are required for
        discrete and five for continuous features.

    .. attribute:: binarization

        If 1, :class:`SplitConstructor_ExhaustiveBinary` is used.
        If 2, use :class:`SplitConstructor_OneAgainstOthers`. If
        0, do not use binarization (use :class:`SplitConstructor_Feature`).
        Default: 0.

    .. attribute:: measure

        A score to evaluate features for splitting instances in a
        node.  A subclass of :class:`Orange.feature.scoring.Score`
        (perhaps :class:`~Orange.feature.scoring.InfoGain`,
        :class:`~Orange.feature.scoring.GainRatio`,
        :class:`~Orange.feature.scoring.Gini`,
        :class:`~Orange.feature.scoring.Relief`, or
        :class:`~Orange.feature.scoring.MSE`). Default:
        :class:`Orange.feature.scoring.GainRatio`.

    .. attribute:: relief_m, relief_k

        Set `m` and `k` for :class:`~Orange.feature.scoring.Relief`,
        if chosen.

    .. attribute:: splitter

        :class:`Splitter` or a function with the same
        signature as :obj:`Splitter.__call__`. The default is
        :class:`Splitter_UnknownsAsSelector` that splits the
        learning instances according to distributions given by the
        selector.

    **Pruning**

    .. attribute:: worst_acceptable

        The lowest required feature score. If the score of the best
        feature is below this margin, the tree is not grown further
        (default: 0).

    .. attribute:: min_subset

        The lowest required number of instances in non-null leaves (default: 0).

    .. attribute:: min_instances

        Data subsets with less than :obj:`min_instances`
        instances are not split any further. Therefore, all leaves in the tree
        will contain at least that many instances (default: 0).

    .. attribute:: max_depth

        Maximal tree depth. If 0, only root is generated.
        The default is 100.

    .. attribute:: max_majority

        Induction stops when the proportion of majority class in the
        node exceeds the value set by this parameter (default: 1.0).

    .. attribute:: stop

        :class:`StopCriteria` or a function with the same signature as
        :obj:`StopCriteria.__call__`. Useful for prototyping new tree
        induction algorithms.  When used, parameters  :obj:`max_majority`
        and :obj:`min_instances` will not be  considered.  The default
        stopping criterion stops induction when all instances in a node
        belong to the same class.

    .. attribute:: m_pruning

        If non-zero, invokes an error-based bottom-up post-pruning,
        where m-estimate is used to estimate class probabilities
        (default: 0).

    .. attribute:: same_majority_pruning

        If true, invokes a bottom-up post-pruning by removing the
        subtrees of which all leaves classify to the same class
        (default: False).

    **Record keeping**

    .. attribute:: store_distributions

    .. attribute:: store_contingencies

    .. attribute:: store_instances

    .. attribute:: store_node_classifier

        Determines whether to store class distributions,
        contingencies and instances in :class:`Node`, and whether the
        :obj:`Node.node_classifier` should be build for internal nodes
        also (it is needed by the :obj:`Descender` or for post-pruning).
        Not storing distributions but storing contingencies does not
        save any memory, since distributions actually points to the
        same distribution that is stored in :obj:`contingency.classes`.
        By default everything except :obj:`store_instances` is enabled.

    """
    def __new__(cls, data=None, weightID=0, **argkw):
        # When learning data is passed to the constructor, train
        # immediately and return the classifier instead of the learner.
        self = Orange.core.Learner.__new__(cls, **argkw)
        if data:
            self.__init__(**argkw)
            return self.__call__(data, weightID)
        else:
            return self

    def __init__(self, **kw):

        #name, buildfunction, parameters
        #buildfunctions are not saved as function references
        #because that would make problems with object copies
        for n, (fn, _) in self._built_fn.items():
            self.__dict__["_handset_" + n] = False

        #measure has to be before split
        self.measure = None
        self.split = None
        self.stop = None
        self.splitter = None

        # build the derived components ("split", "stop") from defaults
        for n, (fn, _) in self._built_fn.items():
            self.__dict__[n] = fn(self)

        # apply user-supplied keyword arguments through __setattr__ so the
        # derived components are rebuilt consistently
        for k, v in kw.items():
            self.__setattr__(k, v)

    def __call__(self, instances, weight=0):
        """
        Return a classifier from the given instances.
        """
        bl = self._base_learner()

        #set the scoring criteria for regression if it was not
        #set by the user
        if not self._handset_split and not self.measure:
            measure = fscoring.GainRatio() \
                if instances.domain.class_var.var_type == Orange.feature.Type.Discrete \
                else fscoring.MSE()
            bl.split.continuous_split_constructor.measure = measure
            bl.split.discrete_split_constructor.measure = measure

        # identity test instead of "!= None": a user-supplied splitter
        # object may define its own __eq__
        if self.splitter is not None:
            bl.example_splitter = self.splitter

        #post pruning
        tree = bl(instances, weight)
        if getattr(self, "same_majority_pruning", 0):
            tree = Pruner_SameMajority(tree)
        if getattr(self, "m_pruning", 0):
            tree = Pruner_m(tree, m=self.m_pruning)

        return TreeClassifier(base_classifier=tree)

    def __setattr__(self, name, value):
        self.__dict__[name] = value
        for n, (fn, v) in self._built_fn.items():
            if name in v:
                # a parameter of a derived component changed: rebuild the
                # component, unless the user has replaced it by hand
                if not self.__dict__["_handset_" + n]:
                    self.__dict__[n] = fn(self)
                else:
                    warnings.warn("Changing \"" + name + "\" does not have any effect as \"" + n + "\" was already set", UserWarning, 2)
            elif n == name:
                # assigning the component itself: None reverts to the
                # automatically built default, anything else marks it as
                # hand-set ("is None" instead of "== None" — the assigned
                # object may define its own __eq__)
                if value is None:
                    self.__dict__[n] = fn(self)
                    self.__dict__["_handset_" + n] = False
                else:
                    self.__dict__["_handset_" + n] = True

    def __delattr__(self, name):
        self.__setattr__(name, None) #use the __setattr__
        del self.__dict__[name]

    def instance(self):
        """
        DEPRECATED. Return a base learner - an object
        of :class:`_TreeLearner`.
        This method is left for backwards compatibility.
        """
        return self._base_learner()

    def _build_split(self):
        """
        Return the split constructor built according to object attributes.
        """
        split = SplitConstructor_Combined()
        split.continuous_split_constructor = \
            SplitConstructor_Threshold()
        binarization = getattr(self, "binarization", 0)
        if binarization == 1:
            split.discrete_split_constructor = \
                SplitConstructor_ExhaustiveBinary()
        elif binarization == 2:
            split.discrete_split_constructor = \
                SplitConstructor_OneAgainstOthers()
        else:
            split.discrete_split_constructor = \
                SplitConstructor_Feature()

        measures = {"infoGain": fscoring.InfoGain,
            "gainRatio": fscoring.GainRatio,
            "gini": fscoring.Gini,
            "relief": fscoring.Relief,
            "retis": fscoring.MSE
            }

        measure = self.measure
        if isinstance(measure, str):
            measure = measures[measure]()
        if not measure:
            measure = fscoring.GainRatio()

        measureIsRelief = isinstance(measure, fscoring.Relief)
        relM = getattr(self, "relief_m", None)
        if relM and measureIsRelief:
            measure.m = relM

        relK = getattr(self, "relief_k", None)
        if relK and measureIsRelief:
            measure.k = relK

        split.continuous_split_constructor.measure = measure
        split.discrete_split_constructor.measure = measure

        wa = getattr(self, "worst_acceptable", 0)
        if wa:
            split.continuous_split_constructor.worst_acceptable = wa
            split.discrete_split_constructor.worst_acceptable = wa

        ms = getattr(self, "min_subset", 0)
        if ms:
            split.continuous_split_constructor.min_subset = ms
            split.discrete_split_constructor.min_subset = ms

        return split

    def _build_stop(self):
        """
        Return the stop criteria built according to object's attributes.
        """
        stop = Orange.classification.tree.StopCriteria_common()
        mm = getattr(self, "max_majority", 1.0)
        if mm < 1.0:
            stop.max_majority = self.max_majority
        me = getattr(self, "min_instances", 0)
        if me:
            stop.min_instances = self.min_instances
        return stop

    def _base_learner(self):
        # Assemble the underlying _TreeLearner from the (possibly rebuilt)
        # components and any optional attributes the user has set.
        learner = _TreeLearner()

        learner.split = self.split
        learner.stop = self.stop

        for a in ["store_distributions", "store_contingencies",
            "store_node_classifier", "node_learner", "max_depth", "contingency_computer", "descender" ]:
            if hasattr(self, a):
                setattr(learner, a, getattr(self, a))

        if hasattr(self, "store_instances"):
            learner.store_examples = self.store_instances

        return learner

    # derived component name -> [builder, attributes that trigger a rebuild]
    _built_fn = {
            "split": [ _build_split, [ "binarization", "measure", "relief_m", "relief_k", "worst_acceptable", "min_subset" ] ], \
            "stop": [ _build_stop, ["max_majority", "min_instances" ] ]
        }
2037
2038
2039
# Wrap TreeLearner so that the old camelCase attribute and constructor
# argument names keep working (with a deprecation warning), mapped onto
# their current snake_case equivalents.
TreeLearner = Orange.misc.deprecated_members({
          "mForPruning": "m_pruning",
          "sameMajorityPruning": "same_majority_pruning",
          "reliefM": "relief_m",
          "reliefK": "relief_k",
          "storeDistributions": "store_distributions",
          "storeContingencies": "store_contingencies",
          "storeExamples": "store_instances",
          "store_examples": "store_instances",
          "storeNodeClassifier": "store_node_classifier",
          "worstAcceptable": "worst_acceptable",
          "minSubset": "min_subset",
          "maxMajority": "max_majority",
          "minExamples": "min_instances",
          "maxDepth": "max_depth",
          "nodeLearner": "node_learner",
          "min_examples": "min_instances"
}, wrap_methods=[])(TreeLearner)
2058
2059#
2060# the following is for the output
2061#
2062
fs = r"(?P<m100>\^?)(?P<fs>(\d*\.?\d*)?)"
""" Defines the multiplier by 100 (``^``) and the format
for the number of decimals (e.g. ``5.3``). The corresponding
groups are named ``m100`` and ``fs``. """

by = r"(?P<by>(b(P|A)))?"
""" Defines bP or bA or nothing; the result is in groups by. """

# bysub: optional reference modifier — divide by (b) or subtract (s) the
# value of the parent node (P) or of the whole tree (A).
bysub = r"((?P<bysub>b|s)(?P<by>P|A))?"
# opc: a comparison operator and a numeric threshold (continuous classes).
opc = r"(?P<op>=|<|>|(<=)|(>=)|(!=))(?P<num>\d*\.?\d+)"
# opd: (in)equality against a quoted class name (discrete classes).
opd = r'(?P<op>=|(!=))"(?P<cls>[^"]*)"'
# intrvl: optional confidence level, written as "(95%)" or "(.95)".
intrvl = r'((\((?P<intp>\d+)%?\))|(\(0?\.(?P<intv>\d+)\))|)'
# fromto: a numeric interval such as "[1.5, 2.5)"; a leading "!" negates it.
fromto = r"(?P<out>!?)(?P<lowin>\(|\[)(?P<lower>\d*\.?\d+)\s*,\s*(?P<upper>\d*\.?\d+)(?P<upin>\]|\))"
# Compiled patterns for the individual %-codes understood by the tree
# dumping routines (see _TreeDumper below).
re_V = re.compile("%V")
re_N = re.compile("%" + fs + "N" + by)
re_M = re.compile("%" + fs + "M" + by)
re_m = re.compile("%" + fs + "m" + by)
re_Ccont = re.compile("%" + fs + "C" + by + opc)
re_Cdisc = re.compile("%" + fs + "C" + by + opd)
re_ccont = re.compile("%" + fs + "c" + by + opc)
re_cdisc = re.compile("%" + fs + "c" + by + opd)
re_Cconti = re.compile("%" + fs + "C" + by + fromto)
re_cconti = re.compile("%" + fs + "c" + by + fromto)
re_D = re.compile("%" + fs + "D" + by)
re_d = re.compile("%" + fs + "d" + by)
re_AE = re.compile("%" + fs + "(?P<AorE>A|E)" + bysub)
re_I = re.compile("%" + fs + "I" + intrvl)
2090
def insert_str(s, mo, sub):
    """Return *s* with the span matched by *mo* replaced by *sub*."""
    before, after = s[:mo.start()], s[mo.end():]
    return before + sub + after
2095
def insert_dot(s, mo):
    """Replace the span of *s* matched by *mo* with a single dot.

    Used when the requested quantity cannot be computed — for instance,
    when a format code needs to divide by a quantity from the parent
    node, but the node has no parent.
    """
    return "%s.%s" % (s[:mo.start()], s[mo.end():])
2104
def insert_num(s, mo, n):
    """Replace the span of *s* matched by *mo* with the number *n*,
    formatted as the user requested: multiplied by 100 when the ``m100``
    group (the ``^`` marker) is present, using the precision from the
    ``fs`` group (defaulting to ``.0`` after the multiplication, or
    ``5.3`` otherwise).
    """
    groups = mo.groupdict()
    times100 = groups.get("m100", None)
    if times100:
        n *= 100
    fmt = groups.get("fs") or (times100 and ".0" or "5.3")
    return s[:mo.start()] + ("%%%sf" % fmt % n) + s[mo.end():]
2120
def by_whom(by, parent, tree):
    """Return the reference node for a relative quantity: the *parent*
    for ``bP``, otherwise (``bA``) the root of the whole tree."""
    return parent if by == "bP" else tree.tree
2130
def replaceV(strg, mo, node, parent, tree):
    """%V: the value predicted at the node (its classifier's default)."""
    value = node.node_classifier.default_value
    return insert_str(strg, mo, str(value))
2133
def replaceN(strg, mo, node, parent, tree):
    """%N: number of instances in the node, optionally divided by the
    instance count of the parent (bP) or of the whole tree (bA)."""
    N = node.distribution.abs
    by = mo.group("by")
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            # no reference node to divide by (e.g. the root has no parent)
            return insert_dot(strg, mo)
        if whom.distribution.abs > 1e-30:
            N /= whom.distribution.abs
    return insert_num(strg, mo, N)
2145
2146
def replaceM(strg, mo, node, parent, tree):
    """%M: number of majority-class instances in the node, optionally
    divided by the corresponding count in the parent (bP) or tree (bA)."""
    maj = int(node.node_classifier.default_value)
    N = node.distribution[maj]
    by = mo.group("by")
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        if whom.distribution[maj] > 1e-30:
            N /= whom.distribution[maj]
    return insert_num(strg, mo, N)
2159
2160
def replacem(strg, mo, node, parent, tree):
    """%m: proportion of majority-class instances in the node, optionally
    divided by the corresponding proportion in the parent (bP) or the
    whole tree (bA).

    Bug fix: the reference node (``whom``) was never computed — compare
    with :func:`replaceM` — so any format string using ``%mbP``/``%mbA``
    raised a NameError.
    """
    by = mo.group("by")
    maj = int(node.node_classifier.default_value)
    if node.distribution.abs > 1e-30:
        N = node.distribution[maj] / node.distribution.abs
        if by:
            whom = by_whom(by, parent, tree)  # this line was missing
            if whom and whom.distribution:
                byN = whom.distribution[maj] / whom.distribution.abs
                if byN > 1e-30:
                    N /= byN
            else:
                return insert_dot(strg, mo)
    else:
        N = 0.
    return insert_num(strg, mo, N)
2176
2177
def replaceCdisc(strg, mo, node, parent, tree):
    """%C (discrete class): number of instances in the node whose class
    (in)equals the quoted class name, optionally divided by the same
    count in the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Discrete:
        return insert_dot(strg, mo)

    by, op, cls = mo.group("by", "op", "cls")
    N = node.distribution[cls]
    if op == "!=":
        # count of everything except cls
        N = node.distribution.abs - N
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        if whom.distribution[cls] > 1e-30:
            N /= whom.distribution[cls]
    return insert_num(strg, mo, N)
2194
2195
def replacecdisc(strg, mo, node, parent, tree):
    """%c (discrete class): proportion of instances of the quoted class
    in the node, optionally divided by the corresponding proportion in
    the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Discrete:
        return insert_dot(strg, mo)

    op, by, cls = mo.group("op", "by", "cls")
    N = node.distribution[cls]
    if node.distribution.abs > 1e-30:
        N /= node.distribution.abs
        if op == "!=":
            N = 1 - N
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        if whom.distribution[cls] > 1e-30:
            # divide by the reference node's proportion of cls
            N /= whom.distribution[cls] / whom.distribution.abs
    return insert_num(strg, mo, N)
2214
2215
# Maps the textual comparison operators accepted in format codes to the
# corresponding functions from the operator module.
__opdict = {"<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge, "=": operator.eq, "!=": operator.ne}
2217
def replaceCcont(strg, mo, node, parent, tree):
    """%C (continuous class): total weight of instances whose class value
    satisfies the comparison, optionally divided by the same quantity in
    the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    by, op, num = mo.group("by", "op", "num")
    cmp_ = __opdict[op]
    threshold = float(num)
    N = sum([w for v, w in node.distribution.items() if cmp_(v, threshold)], 0.)
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        byN = sum([w for v, w in whom.distribution.items() if cmp_(v, threshold)], 0.)
        if byN > 1e-30:
            N /= byN

    return insert_num(strg, mo, N)
2236
2237
def replaceccont(strg, mo, node, parent, tree):
    """%c (continuous class): proportion of the node's instance weight
    satisfying the comparison, optionally divided by the corresponding
    proportion in the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    by, op, num = mo.group("by", "op", "num")
    cmp_ = __opdict[op]
    threshold = float(num)
    N = sum([w for v, w in node.distribution.items() if cmp_(v, threshold)], 0.)
    if node.distribution.abs > 1e-30:
        N /= node.distribution.abs
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        byN = sum([w for v, w in whom.distribution.items() if cmp_(v, threshold)], 0.)
        if byN > 1e-30:
            # abs >= byN, so byN > 1e-30 implies abs > 1e-30
            N /= byN / whom.distribution.abs
    return insert_num(strg, mo, N)
2257
2258
def extractInterval(mo, dist):
    """Return the (value, weight) pairs of *dist* whose value lies inside
    the interval described by the match *mo* — or outside it, when the
    interval carries the ``!`` prefix (group ``out``).

    The ``lowin``/``upin`` groups hold the bracket characters: "(" / ")"
    make the respective bound exclusive, "[" / "]" inclusive.

    Bug fix: in the negated ("out") case the upper-bound comparison was
    ``operator.ge`` for both bracket kinds; for "]" the bound itself
    belongs to the interval, so the complement must use strict
    ``operator.gt``.
    """
    out, lowin, lower, upper, upin = mo.group("out", "lowin", "lower", "upper", "upin")
    lower, upper = float(lower), float(upper)
    if out:
        # complement: below the lower bound or above the upper one
        lop = operator.le if lowin == "(" else operator.lt
        hop = operator.ge if upin == ")" else operator.gt
        return [x for x in dist.items() if lop(x[0], lower) or hop(x[0], upper)]
    else:
        lop = operator.gt if lowin == "(" else operator.ge
        hop = operator.lt if upin == ")" else operator.le
        return [x for x in dist.items() if lop(x[0], lower) and hop(x[0], upper)]
2270
2271
def replaceCconti(strg, mo, node, parent, tree):
    """%C with an interval (continuous class): weight of instances whose
    class value falls in the interval, optionally divided by the same
    quantity in the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    N = sum([w for _, w in extractInterval(mo, node.distribution)])
    by = mo.group("by")
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        byN = sum([w for _, w in extractInterval(mo, whom.distribution)])
        if byN > 1e-30:
            N /= byN

    return insert_num(strg, mo, N)
2288
2289
def replacecconti(strg, mo, node, parent, tree):
    """%c with an interval (continuous class): proportion of the node's
    instance weight falling in the interval, optionally divided by the
    corresponding proportion in the parent/tree."""
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    N = sum([w for _, w in extractInterval(mo, node.distribution)])
    total = node.distribution.abs
    if total > 1e-30:
        N /= total

    by = mo.group("by")
    if by:
        whom = by_whom(by, parent, tree)
        if not (whom and whom.distribution):
            return insert_dot(strg, mo)
        byN = sum([w for _, w in extractInterval(mo, whom.distribution)])
        if byN > 1e-30:
            N /= byN / whom.distribution.abs

    return insert_num(strg, mo, N)
2310
2311
def replaceD(strg, mo, node, parent, tree):
    """%D (discrete class): the node's class distribution as absolute
    counts, optionally divided element-wise by the parent/tree counts."""
    if tree.class_var.var_type != Orange.feature.Type.Discrete:
        return insert_dot(strg, mo)

    fs, by, m100 = mo.group("fs", "by", "m100")
    dist = list(node.distribution)
    if by:
        whom = by_whom(by, parent, tree)
        if not whom:
            return insert_dot(strg, mo)
        for i, d in enumerate(whom.distribution):
            if d > 1e-30:
                dist[i] /= d
    mul = 100 if m100 else 1
    fs = fs or (".0" if m100 else "5.3")
    formatted = ["%%%sf" % fs % (N * mul) for N in dist]
    return insert_str(strg, mo, "[" + ", ".join(formatted) + "]")
2329
2330
def replaced(strg, mo, node, parent, tree):
    """%d (discrete class): the node's class distribution as proportions,
    optionally divided element-wise by the parent/tree proportions."""
    if tree.class_var.var_type != Orange.feature.Type.Discrete:
        return insert_dot(strg, mo)

    fs, by, m100 = mo.group("fs", "by", "m100")
    dist = list(node.distribution)
    total = node.distribution.abs
    if total > 1e-30:
        dist = [d / total for d in dist]
    if by:
        whom = by_whom(by, parent, tree)
        if not whom:
            return insert_dot(strg, mo)
        for i, d in enumerate(whom.distribution):
            if d > 1e-30:
                # abs >= d, so d > 1e-30 implies whom.distribution.abs > 1e-30
                dist[i] /= d / whom.distribution.abs
    mul = 100 if m100 else 1
    fs = fs or (".0" if m100 else "5.3")
    formatted = ["%%%sf" % fs % (N * mul) for N in dist]
    return insert_str(strg, mo, "[" + ", ".join(formatted) + "]")
2351
2352
def replaceAE(strg, mo, node, parent, tree):
    """%A/%E (continuous class): the average (A) or error (E) of the
    node's distribution; with a reference node the value is divided by
    (b) or offset against (s) the parent's (P) or the tree's (A) value."""
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    AorE, bysub, by = mo.group("AorE", "bysub", "by")

    if AorE == "A":
        A = node.distribution.average()
    else:
        A = node.distribution.error()
    if by:
        whom = by_whom("b" + by, parent, tree)
        if not whom:
            return insert_dot(strg, mo)
        ref = whom.distribution.average() if AorE == "A" else whom.distribution.error()
        if bysub == "b":
            if ref > 1e-30:
                A /= ref
        else:
            A -= ref
    return insert_num(strg, mo, A)
2378
2379
# Two-sided z-values (standard normal quantiles) for the confidence
# levels supported by the %I format specifier (see replaceI below).
Z = { 0.75:1.15, 0.80:1.28, 0.85:1.44, 0.90:1.64, 0.95:1.96, 0.99:2.58 }
2381
def replaceI(strg, mo, node, parent, tree):
    """Expand an %I placeholder: a confidence interval for the node's mean.

    Only available for regression trees (continuous class); otherwise a
    dot is inserted.  The requested confidence level (default 95%) must
    be one of the levels tabulated in Z, else SystemError is raised.
    """
    if tree.class_var.var_type != Orange.feature.Type.Continuous:
        return insert_dot(strg, mo)

    fs = mo.group("fs") or "5.3"
    intrvl = float(mo.group("intp") or mo.group("intv") or "95") / 100.
    mul = 100 if mo.group("m100") else 1

    if intrvl not in Z:
        # Bug fix: the literal percent sign must be escaped as '%%';
        # the previous format string ("...%5.3f% confidence...") made the
        # raise statement itself fail with "not enough arguments for
        # format string".  SystemError is kept for caller compatibility.
        raise SystemError("Cannot compute %5.3f%% confidence intervals" % intrvl)

    av = node.distribution.average()
    il = node.distribution.error() * Z[intrvl]
    return insert_str(strg, mo, "[%%%sf-%%%sf]" % (fs, fs) % ((av - il) * mul, (av + il) * mul))
2396
2397
2398# This class is more a collection of function, merged into a class so
2399# that they don't need to transfer too many arguments. It will be
2400# constructed, used and discarded, it is not meant to store any information.
class _TreeDumper:
    """Render a tree as indented text (dumpTree) or GraphViz .dot (dotTree).

    A throw-away helper: constructed, used once and discarded.  It only
    bundles the formatting options so the recursive helpers do not have
    to pass them around explicitly.
    """
    # (compiled regex, replacer function) pairs; each format string is
    # run through all of them, in this order, by formatString.
    defaultStringFormats = [(re_V, replaceV), (re_N, replaceN),
         (re_M, replaceM), (re_m, replacem),
         (re_Cdisc, replaceCdisc), (re_cdisc, replacecdisc),
         (re_Ccont, replaceCcont), (re_ccont, replaceccont),
         (re_Cconti, replaceCconti), (re_cconti, replacecconti),
         (re_D, replaceD), (re_d, replaced), (re_AE, replaceAE),
         (re_I, replaceI) ]

    def node(self):
        # self.tree may be a classifier wrapper (carrying a .tree root
        # in its instance dict) or a bare node; return the root node.
        return self.tree.tree if "tree" in self.tree.__dict__ else self.tree

    def __init__(self, leafStr, nodeStr, stringFormats, minExamples,
        maxDepth, simpleFirst, tree, **kw):
        # :param leafStr: format string for leaves ("" -> sensible default)
        # :param nodeStr: format string for internal nodes ("" -> none,
        #     "." -> same as leafStr)
        # :param stringFormats: list of (regex, replacer) pairs
        # :param minExamples: subtrees with less weight print as ". . ."
        # :param maxDepth: maximum printed depth
        # :param simpleFirst: print leaf branches before subtree branches
        # :param kw: extra options (leafShape, nodeShape, fle for dotTree)
        self.stringFormats = stringFormats
        self.minExamples = minExamples
        self.maxDepth = maxDepth
        self.simpleFirst = simpleFirst
        self.tree = tree
        self.__dict__.update(kw)

        if leafStr:
            self.leafStr = leafStr
        else:
            # Default leaf format: predicted value, plus the majority
            # class probability for classification trees.
            if self.node().node_classifier.class_var.var_type == \
                    Orange.feature.Type.Discrete:
                self.leafStr = "%V (%^.2m%)"
            else:
                self.leafStr = "%V"

        if nodeStr == ".":
            # "." means: format internal nodes the same way as leaves
            self.nodeStr = self.leafStr
        else:
            self.nodeStr = nodeStr


    def formatString(self, strg, node, parent):
        """Expand all format placeholders in strg for the given node.

        strg may also be a callable taking (node, parent, tree); nodes
        without a distribution have every placeholder replaced by ".".
        """
        if hasattr(strg, "__call__"):
            return strg(node, parent, self.tree)

        if not node:
            return "<null node>"

        for rgx, replacer in self.stringFormats:
            if not node.distribution:
                strg = rgx.sub(".", strg)
            else:
                strt = 0
                while True:
                    mo = rgx.search(strg, strt)
                    if not mo:
                        break
                    strg = replacer(strg, mo, node, parent, self.tree)
                    # restart just past the match start: replacements may
                    # change the string length, so indices are not stable
                    strt = mo.start() + 1

        return strg


    def showBranch(self, node, parent, lev, i):
        """Return one printed line for node's i-th branch (no newline)."""
        bdes = node.branch_descriptions[i]
        # Prepend the split feature name; insert "=" unless the branch
        # description already starts with a comparison operator.
        bdes = node.branch_selector.class_var.name + \
            (bdes[0] not in "<=>" and "=" or "") + bdes
        if node.branches[i]:
            nodedes = self.nodeStr and ": " + \
                self.formatString(self.nodeStr, node.branches[i], node) or ""
        else:
            nodedes = "<null node>"
        return "|    "*lev + bdes + nodedes


    def dumpTree0(self, node, parent, lev):
        """Recursively format the subtree rooted at node, indented to lev."""
        if node.branches:
            # Prune the printout for small or too-deep subtrees.
            if node.distribution.abs < self.minExamples or \
                lev > self.maxDepth:
                return "|    "*lev + ". . .\n"

            res = ""
            if self.leafStr and self.nodeStr and self.leafStr != self.nodeStr:
                leafsep = "\n" + ("|    "*lev) + "    "
            else:
                leafsep = ""
            if self.simpleFirst:
                # First pass: print branches that end in a leaf (or null).
                for i, branch in enumerate(node.branches):
                    if not branch or not branch.branches:
                        if self.leafStr == self.nodeStr:
                            res += "%s\n" % \
                                self.showBranch(node, parent, lev, i)
                        else:
                            res += "%s: %s\n" % \
                                (self.showBranch(node, parent, lev, i),
                                 leafsep +
                                 self.formatString(self.leafStr, branch, node))
            # Second pass: recurse into proper subtrees (and, when
            # simpleFirst is off, print leaves in order of appearance).
            for i, branch in enumerate(node.branches):
                if branch and branch.branches:
                    res += "%s\n%s" % (self.showBranch(node, parent, lev, i),
                                       self.dumpTree0(branch, node, lev + 1))
                elif not self.simpleFirst:
                    if self.leafStr == self.nodeStr:
                        res += "%s\n" % self.showBranch(node, parent, lev, i)
                    else:
                        res += "%s: %s\n" % \
                            (self.showBranch(node, parent, lev, i),
                             leafsep +
                             self.formatString(self.leafStr, branch, node))
            return res
        else:
            return self.formatString(self.leafStr, node, parent)


    def dumpTree(self):
        """Return the whole tree as a multi-line string."""
        node = self.node()
        if self.nodeStr:
            # Print the root on its own line and shift depth accounting.
            lev, res = 1, "root: %s\n" % \
                self.formatString(self.nodeStr, node, None)
            self.maxDepth += 1
        else:
            lev, res = 0, ""
        return res + self.dumpTree0(node, None, lev)


    def dotTree0(self, node, parent, internalName):
        """Recursively write .dot statements for the subtree to self.fle.

        internalName encodes the path from the root ("n-0-1", ...); its
        length doubles as the depth measure for the maxDepth cutoff.
        """
        if node.branches:
            if node.distribution.abs < self.minExamples or \
                len(internalName) - 1 > self.maxDepth:
                self.fle.write('%s [ shape="plaintext" label="..." ]\n' % \
                    _quoteName(internalName))
                return

            label = node.branch_selector.class_var.name
            if self.nodeStr:
                label += "\\n" + self.formatString(self.nodeStr, node, parent)
            self.fle.write('%s [ shape=%s label="%s"]\n' % \
                (_quoteName(internalName), self.nodeShape, label))

            for i, branch in enumerate(node.branches):
                if branch:
                    internalBranchName = "%s-%d" % (internalName, i)
                    self.fle.write('%s -> %s [ label="%s" ]\n' % \
                        (_quoteName(internalName),
                         _quoteName(internalBranchName),
                         node.branch_descriptions[i]))
                    self.dotTree0(branch, node, internalBranchName)

        else:
            self.fle.write('%s [ shape=%s label="%s"]\n' % \
                (_quoteName(internalName), self.leafShape,
                self.formatString(self.leafStr, node, parent)))


    def dotTree(self, internalName="n"):
        """Write the whole tree as a GraphViz digraph to self.fle."""
        self.fle.write("digraph G {\n")
        self.dotTree0(self.node(), None, internalName)
        self.fle.write("}\n")
2554
2555def _quoteName(x):
2556    return '"%s"' % (base64.b64encode(x))
2557
class TreeClassifier(Orange.classification.Classifier):
    """

    Classifies instances according to the tree stored in :obj:`tree`.

    **The classification process**

    :obj:`TreeClassifier` uses the :obj:`descender` to descend the
    instance from the root. If the :obj:`descender` returns only a
    :obj:`Node` and no distribution, the descend should stop as the node
    was unambiguously selected. The node's :obj:`~Node.node_classifier`
    decides the class.

    If the descender returns a :obj:`Node` and a distribution, as it
    happens, for example, if the instance's value for the :obj:`Node`'s
    feature is unknown, the same process repeats for all subtrees and
    their predictions are combined.

    **Attributes**

    .. attribute:: tree

        The root of the tree, as a :class:`Node`.

    .. attribute:: descender

        A :obj:`Descender` used to descend an instance from the root as
        deeply as possible according to the instance's feature values.
    """

    def __init__(self, base_classifier=None):
        if not base_classifier: base_classifier = _TreeClassifier()
        self.nativeClassifier = base_classifier
        # Mirror the native (C-implemented) classifier's attributes on
        # this wrapper so they can be read directly from it.
        for k, v in self.nativeClassifier.__dict__.items():
            self.__dict__[k] = v

    def __call__(self, instance, result_type=Orange.classification.Classifier.GetValue,
                 *args, **kwdargs):
        """Classify a new instance.

     
        :param instance: instance to be classified.
        :type instance: :class:`Orange.data.Instance`
        :param result_type:
              :class:`Orange.classification.Classifier.GetValue` or \
              :class:`Orange.classification.Classifier.GetProbabilities` or
              :class:`Orange.classification.Classifier.GetBoth`
       
        :rtype: :class:`Orange.data.Value`,
              :class:`Orange.statistics.Distribution` or a tuple with both
        """
        # Delegate classification to the native implementation.
        return self.nativeClassifier(instance, result_type, *args, **kwdargs)

    def __setattr__(self, name, value):
        # Keep the wrapper and the native classifier in sync: attributes
        # the native classifier already has are written to both objects.
        if name == "nativeClassifier":
            self.__dict__[name] = value
            return
        if name in self.nativeClassifier.__dict__:
            self.nativeClassifier.__dict__[name] = value
        self.__dict__[name] = value

    def __str__(self):
        return self.to_string()

    @Orange.misc.deprecated_keywords({"fileName": "file_name", \
        "leafStr": "leaf_str", "nodeStr": "node_str", \
        "userFormats": "user_formats", "minExamples": "min_instances", \
        "min_examples": "min_instances", \
        "maxDepth": "max_depth", "simpleFirst": "simple_first"})
    def to_string(self, leaf_str="", node_str="", \
            user_formats=None, min_instances=0, max_depth=1e10, \
            simple_first=True):
        """
        Return a string representation of a tree.

        :arg leaf_str: The format string for the tree leaves. If
          left empty, ``"%V (%^.2m%)"`` will be used for classification trees
          and ``"%V"`` for regression trees.
        :type leaf_str: string
        :arg node_str: The format string for the internal nodes.
          If left empty (as it is by default), no data is printed out for
          internal nodes. If set to ``"."``, the same string is
          used as for leaves.
        :type node_str: string
        :arg max_depth: If set, it limits the depth to which the tree is
          printed out.
        :type max_depth: integer
        :arg min_instances: If set, the subtrees with less than the given
          number of examples are not printed.
        :type min_instances: integer
        :arg simple_first: If True (default), the branches with a single
          node are printed before the branches with larger subtrees.
          If False, the branches are printed in order of
          appearance.
        :type simple_first: boolean
        :arg user_formats: A list of regular expressions and callback
          function through which the user can print out other specific
          information in the nodes.
        """
        # None stands for "no extra formats" (avoids the mutable-default
        # argument pitfall of the previous user_formats=[]).
        user_formats = [] if user_formats is None else user_formats
        return _TreeDumper(leaf_str, node_str, user_formats +
            _TreeDumper.defaultStringFormats, min_instances,
            max_depth, simple_first, self).dumpTree()

    dump = to_string

    @Orange.misc.deprecated_keywords({"fileName": "file_name", \
        "leafStr": "leaf_str", "nodeStr": "node_str", \
        "leafShape": "leaf_shape", "nodeShape": "node_shape", \
        "userFormats": "user_formats", \
        "minExamples": "min_instances", \
        "min_examples": "min_instances", \
        "maxDepth": "max_depth", "simpleFirst": "simple_first"})
    def dot(self, file_name, leaf_str="", node_str="", \
            leaf_shape="plaintext", node_shape="plaintext", \
            user_formats=None, min_instances=0, max_depth=1e10, \
            simple_first=True):
        """ Print the tree to a file in a format used by `GraphViz
        <http://www.research.att.com/sw/tools/graphviz>`_.  Uses the
        same parameters as :meth:`to_string` plus two which define the shape
        of internal nodes and leaves of the tree:

        :param leaf_shape: Shape of the outline around leaves of the tree.
            If "plaintext", no outline is used (default: "plaintext").
        :type leaf_shape: string
        :param node_shape: Shape of the outline around internal nodes
            of the tree. If "plaintext", no outline is used (default: "plaintext")
        :type node_shape: string

        Check `Polygon-based Nodes <http://www.graphviz.org/doc/info/shapes.html>`_
        for various outlines supported by GraphViz.
        """
        user_formats = [] if user_formats is None else user_formats
        # file_name may be a path or an already-open file-like object.
        if isinstance(file_name, basestring):
            fle = open(file_name, "wt")
            must_close = True
        else:
            fle = file_name
            must_close = False
        try:
            _TreeDumper(leaf_str, node_str, user_formats +
                _TreeDumper.defaultStringFormats, min_instances,
                max_depth, simple_first, self,
                leafShape=leaf_shape, nodeShape=node_shape, fle=fle).dotTree()
        finally:
            # Bug fix: close the file when we opened it here (it used to
            # leak); a caller-supplied stream stays open for the caller.
            if must_close:
                fle.close()

    def count_nodes(self):
        """
        Return the number of nodes.
        """
        return _countNodes(self.tree)

    def count_leaves(self):
        """
        Return the number of leaves.
        """
        return _countLeaves(self.tree)

    def to_network(self):
        """Return the tree as an :obj:`Orange.network.DiGraph`; a data
        table with one row per node is attached via ``set_items``."""
        net = Orange.network.DiGraph()
        if self.class_var.var_type == Orange.feature.Type.Discrete:
            domain = Orange.data.Domain([self.class_var] +
                [Orange.feature.Continuous(name) for name in
                 ["instances", "majority proportion"] + list(self.class_var.values)], None)
        else:
            domain = Orange.data.Domain([self.class_var] +
                [Orange.feature.Continuous(name) for name in
                 ["error", "instances"]], None)
        domain = Orange.data.Domain(domain)
        data = Orange.data.Table(domain)
        self.to_network0(self.tree, net, data)
        net.set_items(data)
        return net

    def to_network0(self, node, net, table):
        """Depth-first helper for :meth:`to_network`.

        Appends one row describing ``node`` to ``table``; the row index
        doubles as the network node id, which is returned so the caller
        can add the parent->child edge.
        """
        node_id = len(table)
        net.add_node(node_id)
        d = node.distribution
        maj = node.node_classifier.default_value
        if self.class_var.var_type == Orange.feature.Type.Discrete:
            if d.abs > 1e-6:
                table.append([maj, d.abs, d[maj]] + [x / d.abs for x in d])
            else:
                # empty node: record zeros instead of dividing by ~0
                table.append([maj] + [0] * (2 + len(d)))
        else:
            table.append([maj, d.error(), d.abs])
        if node.branches:
            for branch in node.branches:
                if branch:
                    child_id = self.to_network0(branch, net, table)
                    net.add_edge(node_id, child_id)
        return node_id
2742
2743def _countNodes(node):
2744    count = 0
2745    if node:
2746        count += 1
2747        if node.branches:
2748            for node in node.branches:
2749                count += _countNodes(node)
2750    return count
2751
2752def _countLeaves(node):
2753    count = 0
2754    if node:
2755        if node.branches: # internal node
2756            for node in node.branches:
2757                count += _countLeaves(node)
2758        else:
2759            count += 1
2760    return count
2761
Note: See TracBrowser for help on using the repository browser.