source: orange/docs/reference/rst/code/discretization.py @ 9988:4e1229e347ca

Revision 9988:4e1229e347ca, 4.1 KB checked in by blaz <blaz.zupan@…>, 2 years ago (diff)

Polished discretization scripts.

Line 
# Description: Shows the usage of different classes for discretization, including manual discretization
# Category:    discretization, categorization, preprocessing
# Classes:     EntropyDiscretization, EquiDistDiscretization, BiModalDiscretization, Discretization, IntervalDiscretizer, Discretizer, BiModalDiscretizer
# Uses:        iris
# Referenced:  discretization.htm
#
# NOTE: every print is written as print(<single expression>). Under the
# Python 2 print statement this emits exactly the same text as the
# space-separated form, and the file additionally parses under Python 3.

import Orange
data = Orange.data.Table("iris")

# --- Entropy discretization of a single feature --------------------------
print("\nEntropy discretization, first 10 examples")
sep_w = Orange.feature.discretization.Entropy("sepal width", data)

# Put the original feature, its discretized version and the class side by side.
data2 = data.select([data.domain["sepal width"], sep_w, data.domain.class_var])
for ex in data2[:10]:
    print(ex)

print("\nDiscretized attribute: %s" % (sep_w,))
# FIXME: camelCase is kept on purpose here; the underscore spelling
# `which_var` is reported not to work for this attribute.
print("Continuous attribute: %s" % (sep_w.get_value_from.whichVar,))
print("Cut-off points: %s" % (sep_w.get_value_from.transformer.points,))

# --- Hand-built interval discretizer, one feature ------------------------
print("\nManual construction of Interval discretizer - single attribute")
idisc = Orange.feature.discretization.IntervalDiscretizer(points = [3.0, 5.0])
sep_l = idisc.construct_variable(data.domain["sepal length"])
data2 = data.select([data.domain["sepal length"], sep_l, data.domain.class_var])
for ex in data2[:10]:
    print(ex)


# --- Hand-built interval discretizer, applied to every feature -----------
print("\nManual construction of Interval discretizer - all attributes")
idisc = Orange.feature.discretization.IntervalDiscretizer(points = [3.0, 5.0])
newattrs = [idisc.construct_variable(attr) for attr in data.domain.attributes]
data2 = data.select(newattrs + [data.domain.class_var])
for ex in data2[:10]:
    print(ex)


# --- Equal-width discretization -------------------------------------------
print("\n\nDiscretization with equal width intervals")
# NOTE(review): the keyword is spelled in camelCase, unlike the rest of the
# script; left untouched because no alternative spelling is shown here.
disc = Orange.feature.discretization.EqualWidth(numberOfIntervals = 6)
newattrs = [disc(attr, data) for attr in data.domain.attributes]
data2 = data.select(newattrs + [data.domain.class_var])

for attr in newattrs:
    print("%s: %s" % (attr.name, attr.values))
print("")

for attr in newattrs:
    transformer = attr.get_value_from.transformer
    print("%15s: first interval at %5.3f, step %5.3f" % (attr.name, transformer.first_cut, transformer.step))
    print(" "*17 + "cutoffs at " + ", ".join(["%5.3f" % x for x in transformer.points]))
print("")



# --- Equal-frequency discretization ---------------------------------------
print("\n\nQuartile (equal frequency) discretization")
disc = Orange.feature.discretization.EqualFreq(numberOfIntervals = 6)
newattrs = [disc(attr, data) for attr in data.domain.attributes]
data2 = data.select(newattrs + [data.domain.class_var])

for attr in newattrs:
    print("%s: %s" % (attr.name, attr.values))
print("")

for attr in newattrs:
    print(" "*17 + "cutoffs at " + ", ".join(["%5.3f" % x for x in attr.get_value_from.transformer.points]))
print("")



# --- Hand-built equal-width discretizer, applied to every feature --------
print("\nManual construction of EqualWidth - all attributes")
edisc = Orange.feature.discretization.EqualWidthDiscretizer(first_cut=2.0, step=1.0, n=5)
newattrs = [edisc.construct_variable(attr) for attr in data.domain.attributes]
data2 = data.select(newattrs + [data.domain.class_var])
for ex in data2[:10]:
    print(ex)


# --- Entropy discretization of every feature ------------------------------
print("\nFayyad-Irani entropy-based discretization")
entro = Orange.feature.discretization.Entropy()
for attr in data.domain.attributes:
    disc = entro(attr, data)
    print("%s: %s" % (attr.name, disc.get_value_from.transformer.points))
print("")


# Derive a two-valued class ("is versicolor") from the original "iris" class
# and rebuild the data set over the same features with that new class.
newclass = Orange.feature.Discrete("is versicolor", values = ["no", "yes"])
newclass.get_value_from = lambda ex, w: ex["iris"]=="Iris-versicolor"
newdomain = Orange.data.Domain(data.domain.attributes, newclass)
data_v = Orange.data.Table(newdomain, data)

# With split_in_two = 0 the resulting transformer is read through `points`.
print("\nBi-modal discretization on a binary problem")
bimod = Orange.feature.discretization.BiModal(split_in_two = 0)
for attr in data_v.domain.attributes:
    disc = bimod(attr, data_v)
    print("%s: %s" % (attr.name, disc.get_value_from.transformer.points))
print("")

# With the default settings the transformer is read through `low` / `high`.
print("\nBi-modal discretization on a binary problem")
bimod = Orange.feature.discretization.BiModal()
for attr in data_v.domain.attributes:
    disc = bimod(attr, data_v)
    print("%s: (%5.3f, %5.3f]" % (attr.name, disc.get_value_from.transformer.low, disc.get_value_from.transformer.high))
print("")


# Reuse the entropy discretization object created above on the binary data.
print("\nEntropy-based discretization on a binary problem")
for attr in data_v.domain.attributes:
    disc = entro(attr, data_v)
    print("%s: %s" % (attr.name, disc.get_value_from.transformer.points))
Note: See TracBrowser for help on using the repository browser.