source: orange/Orange/feature/scoring.py @ 10580:c4cbae8dcf8b

Revision 10580:c4cbae8dcf8b, 8.7 KB checked in by markotoplak, 2 years ago (diff)

Moved deprecation functions, progress bar support and environ into Orange.utils. Orange imports cleanly, although it is not tested yet.

Line 
1from Orange import core, feature
2from Orange.statistics import contingency, distribution
3
4from Orange.utils import deprecated_keywords, deprecated_members
5
# New-hierarchy aliases for the scoring classes implemented in Orange.core,
# so callers can use e.g. Orange.feature.scoring.InfoGain instead of the
# core.MeasureAttribute_* names.
Score = core.MeasureAttribute  # base class of all feature scorers
ScoreFromProbabilities = core.MeasureAttributeFromProbabilities
InfoGain = core.MeasureAttribute_info
GainRatio = core.MeasureAttribute_gainRatio
Gini = core.MeasureAttribute_gini
Relevance = core.MeasureAttribute_relevance
Cost = core.MeasureAttribute_cost
Relief = core.MeasureAttribute_relief
MSE = core.MeasureAttribute_MSE
15
16######
17# from orngEvalAttr.py
18
class OrderAttributes:
    """Orders features by their scores.

    .. attribute::  score

        A scoring method derived from :obj:`~Orange.feature.scoring.Score`.
        If :obj:`None`, :obj:`Relief` with m=5 and k=10 is used.

    """
    def __init__(self, score=None):
        # May be None; a default Relief scorer is created per call then.
        self.score = score

    def __call__(self, data, weight):
        """Score and order all features.

        :param data: a data table used to score features
        :type data: :obj:`~Orange.data.Table`

        :param weight: meta attribute that stores weights of instances
        :type weight: :obj:`~Orange.feature.Descriptor`

        :return: list of feature descriptors sorted by ascending score
        """
        if self.score:
            measure = self.score
        else:
            # Fallback documented on the class: Relief with m=5, k=10.
            measure = Relief(m=5, k=10)

        measured = [(attr, measure(attr, data, None, weight))
                    for attr in data.domain.attributes]
        # Key-based sort replaces the deprecated Python-2-only cmp-style
        # comparator; ordering (ascending by score, stable) is unchanged.
        measured.sort(key=lambda item: item[1])
        return [attr for attr, _ in measured]

# Keep the old attribute name "measure" working as a deprecated alias
# for "score".
OrderAttributes = deprecated_members({
          "measure": "score",
}, wrap_methods=[])(OrderAttributes)
53
class Distance(Score):
    """The :math:`1-D` distance: information gain of the feature divided
    by the joint entropy :math:`H_{CA}` of the class variable :math:`C`
    and the feature :math:`A`:

    .. math::
        1-D(C,A) = \\frac{\\mathrm{Gain}(A)}{H_{CA}}
    """

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        instance = Score.__new__(cls)
        # Without both a feature and data, behave as a plain constructor.
        if attr is None or data is None:
            return instance
        # Convenience shortcut: score immediately and return the number.
        return instance.__call__(attr, data, apriori_dist, weightID)

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: :obj:`~Orange.feature.Descriptor`

        :param data: a data table used to score features
        :type data: :obj:`~Orange.data.Table`

        :param apriori_dist:
        :type apriori_dist:

        :param weightID: meta feature used to weight individual data instances
        :type weightID: :obj:`~Orange.feature.Descriptor`

        """
        import numpy
        from orngContingency import Entropy #TODO: Move to new hierarchy
        # A feature given by name is resolved to its domain descriptor.
        if attr in data.domain:
            attr = data.domain[attr]
        joint = contingency.VarClass(attr, data)
        # Flatten the feature/class contingency into one vector of counts
        # for the joint entropy computation.
        counts = []
        for column in joint.values():
            counts.extend(column)
        jointEntropy = Entropy(numpy.array(counts))
        gain = InfoGain(attr, data)
        return float(gain) / jointEntropy if jointEntropy > 0 else 0
103
class MDL(Score):
    """Minimum description length principle [Kononenko1995]_. With
    :math:`n` instances, :math:`n_0` classes, and :math:`n_{cj}`
    instances having feature value :math:`j` and class value :math:`c`,
    the MDL score of feature A is

    .. math::
         \\mathrm{MDL}(A) = \\frac{1}{n} \\Bigg[
         \\log\\binom{n}{n_{1.},\\cdots,n_{n_0 .}} - \\sum_j
         \\log \\binom{n_{.j}}{n_{1j},\\cdots,n_{n_0 j}} \\\\
         + \\log \\binom{n+n_0-1}{n_0-1} - \\sum_j \\log
         \\binom{n_{.j}+n_0-1}{n_0-1}
         \\Bigg]
    """

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        instance = Score.__new__(cls)
        # Without both a feature and data, behave as a plain constructor.
        if attr is None or data is None:
            return instance
        # Convenience shortcut: score immediately and return the number.
        return instance.__call__(attr, data, apriori_dist, weightID)

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: :obj:`~Orange.feature.Descriptor`

        :param data: a data table used to score the feature
        :type data: :obj:`~Orange.data.Table`

        :param apriori_dist:
        :type apriori_dist:

        :param weightID: meta feature used to weight individual data instances
        :type weightID: :obj:`~Orange.feature.Descriptor`

        """
        cont = contingency.VarClass(attr, data)
        classCounts = distribution.Distribution(data.domain.classVar, data).values()
        nCls = len(classCounts)
        nEx = len(data)
        # Description length of the class labels before splitting on attr.
        prior = _logMultipleCombs(nEx, classCounts) \
                + _logMultipleCombs(nEx + nCls - 1, [nEx, nCls - 1])
        # Per-feature-value description lengths after the split.
        partitionTerms = []
        for key in cont.keys():
            partitionTerms.append(
                _logMultipleCombs(sum(cont[key]), cont[key].values()))
        correctionTerms = []
        for key in cont.keys():
            colTotal = sum(cont[key])
            correctionTerms.append(
                _logMultipleCombs(colTotal + nCls - 1, [colTotal, nCls - 1]))
        # Subtract in the same order as before to keep float results stable.
        result = prior
        for term in partitionTerms + correctionTerms:
            result -= term
        return result / max(1, nEx)
157
158# compute n! / k1! * k2! * k3! * ... kc!
159# ks = [k1, k2, ...]
160def _logMultipleCombs(n, ks):
161    import math
162    m = max(ks)
163    ks.remove(m)
164    resArray = []
165    for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]:
166        ret = 0
167        curr = 1
168        for val in range(int(start), int(end)):
169            curr *= val
170            if curr > 1e40:
171                ret += math.log(curr)
172                curr = 1
173        ret += math.log(curr)
174        resArray.append(ret)
175    ret = resArray[0]
176    for val in resArray[1:]:
177        ret -= val
178    return ret
179
180
@deprecated_keywords({"attrList": "attr_list", "attrMeasure": "attr_score", "removeUnusedValues": "remove_unused_values"})
def merge_values(data, attr_list, attr_score, remove_unused_values = 1):
    """Greedily merge values of the Cartesian product of the given
    features for as long as merging improves the score.

    :param data: data table with the features and a class variable
    :param attr_list: features whose Cartesian product is built and merged
    :param attr_score: callable ``(feature, data) -> score``; higher is better
    :param remove_unused_values: if true (default), return a new feature
        containing only the surviving values; otherwise return the
        Cartesian-product feature with its merged lookup table
    :return: the merged feature descriptor
    """
    import orngCI
    #data = data.select([data.domain[attr] for attr in attr_list] + [data.domain.classVar])
    newData = data.select(attr_list + [data.domain.class_var])
    newAttr = orngCI.FeatureByCartesianProduct(newData, attr_list)[0]
    dist = distribution.Distribution(newAttr, newData)
    # Only values that actually occur in the data take part in merging.
    activeValues = []
    for i in range(len(newAttr.values)):
        if dist[newAttr.values[i]] > 0: activeValues.append(i)
    currScore = attr_score(newAttr, newData)
    while 1:
        # Try every ordered pair of active values; tentatively redirect
        # ind1 to ind2 in the lookup table, score, and restore.
        bestScore, bestMerge = currScore, None
        for i1, ind1 in enumerate(activeValues):
            oldInd1 = newAttr.get_value_from.lookupTable[ind1]
            for ind2 in activeValues[:i1]:
                newAttr.get_value_from.lookupTable[ind1] = ind2
                score = attr_score(newAttr, newData)
                # >= also accepts score-neutral merges, which reduces
                # the number of values without hurting the score.
                if score >= bestScore:
                    bestScore, bestMerge = score, (ind1, ind2)
                newAttr.get_value_from.lookupTable[ind1] = oldInd1

        if bestMerge:
            # Commit the best merge: redirect every entry mapped to ind1
            # onto ind2 and fold the value names together.
            ind1, ind2 = bestMerge
            currScore = bestScore
            for i, l in enumerate(newAttr.get_value_from.lookupTable):
                if not l.isSpecial() and int(l) == ind1:
                    newAttr.get_value_from.lookupTable[i] = ind2
            newAttr.values[ind2] = newAttr.values[ind2] + "+" + newAttr.values[ind1]
            del activeValues[activeValues.index(ind1)]
        else:
            # No merge improved (or matched) the score -- stop.
            break

    if not remove_unused_values:
        return newAttr

    # Build a reduced feature that keeps only the surviving values and
    # reuses the merged lookup table.
    reducedAttr = feature.Discrete.EnumVariable(newAttr.name, values = [newAttr.values[i] for i in activeValues])
    reducedAttr.get_value_from = newAttr.get_value_from
    reducedAttr.get_value_from.class_var = reducedAttr
    return reducedAttr
221
222######
223# from orngFSS
@deprecated_keywords({"measure": "score"})
def score_all(data, score=Relief(k=20, m=50)):
    """Assess the quality of features using the given measure and return
    a sorted list of tuples (feature name, measure).

    :param data: data table should include a discrete class.
    :type data: :obj:`~Orange.data.Table`
    :param score:  feature scoring function. Derived from
      :obj:`~Orange.feature.scoring.Score`. Defaults to
      :obj:`~Orange.feature.scoring.Relief` with k=20 and m=50.
    :type score: :obj:`~Orange.feature.scoring.Score`
    :rtype: :obj:`list`; a sorted list of tuples (feature name, score)

    """
    # NOTE(review): the default Relief instance is created once at import
    # time and shared by all calls -- presumably intentional, but worth
    # confirming if Relief ever holds per-call state.
    measl = [(attr.name, score(attr, data)) for attr in data.domain.attributes]
    # Sort best-first; the key-based sort replaces the Python-2-only
    # cmp-style comparator and yields the same (stable) ordering.
    measl.sort(key=lambda pair: pair[1], reverse=True)
    return measl
Note: See TracBrowser for help on using the repository browser.