source: orange/Orange/feature/scoring.py @ 10524:a83ea32a7108

Revision 10524:a83ea32a7108, 8.8 KB checked in by Ales Erjavec <ales.erjavec@…>, 2 years ago (diff)

Doc fix for score_all.

Line 
1import Orange.core as orange
2import Orange.misc
3
4from orange import MeasureAttribute as Score
5from orange import MeasureAttributeFromProbabilities as ScoreFromProbabilities
6from orange import MeasureAttribute_info as InfoGain
7from orange import MeasureAttribute_gainRatio as GainRatio
8from orange import MeasureAttribute_gini as Gini
9from orange import MeasureAttribute_relevance as Relevance
10from orange import MeasureAttribute_cost as Cost
11from orange import MeasureAttribute_relief as Relief
12from orange import MeasureAttribute_MSE as MSE
13
14######
15# from orngEvalAttr.py
16
class OrderAttributes:
    """Orders features by their scores.

    .. attribute::  score

        A scoring method derived from :obj:`~Orange.feature.scoring.Score`.
        If :obj:`None`, :obj:`Relief` with m=5 and k=10 is used.

    """
    def __init__(self, score=None):
        self.score = score

    def __call__(self, data, weight):
        """Score all features and return them ordered by ascending score.

        :param data: a data table used to score features
        :type data: Orange.data.Table

        :param weight: meta attribute that stores weights of instances
        :type weight: Orange.feature.Descriptor

        :returns: the features of ``data.domain.attributes``, lowest
            score first; features with equal scores keep their domain
            order.
        :rtype: list

        """
        # Any falsy ``score`` (not only None) falls back to the default.
        measure = self.score or Relief(m=5, k=10)

        # The scorer is called as measure(feature, data, apriori, weight);
        # no apriori distribution is supplied here.
        measured = [(attr, measure(attr, data, None, weight))
                    for attr in data.domain.attributes]
        # A key-based sort is stable and yields the same order as the
        # former cmp-function sort on the score.
        measured.sort(key=lambda pair: pair[1])
        return [attr for attr, _ in measured]
47
# Re-bind the class through Orange.misc.deprecated_members so that the
# old member name "measure" is mapped to "score"; no methods are wrapped.
_ORDER_ATTRIBUTES_RENAMES = {"measure": "score"}
OrderAttributes = Orange.misc.deprecated_members(
    _ORDER_ATTRIBUTES_RENAMES, wrap_methods=[]
)(OrderAttributes)
51
class Distance(Score):
    """The :math:`1-D` distance is defined as information gain divided
    by joint entropy :math:`H_{CA}` (:math:`C` is the class variable
    and :math:`A` the feature):

    .. math::
        1-D(C,A) = \\frac{\\mathrm{Gain}(A)}{H_{CA}}
    """

    @Orange.misc.deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        """Create a scorer, or score immediately.

        When both ``attr`` and ``data`` are given, the score is computed
        and returned instead of a :obj:`Distance` instance.
        """
        self = Score.__new__(cls)
        # Identity checks: ``!= None`` would go through the operands'
        # own comparison methods.
        if attr is not None and data is not None:
            return self.__call__(attr, data, apriori_dist, weightID)
        return self

    @Orange.misc.deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: Orange.feature.Descriptor

        :param data: a data table used to score features
        :type data: Orange.data.Table

        :param apriori_dist: accepted for interface compatibility
        :type apriori_dist:

        :param weightID: meta feature used to weight individual data instances
        :type weightID: Orange.feature.Descriptor

        :returns: information gain divided by the entropy of the
            feature/class contingency counts, or 0.0 when that entropy
            is not positive.
        :rtype: float

        """
        # NOTE(review): apriori_dist and weightID are not passed on to any
        # of the calls below, so scoring here is unweighted -- confirm
        # whether that is intended.
        import numpy
        from orngContingency import Entropy
        if attr in data.domain:  # if we receive attr as string we have to convert to variable
            attr = data.domain[attr]
        contingency = orange.ContingencyAttrClass(attr, data)
        # Flatten the per-value class distributions into one list of counts.
        joint_counts = [count
                        for class_counts in contingency.values()
                        for count in class_counts]
        joint_entropy = Entropy(numpy.array(joint_counts))
        if joint_entropy > 0:
            return float(InfoGain(attr, data)) / joint_entropy
        return 0.0
101
class MDL(Score):
    """Minimum description length principle [Kononenko1995]_. Let
    :math:`n` be the number of instances, :math:`n_0` the number of
    classes, and :math:`n_{cj}` the number of instances with feature
    value :math:`j` and class value :math:`c`. Then MDL score for the
    feature A is

    .. math::
         \\mathrm{MDL}(A) = \\frac{1}{n} \\Bigg[
         \\log\\binom{n}{n_{1.},\\cdots,n_{n_0 .}} - \\sum_j
         \\log \\binom{n_{.j}}{n_{1j},\\cdots,n_{n_0 j}} \\\\
         + \\log \\binom{n+n_0-1}{n_0-1} - \\sum_j \\log
         \\binom{n_{.j}+n_0-1}{n_0-1}
         \\Bigg]
    """

    @Orange.misc.deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        """Create a scorer, or score immediately.

        When both ``attr`` and ``data`` are given, the score is computed
        and returned instead of an :obj:`MDL` instance.
        """
        self = Score.__new__(cls)
        # Identity checks: ``!= None`` would go through the operands'
        # own comparison methods.
        if attr is not None and data is not None:
            return self.__call__(attr, data, apriori_dist, weightID)
        return self

    @Orange.misc.deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: Orange.feature.Descriptor

        :param data: a data table used to score the feature
        :type data: Orange.data.Table

        :param apriori_dist: accepted for interface compatibility
        :type apriori_dist:

        :param weightID: meta feature used to weight individual data instances
        :type weightID: Orange.feature.Descriptor

        :returns: prior description length minus the per-value posterior
            terms, divided by ``max(1, len(data))``.

        """
        # NOTE(review): apriori_dist and weightID are not passed on to any
        # of the calls below, so scoring here is unweighted -- confirm
        # whether that is intended.
        contingency = orange.ContingencyAttrClass(attr, data)
        class_dist = orange.Distribution(data.domain.classVar, data).values()
        n_classes = len(class_dist)
        n_instances = len(data)
        prior_mdl = (_logMultipleCombs(n_instances, class_dist)
                     + _logMultipleCombs(n_instances + n_classes - 1,
                                         [n_instances, n_classes - 1]))

        # One class distribution per feature value, in key order.
        value_dists = [contingency[key] for key in contingency.keys()]
        post_part1 = [_logMultipleCombs(sum(dist), dist.values())
                      for dist in value_dists]
        post_part2 = [_logMultipleCombs(sum(dist) + n_classes - 1,
                                        [sum(dist), n_classes - 1])
                      for dist in value_dists]

        score = prior_mdl
        for term in post_part1 + post_part2:
            score -= term
        # max(1, ...) avoids dividing by zero on an empty table.
        return score / max(1, n_instances)
155
156# compute n! / k1! * k2! * k3! * ... kc!
157# ks = [k1, k2, ...]
158def _logMultipleCombs(n, ks):
159    import math
160    m = max(ks)
161    ks.remove(m)
162    resArray = []
163    for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]:
164        ret = 0
165        curr = 1
166        for val in range(int(start), int(end)):
167            curr *= val
168            if curr > 1e40:
169                ret += math.log(curr)
170                curr = 1
171        ret += math.log(curr)
172        resArray.append(ret)
173    ret = resArray[0]
174    for val in resArray[1:]:
175        ret -= val
176    return ret
177
178
@Orange.misc.deprecated_keywords({"attrList": "attr_list", "attrMeasure": "attr_score", "removeUnusedValues": "remove_unused_values"})
def merge_values(data, attr_list, attr_score, remove_unused_values=1):
    """Join features into one and greedily merge its values.

    A Cartesian-product feature of ``attr_list`` is built on ``data``
    restricted to those features plus the class. Pairs of its values
    that occur in the data are then merged one pair per round, each time
    taking the merge with the best ``attr_score`` among those that do
    not score below the current value, until no such merge exists.

    :param data: data table containing ``attr_list`` and a class
    :param attr_list: list of features to combine
    :param attr_score: callable ``attr_score(feature, data)`` returning
        a comparable score
    :param remove_unused_values: if false, return the product feature
        itself (merged value names are joined with "+"); otherwise
        return a new :obj:`orange.EnumVariable` holding only the values
        still in use
    :returns: the constructed feature
    """
    import orngCI
    subset = data.select(attr_list + [data.domain.class_var])
    joined = orngCI.FeatureByCartesianProduct(subset, attr_list)[0]
    value_dist = orange.Distribution(joined, subset)

    # Indices of product values that actually occur in the data.
    in_use = [idx for idx, value in enumerate(joined.values)
              if value_dist[value] > 0]

    current = attr_score(joined, subset)
    while True:
        top_score, top_pair = current, None
        for pos, src in enumerate(in_use):
            saved = joined.get_value_from.lookupTable[src]
            for dst in in_use[:pos]:
                # Tentatively map src onto dst, score, then undo.
                joined.get_value_from.lookupTable[src] = dst
                trial = attr_score(joined, subset)
                if trial >= top_score:
                    top_score, top_pair = trial, (src, dst)
                joined.get_value_from.lookupTable[src] = saved

        if not top_pair:
            break

        src, dst = top_pair
        current = top_score
        # Make the winning merge permanent for every table entry that
        # pointed at src.
        for pos, entry in enumerate(joined.get_value_from.lookupTable):
            if not entry.isSpecial() and int(entry) == src:
                joined.get_value_from.lookupTable[pos] = dst
        joined.values[dst] = joined.values[dst] + "+" + joined.values[src]
        in_use.remove(src)

    if not remove_unused_values:
        return joined

    kept_values = [joined.values[idx] for idx in in_use]
    reduced = orange.EnumVariable(joined.name, values=kept_values)
    reduced.get_value_from = joined.get_value_from
    reduced.get_value_from.class_var = reduced
    return reduced
219
220######
221# from orngFSS
@Orange.misc.deprecated_keywords({"measure": "score"})
def score_all(data, score=Relief(k=20, m=50)):
    """Assess the quality of features using the given measure and return
    a sorted list of tuples (feature name, measure).

    :param data: data table should include a discrete class.
    :type data: :obj:`Orange.data.Table`
    :param score:  feature scoring function. Derived from
      :obj:`~Orange.feature.scoring.Score`. Defaults to
      :obj:`~Orange.feature.scoring.Relief` with k=20 and m=50.
    :type score: :obj:`~Orange.feature.scoring.Score`
    :rtype: :obj:`list`; a sorted (by descending score) list of
      tuples (feature name, score)

    """
    # NOTE: the default Relief instance is created once, when this
    # function is defined, and shared by every call that omits ``score``.
    scored = [(attr.name, score(attr, data))
              for attr in data.domain.attributes]
    # Stable descending sort: features with equal scores keep their
    # domain order, as with the former cmp-function sort.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
Note: See TracBrowser for help on using the repository browser.