source: orange/Orange/feature/scoring.py @ 10543:bac3278406d5

Revision 10543:bac3278406d5, 8.7 KB checked in by anzeh <anze.staric@…>, 2 years ago (diff)

Merge

Line 
1from Orange import core, feature
2from Orange.statistics import contingency, distribution
3
4from Orange.misc import deprecated_keywords, deprecated_members
5
# Backward-compatible aliases: the scoring measures are implemented in
# Orange.core (C++ layer); re-export them here under the names used by the
# new Orange.feature.scoring hierarchy.
Score = core.MeasureAttribute
ScoreFromProbabilities = core.MeasureAttributeFromProbabilities
InfoGain = core.MeasureAttribute_info
GainRatio = core.MeasureAttribute_gainRatio
Gini = core.MeasureAttribute_gini
Relevance = core.MeasureAttribute_relevance
Cost = core.MeasureAttribute_cost
Relief = core.MeasureAttribute_relief
MSE = core.MeasureAttribute_MSE
15
16######
17# from orngEvalAttr.py
18
19class OrderAttributes:
20    """Orders features by their scores.
21   
22    .. attribute::  score
23   
24        A scoring method derived from :obj:`~Orange.feature.scoring.Score`.
25        If :obj:`None`, :obj:`Relief` with m=5 and k=10 is used.
26   
27    """
28    def __init__(self, score=None):
29        self.score = score
30
31    def __call__(self, data, weight):
32        """Score and order all features.
33
34        :param data: a data table used to score features
35        :type data: :obj:`~Orange.data.Table`
36
37        :param weight: meta attribute that stores weights of instances
38        :type weight: :obj:`~Orange.feature.Descriptor`
39
40        """
41        if self.score:
42            measure = self.score
43        else:
44            measure = Relief(m=5, k=10)
45
46        measured = [(attr, measure(attr, data, None, weight)) for attr in data.domain.attributes]
47        measured.sort(lambda x, y: cmp(x[1], y[1]))
48        return [x[0] for x in measured]
49
# Re-wrap the class so the old attribute name "measure" keeps working as a
# deprecated alias for "score".
OrderAttributes = deprecated_members({
          "measure": "score",
}, wrap_methods=[])(OrderAttributes)
53
class Distance(Score):
    """The :math:`1-D` distance is defined as information gain divided
    by joint entropy :math:`H_{CA}` (:math:`C` is the class variable
    and :math:`A` the feature):

    .. math::
        1-D(C,A) = \\frac{\\mathrm{Gain}(A)}{H_{CA}}
    """

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        instance = Score.__new__(cls)
        if attr is None or data is None:
            # No data supplied: behave like a plain constructor.
            return instance
        # Constructed with a feature and data: score immediately and
        # return the score instead of the instance.
        return instance.__call__(attr, data, apriori_dist, weightID)

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: :obj:`~Orange.feature.Descriptor`

        :param data: a data table used to score features
        :type data: :obj:`~Orange.data.Table`

        :param apriori_dist:
        :type apriori_dist:
       
        :param weightID: meta feature used to weight individual data instances
        :type weightID: :obj:`~Orange.feature.Descriptor`

        """
        import numpy
        from orngContingency import Entropy #TODO: Move to new hierarchy
        # A feature given by name is first looked up in the domain.
        if attr in data.domain:
            attr = data.domain[attr]
        cont = contingency.VarClass(attr, data)
        # Flatten the feature-by-class contingency into a single joint
        # distribution for the entropy computation.
        joint = []
        for column in cont.values():
            joint.extend(column)
        jointEntropy = Entropy(numpy.array(joint))
        gain = InfoGain(attr, data)
        # Guard against division by zero for degenerate distributions.
        return float(gain) / jointEntropy if jointEntropy > 0 else 0
103
class MDL(Score):
    """Minimum description length principle [Kononenko1995]_. Let
    :math:`n` be the number of instances, :math:`n_0` the number of
    classes, and :math:`n_{cj}` the number of instances with feature
    value :math:`j` and class value :math:`c`. Then MDL score for the
    feature A is

    .. math::
         \mathrm{MDL}(A) = \\frac{1}{n} \\Bigg[
         \\log\\binom{n}{n_{1.},\\cdots,n_{n_0 .}} - \\sum_j
         \\log \\binom{n_{.j}}{n_{1j},\\cdots,n_{n_0 j}} \\\\
         + \\log \\binom{n+n_0-1}{n_0-1} - \\sum_j \\log
         \\binom{n_{.j}+n_0-1}{n_0-1}
         \\Bigg]
    """

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None):
        instance = Score.__new__(cls)
        if attr is None or data is None:
            # No data supplied: behave like a plain constructor.
            return instance
        # Constructed with a feature and data: score immediately and
        # return the score instead of the instance.
        return instance.__call__(attr, data, apriori_dist, weightID)

    @deprecated_keywords({"aprioriDist": "apriori_dist"})
    def __call__(self, attr, data, apriori_dist=None, weightID=None):
        """Score the given feature.

        :param attr: feature to score
        :type attr: :obj:`~Orange.feature.Descriptor`

        :param data: a data table used to score the feature
        :type data: :obj:`~Orange.data.Table`

        :param apriori_dist:
        :type apriori_dist:
       
        :param weightID: meta feature used to weight individual data instances
        :type weightID: :obj:`~Orange.feature.Descriptor`

        """
        cont = contingency.VarClass(attr, data)
        classDist = distribution.Distribution(data.domain.classVar, data).values()
        nCls = len(classDist)
        nEx = len(data)
        # Description length of the class labels before the split ...
        score = _logMultipleCombs(nEx, classDist) \
                + _logMultipleCombs(nEx + nCls - 1, [nEx, nCls - 1])
        # ... minus the lengths of the per-feature-value partitions
        # (two passes, matching the original summation order).
        for key in cont.keys():
            score -= _logMultipleCombs(sum(cont[key]), cont[key].values())
        for key in cont.keys():
            score -= _logMultipleCombs(sum(cont[key]) + nCls - 1,
                                       [sum(cont[key]), nCls - 1])
        return score / max(1, nEx)
157
158# compute n! / k1! * k2! * k3! * ... kc!
159# ks = [k1, k2, ...]
160def _logMultipleCombs(n, ks):
161    import math
162    m = max(ks)
163    ks.remove(m)
164    resArray = []
165    for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]:
166        ret = 0
167        curr = 1
168        for val in range(int(start), int(end)):
169            curr *= val
170            if curr > 1e40:
171                ret += math.log(curr)
172                curr = 1
173        ret += math.log(curr)
174        resArray.append(ret)
175    ret = resArray[0]
176    for val in resArray[1:]:
177        ret -= val
178    return ret
179
180
@deprecated_keywords({"attrList": "attr_list", "attrMeasure": "attr_score", "removeUnusedValues": "remove_unused_values"})
def merge_values(data, attr_list, attr_score, remove_unused_values = 1):
    """Build the Cartesian product of the features in ``attr_list`` and
    greedily merge its values while doing so does not decrease the score.

    Candidate merges are tried by redirecting one value to another in the
    product feature's ``lookupTable``; the best-scoring merge is kept and
    the process repeats until no merge scores at least as well as the
    current feature.

    :param data: data table the features belong to
    :param attr_list: features to combine into a Cartesian product
    :param attr_score: scoring function called as ``attr_score(attr, data)``
    :param remove_unused_values: if true (default), return a new variable
        containing only the values that survived merging; otherwise return
        the product feature with its mutated lookup table
    """
    import orngCI
    #data = data.select([data.domain[attr] for attr in attr_list] + [data.domain.classVar])
    newData = data.select(attr_list + [data.domain.class_var])
    newAttr = orngCI.FeatureByCartesianProduct(newData, attr_list)[0]
    dist = distribution.Distribution(newAttr, newData)
    # Only values that actually occur in the data take part in merging.
    activeValues = []
    for i in range(len(newAttr.values)):
        if dist[newAttr.values[i]] > 0: activeValues.append(i)
    currScore = attr_score(newAttr, newData)
    while 1:
        bestScore, bestMerge = currScore, None
        # Try every ordered pair of active values: temporarily redirect
        # ind1 to ind2 in the lookup table, score, then restore.
        for i1, ind1 in enumerate(activeValues):
            oldInd1 = newAttr.get_value_from.lookupTable[ind1]
            for ind2 in activeValues[:i1]:
                newAttr.get_value_from.lookupTable[ind1] = ind2
                score = attr_score(newAttr, newData)
                # >= keeps score-neutral merges, so values are merged as
                # long as the score does not drop.
                if score >= bestScore:
                    bestScore, bestMerge = score, (ind1, ind2)
                newAttr.get_value_from.lookupTable[ind1] = oldInd1

        if bestMerge:
            # Commit the best merge: permanently redirect every table entry
            # mapping to ind1 onto ind2 and retire ind1.
            ind1, ind2 = bestMerge
            currScore = bestScore
            for i, l in enumerate(newAttr.get_value_from.lookupTable):
                if not l.isSpecial() and int(l) == ind1:
                    newAttr.get_value_from.lookupTable[i] = ind2
            newAttr.values[ind2] = newAttr.values[ind2] + "+" + newAttr.values[ind1]
            del activeValues[activeValues.index(ind1)]
        else:
            break

    if not remove_unused_values:
        return newAttr

    # Rebuild the variable with only the surviving (active) values.
    reducedAttr = feature.Discrete.EnumVariable(newAttr.name, values = [newAttr.values[i] for i in activeValues])
    reducedAttr.get_value_from = newAttr.get_value_from
    reducedAttr.get_value_from.class_var = reducedAttr
    return reducedAttr
221
222######
223# from orngFSS
@deprecated_keywords({"measure": "score"})
def score_all(data, score=Relief(k=20, m=50)):
    """Assess the quality of features using the given measure and return
    a sorted list of tuples (feature name, measure).

    :param data: data table should include a discrete class.
    :type data: :obj:`~Orange.data.Table`
    :param score:  feature scoring function. Derived from
      :obj:`~Orange.feature.scoring.Score`. Defaults to
      :obj:`~Orange.feature.scoring.Relief` with k=20 and m=50.
    :type score: :obj:`~Orange.feature.scoring.Score`
    :rtype: :obj:`list`; a sorted list of tuples (feature name, score)

    """
    measured = [(attr.name, score(attr, data)) for attr in data.domain.attributes]
    # Best (highest) score first.  The key-based sort is equivalent to the
    # old Python-2-only cmp form and preserves its stable ordering for ties.
    measured.sort(key=lambda item: item[1], reverse=True)
    return measured
Note: See TracBrowser for help on using the repository browser.