| 1 | from Orange import core, feature |
|---|
| 2 | from Orange.statistics import contingency, distribution |
|---|
| 3 | |
|---|
| 4 | from Orange.misc import deprecated_keywords, deprecated_members |
|---|
| 5 | |
|---|
| 6 | Score = core.MeasureAttribute |
|---|
| 7 | ScoreFromProbabilities = core.MeasureAttributeFromProbabilities |
|---|
| 8 | InfoGain = core.MeasureAttribute_info |
|---|
| 9 | GainRatio = core.MeasureAttribute_gainRatio |
|---|
| 10 | Gini = core.MeasureAttribute_gini |
|---|
| 11 | Relevance = core.MeasureAttribute_relevance |
|---|
| 12 | Cost = core.MeasureAttribute_cost |
|---|
| 13 | Relief = core.MeasureAttribute_relief |
|---|
| 14 | MSE = core.MeasureAttribute_MSE |
|---|
| 15 | |
|---|
| 16 | ###### |
|---|
| 17 | # from orngEvalAttr.py |
|---|
| 18 | |
|---|
| 19 | class OrderAttributes: |
|---|
| 20 | """Orders features by their scores. |
|---|
| 21 | |
|---|
| 22 | .. attribute:: score |
|---|
| 23 | |
|---|
| 24 | A scoring method derived from :obj:`~Orange.feature.scoring.Score`. |
|---|
| 25 | If :obj:`None`, :obj:`Relief` with m=5 and k=10 is used. |
|---|
| 26 | |
|---|
| 27 | """ |
|---|
| 28 | def __init__(self, score=None): |
|---|
| 29 | self.score = score |
|---|
| 30 | |
|---|
| 31 | def __call__(self, data, weight): |
|---|
| 32 | """Score and order all features. |
|---|
| 33 | |
|---|
| 34 | :param data: a data table used to score features |
|---|
| 35 | :type data: :obj:`~Orange.data.Table` |
|---|
| 36 | |
|---|
| 37 | :param weight: meta attribute that stores weights of instances |
|---|
| 38 | :type weight: :obj:`~Orange.feature.Descriptor` |
|---|
| 39 | |
|---|
| 40 | """ |
|---|
| 41 | if self.score: |
|---|
| 42 | measure = self.score |
|---|
| 43 | else: |
|---|
| 44 | measure = Relief(m=5, k=10) |
|---|
| 45 | |
|---|
| 46 | measured = [(attr, measure(attr, data, None, weight)) for attr in data.domain.attributes] |
|---|
| 47 | measured.sort(lambda x, y: cmp(x[1], y[1])) |
|---|
| 48 | return [x[0] for x in measured] |
|---|
| 49 | |
|---|
| 50 | OrderAttributes = deprecated_members({ |
|---|
| 51 | "measure": "score", |
|---|
| 52 | }, wrap_methods=[])(OrderAttributes) |
|---|
| 53 | |
|---|
| 54 | class Distance(Score): |
|---|
| 55 | """The :math:`1-D` distance is defined as information gain divided |
|---|
| 56 | by joint entropy :math:`H_{CA}` (:math:`C` is the class variable |
|---|
| 57 | and :math:`A` the feature): |
|---|
| 58 | |
|---|
| 59 | .. math:: |
|---|
| 60 | 1-D(C,A) = \\frac{\\mathrm{Gain}(A)}{H_{CA}} |
|---|
| 61 | """ |
|---|
| 62 | |
|---|
| 63 | @deprecated_keywords({"aprioriDist": "apriori_dist"}) |
|---|
| 64 | def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None): |
|---|
| 65 | self = Score.__new__(cls) |
|---|
| 66 | if attr is not None and data is not None: |
|---|
| 67 | #self.__init__(**argkw) |
|---|
| 68 | return self.__call__(attr, data, apriori_dist, weightID) |
|---|
| 69 | else: |
|---|
| 70 | return self |
|---|
| 71 | |
|---|
| 72 | @deprecated_keywords({"aprioriDist": "apriori_dist"}) |
|---|
| 73 | def __call__(self, attr, data, apriori_dist=None, weightID=None): |
|---|
| 74 | """Score the given feature. |
|---|
| 75 | |
|---|
| 76 | :param attr: feature to score |
|---|
| 77 | :type attr: :obj:`~Orange.feature.Descriptor` |
|---|
| 78 | |
|---|
| 79 | :param data: a data table used to score features |
|---|
| 80 | :type data: :obj:`~Orange.data.Table` |
|---|
| 81 | |
|---|
| 82 | :param apriori_dist: |
|---|
| 83 | :type apriori_dist: |
|---|
| 84 | |
|---|
| 85 | :param weightID: meta feature used to weight individual data instances |
|---|
| 86 | :type weightID: :obj:`~Orange.feature.Descriptor` |
|---|
| 87 | |
|---|
| 88 | """ |
|---|
| 89 | import numpy |
|---|
| 90 | from orngContingency import Entropy #TODO: Move to new hierarchy |
|---|
| 91 | if attr in data.domain: # if we receive attr as string we have to convert to variable |
|---|
| 92 | attr = data.domain[attr] |
|---|
| 93 | attrClassCont = contingency.VarClass(attr, data) |
|---|
| 94 | dist = [] |
|---|
| 95 | for vals in attrClassCont.values(): |
|---|
| 96 | dist += list(vals) |
|---|
| 97 | classAttrEntropy = Entropy(numpy.array(dist)) |
|---|
| 98 | infoGain = InfoGain(attr, data) |
|---|
| 99 | if classAttrEntropy > 0: |
|---|
| 100 | return float(infoGain) / classAttrEntropy |
|---|
| 101 | else: |
|---|
| 102 | return 0 |
|---|
| 103 | |
|---|
| 104 | class MDL(Score): |
|---|
| 105 | """Minimum description length principle [Kononenko1995]_. Let |
|---|
| 106 | :math:`n` be the number of instances, :math:`n_0` the number of |
|---|
| 107 | classes, and :math:`n_{cj}` the number of instances with feature |
|---|
| 108 | value :math:`j` and class value :math:`c`. Then MDL score for the |
|---|
| 109 | feature A is |
|---|
| 110 | |
|---|
| 111 | .. math:: |
|---|
| 112 | \mathrm{MDL}(A) = \\frac{1}{n} \\Bigg[ |
|---|
| 113 | \\log\\binom{n}{n_{1.},\\cdots,n_{n_0 .}} - \\sum_j |
|---|
| 114 | \\log \\binom{n_{.j}}{n_{1j},\\cdots,n_{n_0 j}} \\\\ |
|---|
| 115 | + \\log \\binom{n+n_0-1}{n_0-1} - \\sum_j \\log |
|---|
| 116 | \\binom{n_{.j}+n_0-1}{n_0-1} |
|---|
| 117 | \\Bigg] |
|---|
| 118 | """ |
|---|
| 119 | |
|---|
| 120 | @deprecated_keywords({"aprioriDist": "apriori_dist"}) |
|---|
| 121 | def __new__(cls, attr=None, data=None, apriori_dist=None, weightID=None): |
|---|
| 122 | self = Score.__new__(cls) |
|---|
| 123 | if attr is not None and data is not None: |
|---|
| 124 | #self.__init__(**argkw) |
|---|
| 125 | return self.__call__(attr, data, apriori_dist, weightID) |
|---|
| 126 | else: |
|---|
| 127 | return self |
|---|
| 128 | |
|---|
| 129 | @deprecated_keywords({"aprioriDist": "apriori_dist"}) |
|---|
| 130 | def __call__(self, attr, data, apriori_dist=None, weightID=None): |
|---|
| 131 | """Score the given feature. |
|---|
| 132 | |
|---|
| 133 | :param attr: feature to score |
|---|
| 134 | :type attr: :obj:`~Orange.feature.Descriptor` |
|---|
| 135 | |
|---|
| 136 | :param data: a data table used to score the feature |
|---|
| 137 | :type data: :obj:`~Orange.data.Table` |
|---|
| 138 | |
|---|
| 139 | :param apriori_dist: |
|---|
| 140 | :type apriori_dist: |
|---|
| 141 | |
|---|
| 142 | :param weightID: meta feature used to weight individual data instances |
|---|
| 143 | :type weightID: :obj:`~Orange.feature.Descriptor` |
|---|
| 144 | |
|---|
| 145 | """ |
|---|
| 146 | attrClassCont = contingency.VarClass(attr, data) |
|---|
| 147 | classDist = distribution.Distribution(data.domain.classVar, data).values() |
|---|
| 148 | nCls = len(classDist) |
|---|
| 149 | nEx = len(data) |
|---|
| 150 | priorMDL = _logMultipleCombs(nEx, classDist) + _logMultipleCombs(nEx+nCls-1, [nEx, nCls-1]) |
|---|
| 151 | postPart1 = [_logMultipleCombs(sum(attrClassCont[key]), attrClassCont[key].values()) for key in attrClassCont.keys()] |
|---|
| 152 | postPart2 = [_logMultipleCombs(sum(attrClassCont[key])+nCls-1, [sum(attrClassCont[key]), nCls-1]) for key in attrClassCont.keys()] |
|---|
| 153 | ret = priorMDL |
|---|
| 154 | for val in postPart1 + postPart2: |
|---|
| 155 | ret -= val |
|---|
| 156 | return ret / max(1, nEx) |
|---|
| 157 | |
|---|
| 158 | # compute n! / k1! * k2! * k3! * ... kc! |
|---|
| 159 | # ks = [k1, k2, ...] |
|---|
| 160 | def _logMultipleCombs(n, ks): |
|---|
| 161 | import math |
|---|
| 162 | m = max(ks) |
|---|
| 163 | ks.remove(m) |
|---|
| 164 | resArray = [] |
|---|
| 165 | for (start, end) in [(m+1, n+1)] + [(1, k+1) for k in ks]: |
|---|
| 166 | ret = 0 |
|---|
| 167 | curr = 1 |
|---|
| 168 | for val in range(int(start), int(end)): |
|---|
| 169 | curr *= val |
|---|
| 170 | if curr > 1e40: |
|---|
| 171 | ret += math.log(curr) |
|---|
| 172 | curr = 1 |
|---|
| 173 | ret += math.log(curr) |
|---|
| 174 | resArray.append(ret) |
|---|
| 175 | ret = resArray[0] |
|---|
| 176 | for val in resArray[1:]: |
|---|
| 177 | ret -= val |
|---|
| 178 | return ret |
|---|
| 179 | |
|---|
| 180 | |
|---|
| 181 | @deprecated_keywords({"attrList": "attr_list", "attrMeasure": "attr_score", "removeUnusedValues": "remove_unused_values"}) |
|---|
| 182 | def merge_values(data, attr_list, attr_score, remove_unused_values = 1): |
|---|
| 183 | import orngCI |
|---|
| 184 | #data = data.select([data.domain[attr] for attr in attr_list] + [data.domain.classVar]) |
|---|
| 185 | newData = data.select(attr_list + [data.domain.class_var]) |
|---|
| 186 | newAttr = orngCI.FeatureByCartesianProduct(newData, attr_list)[0] |
|---|
| 187 | dist = distribution.Distribution(newAttr, newData) |
|---|
| 188 | activeValues = [] |
|---|
| 189 | for i in range(len(newAttr.values)): |
|---|
| 190 | if dist[newAttr.values[i]] > 0: activeValues.append(i) |
|---|
| 191 | currScore = attr_score(newAttr, newData) |
|---|
| 192 | while 1: |
|---|
| 193 | bestScore, bestMerge = currScore, None |
|---|
| 194 | for i1, ind1 in enumerate(activeValues): |
|---|
| 195 | oldInd1 = newAttr.get_value_from.lookupTable[ind1] |
|---|
| 196 | for ind2 in activeValues[:i1]: |
|---|
| 197 | newAttr.get_value_from.lookupTable[ind1] = ind2 |
|---|
| 198 | score = attr_score(newAttr, newData) |
|---|
| 199 | if score >= bestScore: |
|---|
| 200 | bestScore, bestMerge = score, (ind1, ind2) |
|---|
| 201 | newAttr.get_value_from.lookupTable[ind1] = oldInd1 |
|---|
| 202 | |
|---|
| 203 | if bestMerge: |
|---|
| 204 | ind1, ind2 = bestMerge |
|---|
| 205 | currScore = bestScore |
|---|
| 206 | for i, l in enumerate(newAttr.get_value_from.lookupTable): |
|---|
| 207 | if not l.isSpecial() and int(l) == ind1: |
|---|
| 208 | newAttr.get_value_from.lookupTable[i] = ind2 |
|---|
| 209 | newAttr.values[ind2] = newAttr.values[ind2] + "+" + newAttr.values[ind1] |
|---|
| 210 | del activeValues[activeValues.index(ind1)] |
|---|
| 211 | else: |
|---|
| 212 | break |
|---|
| 213 | |
|---|
| 214 | if not remove_unused_values: |
|---|
| 215 | return newAttr |
|---|
| 216 | |
|---|
| 217 | reducedAttr = feature.Discrete.EnumVariable(newAttr.name, values = [newAttr.values[i] for i in activeValues]) |
|---|
| 218 | reducedAttr.get_value_from = newAttr.get_value_from |
|---|
| 219 | reducedAttr.get_value_from.class_var = reducedAttr |
|---|
| 220 | return reducedAttr |
|---|
| 221 | |
|---|
| 222 | ###### |
|---|
| 223 | # from orngFSS |
|---|
| 224 | @deprecated_keywords({"measure": "score"}) |
|---|
| 225 | def score_all(data, score=Relief(k=20, m=50)): |
|---|
| 226 | """Assess the quality of features using the given measure and return |
|---|
| 227 | a sorted list of tuples (feature name, measure). |
|---|
| 228 | |
|---|
| 229 | :param data: data table should include a discrete class. |
|---|
| 230 | :type data: :obj:`~Orange.data.Table` |
|---|
| 231 | :param score: feature scoring function. Derived from |
|---|
| 232 | :obj:`~Orange.feature.scoring.Score`. Defaults to |
|---|
| 233 | :obj:`~Orange.feature.scoring.Relief` with k=20 and m=50. |
|---|
| 234 | :type score: :obj:`~Orange.feature.scoring.Score` |
|---|
| 235 | :rtype: :obj:`list`; a sorted list of tuples (feature name, score) |
|---|
| 236 | |
|---|
| 237 | """ |
|---|
| 238 | measl=[] |
|---|
| 239 | for i in data.domain.attributes: |
|---|
| 240 | measl.append((i.name, score(i, data))) |
|---|
| 241 | measl.sort(lambda x,y:cmp(y[1], x[1])) |
|---|
| 242 | return measl |
|---|