source: orange/orange/orngMosaic.py @ 8042:ffcb93bc9028

Revision 8042:ffcb93bc9028, 55.2 KB, checked in by markotoplak, 3 years ago

Hierarchical clustering: also catch RuntimeError when importing matplotlib (or the documentation could not be built on server).

1from orngCI import FeatureByCartesianProduct, FeatureByIM
2from orngEvalAttr import MeasureAttribute_Distance, MeasureAttribute_MDL, mergeAttrValues
3from orngCN2 import CN2UnorderedLearner
4from orngContingency import Entropy
5from math import sqrt, log, e
6from orngScaleData import getVariableValuesSorted, discretizeDomain
7import orange, orngVisFuncts
8import time, operator, numpy, sys
9import orngTest, orngStat, statc
10from copy import copy
11from orngMisc import LimitedCounter
12
13# quality measures
14CHI_SQUARE = 0
15CHI_SQUARE_CLASS = 1
16CRAMERS_PHI_CLASS = 2
17INFORMATION_GAIN = 3
18GAIN_RATIO = 4
19DISTANCE_MEASURE = 5
20MDL = 6
21INTERACTION_GAIN = 7
22AVERAGE_PROBABILITY_OF_CORRECT_CLASSIFICATION = 8
23GINI_INDEX = 9
24CN2_RULES = 10
25
26# conditional probability estimation
27RELATIVE = 0
28LAPLACE = 1
29M_ESTIMATE = 2
30
31# optimization type
32EXACT_NUMBER_OF_ATTRS = 0
33MAXIMUM_NUMBER_OF_ATTRS = 1
34
# attribute evaluation measures (used for attrDisc)
36MEAS_NONE = 0
37MEAS_RELIEFF = 1
38MEAS_GAIN_RATIO = 2
39MEAS_GINI = 3
40MEAS_DISTANCE = 4
41MEAS_MDL = 5
42
43# items in the results list
44SCORE = 0
45ATTR_LIST = 1
46TRY_INDEX = 2
47EXTRA_INFO = 3
48
49CROSSVALIDATION = 0
50PROPORTION_TEST = 1
51
52AO_CROSSVALIDATION = 0
53AO_TESTONTRAIN = 1
54
55# classification method
56MOS_TOPPROJ = 0
57MOS_SEMINAIVE = 1
58MOS_COMBINING = 2
59
60discMeasures = [("None", None),
61                ("ReliefF", orange.MeasureAttribute_relief(k=10, m=50)),
62                ("Gain ratio", orange.MeasureAttribute_gainRatio()),
63                ("Gini index", orange.MeasureAttribute_gini()),
64                ("Distance", MeasureAttribute_Distance()),
65                ("Minimum description length", MeasureAttribute_MDL())]
66
67def norm_factor(p):
68    max = 10.
69    min = -10.
70    z = 0.
71    eps = 0.001
72    while ((max-min)>eps):
73        pval = statc.zprob(z)
74        if pval>p:
75            max = z
76        if pval<p:
77            min = z
78        z = (max + min)/2.
79    return z
80
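# norm_factor() inverts statc.zprob -- which, as its use here suggests, behaves as
# the standard normal CDF -- by bisection: it returns z such that zprob(z) ~ p.
# A minimal self-check sketch; the function name below is ours and is not part of
# the original module:
def _check_norm_factor(p=0.95, tol=0.01):
    z = norm_factor(p)                      # p = 0.95 should give z close to 1.645
    return abs(statc.zprob(z) - p) < tol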
81
82class orngMosaic:
83    def __init__(self):
84        self.attributeCount = 2
85        self.optimizationType = MAXIMUM_NUMBER_OF_ATTRS
86        self.qualityMeasure = MDL
87        self.attrDisc = MEAS_MDL # MDL
88        self.percentDataUsed = 100
89
        self.ignoreTooSmallCells = 1    # when computing chi-square and Cramér's phi, ignore cells with fewer than 5 elements
91
92        self.classificationMethod = MOS_SEMINAIVE
93        self.testingMethod = CROSSVALIDATION
94        self.attributeOrderTestingMethod = AO_TESTONTRAIN
95        self.mValue = 2.0
96        self.probabilityEstimation = M_ESTIMATE
97        self.learnerName = "Mosaic Classifier"
98        self.saveSettingsList = ["attrDisc", "qualityMeasure", "percentDataUsed"]       # this is the default list of settings to save when saving interesting projections. change the list to get different behavior
99
100        # classification
101        self.clsTau = 0.8               # semi naive bayes parameter
102        self.clsTopProjCount = 10       # number of top projections considered in classification
103        self.classConfidence = 90       # parameter in the combining way of classification
104
105        self.timeLimit = 0              # if greater than 0 then this is the number of minutes that VizRank will use to evaluate projections
106        self.projectionLimit = 0        # if greater than 0 then this is the number of projections that will be evaluated with VizRank
107        self.evaluatedProjectionsCount = 0
108
109        self.evaluatedAttributes = None   # save last evaluated attributes
110        self.cancelOptimization = 0           # used to stop attribute and value order
111        self.cancelEvaluation = 0
112        self.cancelTreeBuilding = 0        # used in mosaic tree building
113        self.attributeDistributions = {}
114
115        self.data = None
116        self.results = []
117        self.shownResults = []
118        self.attrLenDict = {}
119
120    def clearResults(self):
121        self.results = []
122
123    def setData(self, data, removeUnusedValues = 0):
124        self.evaluatedAttributes = None
125        self.aprioriDistribution = None
126        self.aprioriProbabilities = None
127        self.attributeNameIndex = {}
128        self.attributeDistributions = {}
129        self.logits = {}
130        self.arguments = {}
131        self.classVals = []
132        self.cvIndices = None
133        self.contingencies = {}
134        self.clearResults()
135
136        if data is not None and (len(data) == 0 or len(data.domain) == 0):        # if we don't have any examples or attributes then this is not a valid data set
137            data = None
138
139        self.data = discretizeDomain(data, removeUnusedValues)
140        if not self.data:
141            return
142
143        self.attributeNameIndex = dict([(self.data.domain[i].name, i) for i in range(len(self.data.domain))])
144
145        if self.data.domain.classVar:
146            self.classVals = [val for val in self.data.domain.classVar.values]
147            self.aprioriDistribution  = orange.Distribution(self.data.domain.classVar.name, self.data)
148            self.aprioriProbabilities = [nrOfCases / float(max(1, len(self.data))) for nrOfCases in self.aprioriDistribution]
149
150            s = sum(self.aprioriDistribution)
151            for i in range(len(self.aprioriDistribution)):
152                if s == 0:
153                    p = 1
154                elif (self.aprioriDistribution[i]/s) > 0 and (self.aprioriDistribution[i]/s) < 1:
155                    p = (self.aprioriDistribution[i]/s) / (1-(self.aprioriDistribution[i]/s))
156                elif (self.aprioriDistribution[i]/s) == 0:
157                    p = 0.0001
158                elif (self.aprioriDistribution[i]/s) == 1:
159                    p = 99999.99
160                self.logits[self.classVals[i]] = log(p)
161
162
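    # The loop above stores, for each class value, the log odds of its prior
    # probability: logit(p) = log(p / (1 - p)), with the odds clamped to
    # 0.0001 / 99999.99 at the extremes.  The helper below restates that
    # computation for a single probability; it is an illustrative sketch only
    # and is not called anywhere in this module.
    def _priorLogit(self, p):
        if p <= 0:    odds = 0.0001
        elif p >= 1:  odds = 99999.99
        else:         odds = p / (1.0 - p)
        return log(odds)            # e.g. _priorLogit(0.75) == log(3.0), about 1.099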
    # given a data set, return a list of attributes sorted by decreasing importance for class discrimination
164    def getEvaluatedAttributes(self, data):
165        if not data.domain.classVar or data.domain.classVar.varType != orange.VarTypes.Discrete:
166            if self.__class__.__name__ != "orngMosaic":
167                from PyQt4.QtGui import QMessageBox
                QMessageBox.information( None, "Mosaic Dialog", 'In order to find interesting projections, the data set has to have a discrete class.', QMessageBox.Ok + QMessageBox.Default)
169            return []
170        if self.evaluatedAttributes:
171            return self.evaluatedAttributes
172
173        if self.__class__.__name__ != "orngMosaic":
174            from PyQt4.QtGui import qApp, QWidget
175            from PyQt4.QtCore import Qt
176            self.setStatusBarText("Evaluating attributes...")
177            qApp.setOverrideCursor(Qt.WaitCursor)
178
179        try:
180            # evaluate attributes using the selected attribute measure
181            self.evaluatedAttributes = orngVisFuncts.evaluateAttributesDiscClass(data, None, discMeasures[self.attrDisc][1])
182            # remove attributes with only one value
183            for attr in self.evaluatedAttributes[::-1]:
184                if data.domain[attr].varType == orange.VarTypes.Discrete and len(data.domain[attr].values) < 2:
185                    self.evaluatedAttributes.remove(attr)
186        except:
187            type, val, traceback = sys.exc_info()
188            sys.excepthook(type, val, traceback)  # print the exception
189
190        if self.__class__.__name__ != "orngMosaic":
191            self.setStatusBarText("")
192            qApp.restoreOverrideCursor()
193
194        if not self.evaluatedAttributes: return []
195        else:                            return self.evaluatedAttributes
196
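    # Example (a sketch): ranking attributes outside the GUI.  `data` is assumed
    # to be an orange.ExampleTable with a discrete class; attrDisc is an index
    # into the discMeasures list defined at the top of this module:
    #
    #     mosaic = orngMosaic()
    #     mosaic.setData(data)
    #     mosaic.attrDisc = MEAS_GAIN_RATIO
    #     ranked = mosaic.getEvaluatedAttributes(mosaic.data)   # best attribute first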
197
    # create a dictionary that maps each combination of attribute values (and each combination-class pair) to its example count
199    def getConditionalDistributions(self, data, attrs):
200        dict = {}
201        for i in range(1, len(attrs)+1):
202            newFeature, quality = FeatureByCartesianProduct(data, attrs[:i])
203            dist = orange.Distribution(newFeature, data)
204            for val in newFeature.values:
205                dict[val] = dist[val]
206
207        if data.domain.classVar:
208            cont = orange.ContingencyAttrClass(newFeature, data)
209            clsValues = data.domain.classVar.values
210            for val in newFeature.values:
211                for classV in clsValues:
212                    dict[val+"-"+classV] = cont[val][classV]
213        return dict
214
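    # Example (a sketch) of the dictionary returned above, for hypothetical
    # attributes "a" and "b" with values "a1", "b1", ... and class value "c1":
    #
    #     counts = mosaic.getConditionalDistributions(mosaic.data, ["a", "b"])
    #     counts["a1"]           # number of examples with a = a1
    #     counts["a1-b1"]        # number of examples with a = a1 and b = b1
    #     counts["a1-b1-c1"]     # number of those that belong to class c1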
215    def getContingencys(self, attrs):
216        conts = {}
217        for attr in attrs:
218            conts[attr] = self.contingencies.get(attr, orange.ContingencyAttrClass(attr, self.data))
219
220        self.contingencies.update(conts)
221        return conts
222
223
224    def isEvaluationCanceled(self):
225        if self.cancelEvaluation:  return 1
226
227        stop = 0
228        if self.timeLimit > 0: stop = (time.time() - self.startTime) / 60 >= self.timeLimit
229        if self.projectionLimit > 0: stop = stop or self.evaluatedProjectionsCount >= self.projectionLimit
230        return stop
231
232    def isOptimizationCanceled(self):
233        if self.cancelOptimization:  return 1
234
235        stop = 0
236        if self.timeLimit > 0: stop = (time.time() - self.startTime) / 60 >= self.timeLimit
237        if self.projectionLimit > 0: stop = stop or self.evaluatedProjectionsCount >= self.projectionLimit
238        return stop
239
240
241    #
242    # PROJECTIONS EVALUATION
243    #
244    def evaluateProjections(self):
245        self.cancelEvaluation = 0
246        self.evaluatedProjectionsCount = 0
247        if not self.data or len(self.data) == 0 or (not self.classVals and self.qualityMeasure != CHI_SQUARE): return
248        fullData = self.data
249
250        try:
251            if self.percentDataUsed != 100:
252                self.data = fullData.select(orange.MakeRandomIndices2(fullData, 1.0-float(self.percentDataUsed)/100.0))
253
254            self.clearResults()
255
256            if len(self.data) == 0:
257                self.data = fullData
258                self.setStatusBarText("The dataset is empty. Unable to evaluate projections...")
259                self.finishEvaluation(0)
260                return
261
262            if self.__class__.__name__ != "orngMosaic":
263                self.disableControls()
264                self.visualizationWidget.progressBarInit()
265                from PyQt4.QtGui import qApp
266                self.qApp = qApp
267
268            self.startTime = time.time()
269            triedPossibilities = 0
270
271            maxLength = self.attributeCount
272            if self.optimizationType == 0: minLength = self.attributeCount
273            else:                          minLength = 1
274
            # generate CN2 rules and score each set of attributes by the average quality of the rules defined on it
276            if self.qualityMeasure == CN2_RULES:
277                ruleFinder = orange.RuleBeamFinder()
278                ruleFinder.evaluator = orange.RuleEvaluator_Laplace()
279                ruleFinder.ruleStoppingValidator = orange.RuleValidator_LRS(alpha=0.2, min_coverage=0, max_rule_complexity = 4)
280                ruleFinder.validator = orange.RuleValidator_LRS(alpha=0.05, min_coverage=0, max_rule_complexity=4)
281                ruleFinder.ruleFilter = orange.RuleBeamFilter_Width(width=5)
282
283                learner = CN2UnorderedLearner()
284                learner.ruleFinder = ruleFinder
285                learner.coverAndRemove = orange.RuleCovererAndRemover_Default()
286
287                if self.__class__.__name__ != "orngMosaic":
288                    from OWCN2 import CN2ProgressBar
289                    learner.progressCallback = CN2ProgressBar(self.visualizationWidget)
290
291                classifier = learner(self.data)
292
293                self.dictResults = {}
294                for rule in classifier.rules:
295                    conds = rule.filter.conditions
296                    domain = rule.filter.domain
297                    attrs = [domain[c.position].name for c in conds]
298                    if len(attrs) > self.attributeCount or (self.optimizationType == EXACT_NUMBER_OF_ATTRS and len(attrs) != self.attributeCount):
299                        continue
300                    sortedAttrs = copy(attrs); sortedAttrs.sort()
301                    vals = [domain[c.position].values[int(c.values[0])] for c in conds]
302                    self.dictResults[tuple(sortedAttrs)] = self.dictResults.get(tuple(sortedAttrs), []) + [(rule.quality, attrs, vals)]
303
304                for key in self.dictResults.keys():
305                    el = self.dictResults[key]
306                    score = sum([e[0] for e in el]) / float(len(el))
307                    self.insertItem(score, el[0][1], self.findTargetIndex(score), 0, extraInfo = el)
308
309            else:
310                if self.qualityMeasure == CHI_SQUARE:
311                    evaluatedAttrs = [attr.name for attr in self.data.domain.variables]
312                else:
313                    evaluatedAttrs = self.getEvaluatedAttributes(self.data)
314                   
315                if evaluatedAttrs == []:
316                    self.data = fullData
317                    self.finishEvaluation(0)
318                    return
319
320                # total number of possible projections
321                totalPossibilities = 0
322                for i in range(minLength, maxLength+1):
323                    totalPossibilities += orngVisFuncts.combinationsCount(i, len(evaluatedAttrs))
324                totalStr = orngVisFuncts.createStringFromNumber(totalPossibilities)
325
326                self.cvIndices = None
327                for z in range(len(evaluatedAttrs)):
328                    for u in range(minLength-1, maxLength):
329                        combinations = orngVisFuncts.combinations(evaluatedAttrs[:z], u)
330
331                        for attrList in combinations:
332                            triedPossibilities += 1
333
334                            attrs = [evaluatedAttrs[z]] + attrList
335                            diffVals = reduce(operator.mul, [max(1, len(self.data.domain[attr].values)) for attr in attrs])
336                            if diffVals > 200: continue     # we cannot efficiently deal with projections with more than 200 different values
337
338                            val = self._Evaluate(attrs)
339                            ind = self.findTargetIndex(val)
340                            start = ind
341                            if ind > 0 and self.results[ind-1][0] == val:
342                                ind -= 1
343                            while ind > 0 and self.results[ind-1][0] == val and len(attrs) < len(self.results[ind-1][1]):
344                                ind -= 1
345                            while ind < len(self.results) and self.results[ind][0] == val and len(attrs) > len(self.results[ind][1]):
346                                ind += 1
347
348                            self.insertItem(val, attrs, ind, triedPossibilities)
349                            self.evaluatedProjectionsCount += 1
350
351                            if self.__class__.__name__ != "orngMosaic":
352                                self.visualizationWidget.progressBarSet(100.0*triedPossibilities/float(totalPossibilities))
353                                self.setStatusBarText("Evaluated %s/%s visualizations..." % (orngVisFuncts.createStringFromNumber(triedPossibilities), totalStr))
354                            if hasattr(self, "qApp"):
355                                self.qApp.processEvents()        # allow processing of other events
356
357                            if self.isEvaluationCanceled():
358                                self.data = fullData
359                                self.finishEvaluation(triedPossibilities)
360                                return
361        except:
362            type, val, traceback = sys.exc_info()
363            sys.excepthook(type, val, traceback)  # print the exception
364
365        self.data = fullData
366        self.finishEvaluation(triedPossibilities)
367
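    # Example (a sketch): scoring projections without the GUI.  `data` is assumed
    # to be an orange.ExampleTable with a discrete class; each entry of
    # self.results is a (score, attribute list, try index, extra info) tuple --
    # see the SCORE / ATTR_LIST / TRY_INDEX / EXTRA_INFO constants above:
    #
    #     mosaic = orngMosaic()
    #     mosaic.setData(data)
    #     mosaic.qualityMeasure = MDL
    #     mosaic.projectionLimit = 1000        # optional: stop after 1000 projections
    #     mosaic.evaluateProjections()
    #     best = mosaic.results[0]
    #     print best[SCORE], best[ATTR_LIST]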
368
369    def finishEvaluation(self, evaluatedProjections):
370        self.attrLenDict = dict([(i,1) for i in range(self.attributeCount+1)])
371       
372        if self.__class__.__name__ != "orngMosaic":
373            secs = time.time() - self.startTime
374            self.setStatusBarText("Evaluation stopped (evaluated %s projections in %d min, %d sec)" % (orngVisFuncts.createStringFromNumber(evaluatedProjections), secs/60, secs%60))
375            self.visualizationWidget.progressBarFinished()
376            self.enableControls()
377            self.finishedAddingResults()
378
379
380    def _Evaluate(self, attrs):
381        newFeature, quality = FeatureByCartesianProduct(self.data, attrs)
382
383        retVal = -1
384        if self.qualityMeasure in [CHI_SQUARE]:
385            dists = []
386            for attr in attrs:
387                if not self.attributeDistributions.has_key(attr):
388                    self.attributeDistributions[attr] = orange.Distribution(attr, self.data)
389                dists.append(self.attributeDistributions[attr]) 
390                       
391            # create cartesian product of selected attributes and compute contingency
392            dXY = orange.Distribution(newFeature, self.data)   # distribution of the merged attribute
393           
394            # compute chi-square
395            retVal = 0.0
396            domain = self.data.domain
397            for vs in LimitedCounter([len(domain[attr].values) for attr in attrs]):
398                expected = len(self.data) * reduce(lambda x, y: x*y, [dists[i][v]/float(len(self.data)) for i,v in enumerate(vs)])
399                actual = dXY["-".join([domain[attrs[i]].values[v] for i, v in enumerate(vs)])]
400                if expected == 0: continue
401                pearson2 = (actual - expected)*(actual - expected) / expected
402                retVal += pearson2
403               
406        if self.qualityMeasure in [CHI_SQUARE_CLASS, CRAMERS_PHI_CLASS]:
407            aprioriSum = sum(self.aprioriDistribution)
408            retVal = 0.0
409
410            for dist in orange.ContingencyAttrClass(newFeature, self.data):
411                for i in range(len(self.aprioriDistribution)):
412                    expected = sum(dist) * (self.aprioriDistribution[i] / float(aprioriSum))
413                    if self.ignoreTooSmallCells and expected < 5: continue       # statisticians advise ignoring terms that have less than 5 expected examples, since they can significantly disturb chi-square
414                    if not expected: continue                                    # prevent division by zero
415                    retVal += (dist[i] - expected)**2 / expected
416
417            if self.qualityMeasure == CRAMERS_PHI_CLASS:
418                vals = min(len(newFeature.values), len(self.data.domain.classVar.values))-1
419                if vals:
420                    retVal = sqrt(retVal / (len(self.data) * vals))
421
422        elif self.qualityMeasure == DISTANCE_MEASURE:
423            retVal = MeasureAttribute_Distance(newFeature, self.data)
424
425        elif self.qualityMeasure == MDL:
426            retVal = MeasureAttribute_MDL(newFeature, self.data)
427
428        elif self.qualityMeasure == GINI_INDEX:
429            retVal = orange.MeasureAttribute_gini(newFeature, self.data)
430
431        elif self.qualityMeasure == INFORMATION_GAIN:
432            retVal = orange.MeasureAttribute_info(newFeature, self.data)
433            classEntropy = Entropy(numpy.array([val for val in self.aprioriDistribution]))
434            if classEntropy:
435                retVal = retVal * 100.0 / classEntropy
436       
437        elif self.qualityMeasure == GAIN_RATIO:
438            retVal = orange.MeasureAttribute_gainRatio(newFeature, self.data)
439
440        elif self.qualityMeasure == INTERACTION_GAIN:
441            new = orange.MeasureAttribute_info(newFeature, self.data)
442            gains = [orange.MeasureAttribute_info(attr, self.data) for attr in attrs]
443            retVal = new - sum(gains)
444
445            classEntropy = Entropy(numpy.array([val for val in self.aprioriDistribution]))
446            if classEntropy:
447                retVal = retVal * 100.0 / classEntropy
448
449        elif self.qualityMeasure == AVERAGE_PROBABILITY_OF_CORRECT_CLASSIFICATION:
450            d = self.data.select([newFeature, self.data.domain.classVar])     # create a dataset that has only this new feature and class info
451
452            if not self.cvIndices:
453                if self.testingMethod == PROPORTION_TEST:
454                    pick = orange.MakeRandomIndices2(stratified = orange.MakeRandomIndices.StratifiedIfPossible, p0 = 0.7, randomGenerator = 0)
455                    self.cvIndices = [pick(d) for i in range(10)]
456                elif self.testingMethod == CROSSVALIDATION:
457                    ind = orange.MakeRandomIndicesCV(d, 10, randomGenerator = 0, stratified = orange.MakeRandomIndices.StratifiedIfPossible)
458                    self.cvIndices = [[val == i for val in ind] for i in range(10)]
459
460            acc = 0.0; count = 0
461            for ind in self.cvIndices:
462                learnset = d.selectref(ind, 0)
463                testset = d.selectref(ind, 1)
464                learnDist = orange.Distribution(d.domain.classVar, learnset)
465                newFeatureDist = orange.Distribution(newFeature, testset)
466                learnConts = orange.ContingencyAttrClass(newFeature, learnset)
467                testConts  = orange.ContingencyAttrClass(newFeature, testset)
468                for val in testConts.keys():
469                    s = sum(learnConts[val])
470                    if not s: continue
471                    learnClassProb = [v/float(s) for v in learnConts[val]]      # class distribution for each class value (on learning set)
472                    testClassDist = [v for v in testConts[val]]                 # number of examples for each class value (on testing set)
473                    for i in range(len(testClassDist)):
474                        acc   += learnClassProb[i] * testClassDist[i]
475                        count += testClassDist[i]
476            retVal = 100*acc / max(1, float(count))
477
478        del newFeature, quality
479        return retVal
480
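    # The CHI_SQUARE_CLASS / CRAMERS_PHI_CLASS branch above computes the usual
    # Pearson statistic over the (merged attribute x class) contingency table;
    # Cramér's phi then equals sqrt(chi2 / (N * (min(rows, columns) - 1))).
    # A minimal restatement on plain lists -- an illustrative helper only, not
    # used elsewhere in this module:
    def _pearsonChi2Sketch(self, observedRows, classPriors):
        total = float(sum(classPriors))
        chi2 = 0.0
        for row in observedRows:                  # one row per merged-attribute value
            rowSum = sum(row)
            for i in range(len(classPriors)):
                expected = rowSum * classPriors[i] / total
                if expected:
                    chi2 += (row[i] - expected) ** 2 / expected
        return chi2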
481    #
482    # ARGUMENTATION FUNCTIONS
483    #
484    def findArguments(self, example = None):
485        self.arguments = dict([(val, []) for val in self.classVals])
486
487        if not example or len(self.shownResults) == 0: return None, None
488        if not (self.data and self.data.domain.classVar and self.logits and self.classVals): return None, None
489
490        if self.__class__.__name__ != "orngMosaic":
491            from PyQt4.QtGui import qApp
492
493        usedArguments = 0
494        for index in range(len(self.shownResults)):
495            (score, attrList, tryIndex, extraInfo) = self.shownResults[index]
496            args = self.evaluateArgument(example, attrList, score)      # call evaluation of a specific projection
497            if not args: continue
498
499            if self.classificationMethod == MOS_TOPPROJ and usedArguments >= self.clsTopProjCount:
500                break
501
502            for val in self.classVals:
503                pos = self.getArgumentIndex(args[val][0], val)
504                self.arguments[val].insert(pos, (args[val][0], score, attrList, index, args[val][1]))
505                if self.__class__.__name__ != "orngMosaic" and val == str(self.classValueList.currentText()):
506                    self.insertArgument(args[val][0], args[val][1], attrList, pos)
507                    qApp.processEvents()
508            usedArguments += 1
509
510        predictions = []
511        if self.classificationMethod == MOS_TOPPROJ:
512            for val in self.classVals:
513                predictions.append(sum([v[0] for v in self.arguments[val]]))
514
515        elif self.classificationMethod == MOS_SEMINAIVE:
516            self.argumentation_SemiNaive()      # remove combinations of attributes that are not dependent
517            for val in self.classVals:
518                v = self.aprioriDistribution[val]/float(len(self.data))
519                for arg in self.arguments[val]:
520                    v *= arg[0]
521                predictions.append(v)
522
523        elif self.classificationMethod == MOS_COMBINING:
524            self.argumentation_Combining()
525            for val in self.classVals:
526                value = self.logits[val] + sum([v[0] for v in self.arguments[val]])
527                predictions.append(e**value / (1 + e**value))       # use predictions from all arguments to classify an example
528
529        classValue = self.data.domain.classVar[predictions.index(max(predictions))]
530        if sum(predictions) == 0:
531            predictions = [1]*len(predictions)
532        dist = orange.DiscDistribution([val/float(sum(predictions)) for val in predictions])
533        dist.variable = self.data.domain.classVar
534        return classValue, dist
535
536
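    # Example (a sketch): once evaluateProjections() has filled self.shownResults,
    # findArguments() acts as a classifier for a single (discretized) example:
    #
    #     value, distribution = mosaic.findArguments(mosaic.data[0])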
537    # for a given example and a projection evaluate arguments for each class value
538    def evaluateArgument(self, example, attrList, score):
        attrVals = [example[attr] for attr in attrList]
        if "?" in attrVals: return None      # the test example has a missing value at one of the visualized attributes

        #subData = orange.Preprocessor_take(self.data, values = dict([(self.data.domain[attr], example[attr]) for attr in attrList]))
545        subData = self.getDataSubset(attrList, attrVals)
546        if not subData or len(subData) == 0: return None
547
548        lenData = len(self.data)
549        lenSubData = len(subData)
550
551        arguments = {}
552
553        actualProbabilities = self.estimateClassProbabilities(self.data, example, attrList, subData)    # estimate probabilities for the example and given attribute list
554
555        if self.classificationMethod in [MOS_TOPPROJ, MOS_SEMINAIVE]:
556            d = {}
557            eps = 0.0
558            for i in range(len(self.aprioriDistribution)):
559                conts = self.getContingencys(attrList)
560                P_Ci = self.aprioriDistribution[i]/float(lenData)
561                prob = 1.0
562                for attr in attrList:
563                    prob *= self.contingencies[attr][example[attr]][i] / float(max(1,sum(self.contingencies[attr][example[attr]].values())))
564                eps += P_Ci * abs( actualProbabilities[i]/float(lenSubData) - (prob / P_Ci) )
565
566            for i in range(len(self.aprioriDistribution)):
567                arguments[self.classVals[i]] = (actualProbabilities[i], (eps, lenSubData))
568
569        elif self.classificationMethod == MOS_COMBINING:
570            for i in range(len(self.aprioriDistribution)):
571                # compute log odds of P(c|ai)/P(c)
572                val = 0
573                if (1-actualProbabilities[i]) * self.aprioriProbabilities[i] > 0.0:
574                    val = (actualProbabilities[i] * (1-self.aprioriProbabilities[i])) / ((1-actualProbabilities[i]) * self.aprioriProbabilities[i])
575                    if val > 0: val = log(val)
576                    else: val = -999.99
577                else: val = 999.99
578
579                # compute confidence interval
580                approxZero = 0.0001
581                aprioriPc0 = max(approxZero, self.aprioriProbabilities[i]);
582                aprioriPc1 = max(approxZero, 1-self.aprioriProbabilities[i])
583                actualPc0  = max(approxZero, actualProbabilities[i]);
584                actualPc1  = max(approxZero, 1-actualProbabilities[i])
585                part1 = 1 / (len(self.data) * aprioriPc0 * aprioriPc1)
586                part2 = 1 / max(approxZero, lenSubData  * actualPc0  * actualPc1)
587                #error = sqrt(max(0, part2 - part1)) * 1.64
588                if self.classConfidence == 0: error = 0.
589                else:                         error = sqrt(max(0, part2 - part1)) * norm_factor(1-((1-float(self.classConfidence)/100.)/2.))
590
591                arguments[self.classVals[i]] = (val, error)
592
593        return arguments
594
595
596    # probability estimation function
597    def estimateClassProbabilities(self, data, example, attrList, subData = None, subDataDistribution = None, aprioriDistribution = None, probabilityEstimation = -1, mValue = -1):
598        if probabilityEstimation == -1: probabilityEstimation = self.probabilityEstimation
599        if aprioriDistribution == None: aprioriDistribution = self.aprioriDistribution
600        if mValue == -1:                mValue = self.mValue
601
602        if not subData:
603            attrVals = [example[attr] for attr in attrList]
604            if "?" in attrVals: return None      # the testExample has a missing value at one of the visualized attributes
605            subData = self.getDataSubset(attrList, attrVals)
606            #subData = orange.Preprocessor_take(data, values = dict([(data.domain[attr], example[attr]) for attr in attrList]))
607        if not subDataDistribution:
608            subDataDistribution = orange.Distribution(data.domain.classVar.name, subData)
609
610        lenData = len(data)
611        lenSubData = len(subData)
612
613        # estimate probabilities
614        if probabilityEstimation == RELATIVE:
615            if not len(subData):
616                self.printVerbose("OWMosaicOptimization: Empty data subset. Unable to compute relative frequency.")
617                return [0.0 for i in range(len(aprioriDistribution))]      # prevent division by zero
618            actualProbabilities = [ subDataDistribution[index] / float(lenSubData) for index in range(len(aprioriDistribution))]      # P(c_i | a_k) / P(c_i)
619
620        elif probabilityEstimation == LAPLACE:
621            actualProbabilities = []
622            for index in range(len(aprioriDistribution)):
623                actualProbabilities.append((subDataDistribution[index]+1) / float(lenSubData+len(subDataDistribution)))      # (r+1 / n+c) / P(c_i)
624
625        elif probabilityEstimation == M_ESTIMATE:
626            actualProbabilities = []
627            for index in range(len(aprioriDistribution)):
628                n = subDataDistribution[index]
629                pa = aprioriDistribution[index]/float(sum(aprioriDistribution))
630                actualProbabilities.append((pa * mValue + n) / float(lenSubData + mValue))       # p = (pa*m+n)/(N+m)
631
        # sanity check: the estimated class probabilities should sum to 1
        s = sum(actualProbabilities)
        if "%.3f" % s != "1.000":
            print "Warning: class probabilities do not sum to 1 but to", s, actualProbabilities
636
637        return actualProbabilities
638
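    # The three estimators above are relative frequency, Laplace, and the
    # m-estimate p = (pa*m + n) / (N + m), where pa is the prior probability,
    # n the class count in the subset and N the subset size.  A one-line
    # restatement as an illustrative helper (not called anywhere in this module):
    def _mEstimateSketch(self, n, N, pa, m=2.0):
        return (pa * m + n) / float(N + m)    # e.g. n=3, N=10, pa=0.5, m=2  ->  4/12 = 0.333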
639
640    def argumentation_Combining(self):
641        if not self.arguments or not self.arguments.values(): return
642
643        for classValue in self.arguments.keys():
644            # create a dict for arguments of different lengths. each dict has arguments of this length and corresponding scores
645            argList = {1: {}, 2: {}, 3:{}, 4:{}}
646            arguments = self.arguments[classValue]
647            for i in range(len(arguments)):
648                attrs = arguments[i][2]
649                attrs.sort()
650                argList[len(attrs)][tuple(attrs)] = (arguments[i][0], arguments[i][4], i)       # save arg val, error and arg index
651
652            if len(argList[1]) == 0: return     # in case that we only evaluated projections with EXACTLY X attributes we cannot remove weak arguments
653
654            for count in [4,3,2]:
655                args = argList[count]
656
657                candidates = []     # candidate projections for deleting
658                for key in args.keys():
659                    splits = orngVisFuncts.getPossibleSplits(list(key))
660                    for split in splits:
661                        vals = [argList[len(v)].get(tuple(v), [None, None, None])[0] for v in split]
662                        errors = [argList[len(v)].get(tuple(v), [None, None, None])[1] for v in split]
663                        # if None in vals or abs(sum(vals)) >= abs(args[key][0]):       # this condition is not enough, because some elements in vals have + and some - signs which makes a combination better than individual attributes
664                        if None in vals:        # some attrs have already been used
665                            args.pop(key)
666                            break
667                        sameSign = (sum([abs(val) == val for val in vals]) in [0, len(vals)])   # do all values in vals have the same sign
668                        partialVals = sum([abs(val) for val in vals] + errors)        # vals may have all - or all + values. errors are all +. to sum, we must make all vals +.
669                        complexVal = abs(args[key][0]) - args[key][1]       # value and error of the combination of attributes
670                        if not sameSign or partialVals >= complexVal:
671                            args.pop(key)   # remove the combination of attributes because a split exists, that produces a more important argument
672                            break
673                    if args.has_key(key):
674                        candidates.append((args[key][0], key, splits))
675
676                candidates.sort()
677                candidates.reverse()
678                for val, attrs, splits in candidates:
679                    vals = []
680                    for split in splits:
681                        vals += [argList[len(v)].get(tuple(v)) for v in split]
682                    if None in vals:
683                        args.pop(attrs)
684                        continue       # we have already used some of the attributes in split for a better projection
685
686                    # obviously we have found a very good projection of attributes and we still have all the attributes needed
687                    # we now have to remove other projections that use these attributes
688                    for split in splits:
689                        for part in split:
690                            if argList[len(part)].has_key(tuple(part)):
691                                argList[len(part)].pop(tuple(part))
692
693            #print [len(argList[1]), len(argList[2]), len(argList[3])]
694            #assert False not in [a[a.keys()[i]] == a.values()[i] for i in range(len(a.keys()))]
695            indicesToKeep = [val[2] for val in argList[1].values() + argList[2].values() + argList[3].values() + argList[4].values()]
696
697            # we remove all arguments that are not in indicesToKeep
            for i in range(len(arguments))[::-1]:
699                if i not in indicesToKeep:
700                    arguments.pop(i)
701
702
703    # compute probability that the combination of attribute values in attrValues is significantly different from being independent
704    # see Kononenko: Semi-naive Bayes
705    def argumentation_SemiNaive(self):
706        if not self.arguments or not self.arguments.values(): return
707
708        for classValue in self.arguments.keys():
709            # create a dict for arguments of different lengths. each dict has arguments of this length and corresponding scores
710            argList = {1: {}, 2: {}, 3:{}, 4:{}}
711            arguments = self.arguments[classValue]
712            for i in range(len(arguments)):
713                attrs = arguments[i][2]
714                argList[len(attrs)][tuple(attrs)] = (arguments[i][4], i)
715
716            if len(argList[1]) == 0: return     # in case that we only evaluated projections with EXACTLY X attributes we cannot remove weak arguments
717
718            for count in [4,3,2]:
719                for key in argList[count].keys():
720                    eps, Nab = argList[count][key][0]
721                    v = float(4*eps*eps*Nab)
722                    if v == 0 or 1/v > 1 - self.clsTau:
723                        argList[count].pop(key)
724
725            indicesToKeep = [val[1] for val in argList[1].values() + argList[2].values() + argList[3].values() + argList[4].values()]
726
727            for i in range(len(arguments))[::-1]:       # we remove all arguments that are not in indicesToKeep
728                if i not in indicesToKeep:
729                    arguments.pop(i)
730
731
732
733    def getInteractionImportanceProbability(self, attrList, attrValues, subData = None, contingencies = None, aprioriDistribution = None, subDataDistribution = None):
734        assert len(attrList) == len(attrValues)
735
736        if not subData:
737            #subData = orange.Preprocessor_take(self.data, values = dict([(self.data.domain[attrList[i]], attrValues[i]) for i in range(len(attrList))]))
738            subData = self.getDataSubset(attrList, attrValues)
739        if not aprioriDistribution: aprioriDistribution = self.aprioriDistribution
740        if not subDataDistribution:
741            subDataDistribution = orange.Distribution(self.data.domain.classVar.name, subData)
742        """
743        if not contingencies:
744            for i in range(len(attrList)):
745                temp = self.data.filter({attrList[i]: attrValues[i]})
746                contingencies
747            contingencies = [orange.ContingencyAttrClass(attr, self.data) for attr in attrList]
748        """
749
750        lenSubData = len(subData)
751        lenData = len(self.data)
752        eps = 0.0
753        for i in range(len(aprioriDistribution)):
754            product = 1.0
755            for j in range(len(attrList)):
756                N_ci_aj = self.classDistributionsForExample[attrList[j]][i]
757                N_aj = sum(self.classDistributionsForExample[attrList[j]])
758                product *= N_ci_aj / N_aj       # P(ci|aj)
759
760            eps += (aprioriDistribution[i]/lenData) * abs((subDataDistribution[i]/lenSubData) - (product*lenData)/aprioriDistribution[i])
761            #eps += abs((subDataDistribution[i]/lenSubData) - (product*lenData)/aprioriDistribution[i])
762
763        if eps*lenSubData == 0: return 9999999
764        ret = 1/(4*eps*eps*lenSubData)
765        #if ret > 1 or ret < 0.0:
766        #    print "invalid probability", eps, ret
767        #assert ret >= 0.0
768        #assert ret <= 1.0
769        return ret
770
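    # Following Kononenko's semi-naive Bayes, the value returned above,
    # 1 / (4 * eps^2 * N), bounds the probability that the observed deviation
    # eps is due to chance; argumentation_SemiNaive() keeps a combination of
    # attribute values only if this bound does not exceed 1 - clsTau.  An
    # illustrative helper restating that test (not used elsewhere in this module):
    def _isCombinationReliableSketch(self, eps, nExamples, tau=0.8):
        v = 4.0 * eps * eps * nExamples
        return v > 0 and 1.0 / v <= 1.0 - tau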
771    def getArgumentIndex(self, value, classValue):
772        if len(self.arguments[classValue]) == 0: return 0
773
774        top = 0; bottom = len(self.arguments[classValue])
775        while (bottom-top) > 1:
776            mid  = (bottom + top)/2
777            if max(value, self.arguments[classValue][mid][0]) == value: bottom = mid
778            else: top = mid
779
780        if max(value, self.arguments[classValue][top][0]) == value:  return top
781        else:                                                        return bottom
782
783
784    #######
785    # code for evaluating different placements of a set of attributes by the separation of different classes in the graph
786    #######
787    def evaluateAttributeOrder(self, attrs, valueOrder, conditions, revert, domain = None):
788        if not domain:
789            domain = orange.Domain([orange.FloatVariable("xVar"), orange.FloatVariable("yVar"), self.data.domain.classVar])
790            self.weightID = orange.newmetaid()
791            domain.addmeta(self.weightID, orange.FloatVariable("ExampleWeight"))
792        projData = orange.ExampleTable(domain)
793        projData.domain.classVar.distributed = True
794
795        triedIndices = [0]*(len(attrs))
796        maxVals = [len(val) for val in valueOrder]
797        xpos = 0; ypos = 0
798        while triedIndices[0] < maxVals[0]:
799            vals = [valueOrder[i][triedIndices[i]] for i in range(len(attrs))]
800            combVal = reduce(operator.concat, map(operator.concat, [vals[i] for i in revert], ["-"]*len(vals)))[:-1]
801            if conditions[combVal][0] > 0:
802                #projData.append([xpos, ypos, conditions[combVal][1]])
803                projData.append([xpos, ypos, projData.domain.classVar.values[0]])
804                val = orange.Value(projData.domain.classVar, conditions[combVal][1])
805                val.svalue = conditions[combVal][2]
806                projData[-1][-1] = val
807                projData[-1].setmeta("ExampleWeight", conditions[combVal][0]/max(1,len(self.data))) # set weight of the rectangle
808
809            triedIndices[-1] += 1
810            for i in range(1, len(attrs))[::-1]:
811                if triedIndices[i] >= maxVals[i]:
812                    triedIndices[i-1] += 1
813                    triedIndices[i] = 0
814
815            xpos = triedIndices[-1]
816            ypos = len(attrs) > 1 and 2 * triedIndices[-2]
817            if len(attrs) > 2: xpos += (2 + maxVals[-1]) * triedIndices[-3] # add the space of 3 between each different value of third attribute
818            if len(attrs) > 3: ypos += (4 + maxVals[-2]) * triedIndices[-4] # add the space of 4 between each different value of fourth attribute
819
820        distance = orange.ExamplesDistanceConstructor_Manhattan()
821        distance.normalize = 0
822        learner = orange.kNNLearner(rankWeight = 0, k = len(projData)/2, distanceConstructor = distance)
823        if self.attributeOrderTestingMethod == AO_CROSSVALIDATION:
824            results = orngTest.leaveOneOut([learner], (projData, self.weightID))
825        else:
826            results = orngTest.learnAndTestOnLearnData([learner], (projData, self.weightID))
827        return orngStat.AP(results)[0]
828
    # for a given subset of attributes (at most 4) find the permutation that is most visually friendly; if optimizeValueOrder is set, the order of attribute values is optimized as well
830    def findOptimalAttributeOrder(self, attrs, optimizeValueOrder = 0):
831        if not self.data or not self.data.domain.classVar or self.data.domain.classVar.varType != orange.VarTypes.Discrete:
832            return None
833        apriori = [max(1, self.aprioriDistribution[val]) for val in self.data.domain.classVar.values]
834        conditions = {}
835        newFeature, quality = FeatureByCartesianProduct(self.data, attrs)
836        dist = orange.Distribution(newFeature, self.data)
837        cont = orange.ContingencyAttrClass(newFeature, self.data)
838        for key in cont.keys():
839            if dist[key] == 0: conditions[key] = (0, 0)
840            else:
841                quotients = map(operator.div, cont[key], apriori)
842                conditions[key] = (dist[key], self.data.domain.classVar.values[quotients.index(max(quotients))], cont[key])
843
844        domain = orange.Domain([orange.FloatVariable("xVar"), orange.FloatVariable("yVar"), self.data.domain.classVar])
845        self.weightID = orange.newmetaid()
846        domain.addmeta(self.weightID, orange.FloatVariable("ExampleWeight"))
847
848        # create permutations of attributes and attribute values
849        attrPerms = orngVisFuncts.permutations(range(len(attrs)))
850        valuePerms = {}     # for each attribute we generate permutations of its values (if optimizeValueOrder = 1)
851        for attr in attrs:
852            if optimizeValueOrder:  valuePerms[attr] = orngVisFuncts.permutations(getVariableValuesSorted(self.data.domain[attr]))
853            else:                   valuePerms[attr] = [getVariableValuesSorted(self.data.domain[attr])]
854
855        if self.__class__.__name__ != "orngMosaic":
856            self.setStatusBarText("Generating possible attribute orders...")
857            self.visualizationWidget.progressBarInit()
858
859        possibleOrders = []
860        triedIndices = [0]*(len(attrs))                 # list of indices that point to the next permutation of values that will be tried
861        maxVals = [len(valuePerms[attr]) for attr in attrs]
862        while triedIndices[-1] < maxVals[-1]:           # we have no more possible placements if we have an overflow
863            valueOrder = [valuePerms[attrs[i]][triedIndices[i]] for i in range(len(attrs))]
864            possibleOrders.append(valueOrder)           # list of orders that we will evaluate
865            triedIndices[0] += 1
866            if self.cancelOptimization: break
867            for i in range(len(attrs)-1):
868                if triedIndices[i] >= maxVals[i]:
869                    triedIndices[i+1] += 1
870                    triedIndices[i] = 0
871
872        bestPlacements = []
873        total = len(attrPerms) * len(possibleOrders)
874        strCount = orngVisFuncts.createStringFromNumber(total)
875        self.evaluatedProjectionsCount = 0
876        for attrPerm in attrPerms:                      # for all attribute permutations
877            currAttrs = [attrs[i] for i in attrPerm]
878            if self.cancelOptimization: break
879            tempPerms = []
880            for order in possibleOrders:                # for all permutations of attribute values
881                currValueOrder = [order[i] for i in attrPerm]
882                val = self.evaluateAttributeOrder(currAttrs, currValueOrder, conditions, map(attrPerm.index, range(len(attrPerm))), domain)
883                tempPerms.append((val*100, currAttrs, currValueOrder))
884                self.evaluatedProjectionsCount += 1
885                if self.evaluatedProjectionsCount % 10 == 0 and self.__class__.__name__ != "orngMosaic":
886                    self.setStatusBarText("Evaluated %s/%s attribute orders..." % (orngVisFuncts.createStringFromNumber(self.evaluatedProjectionsCount), strCount))
887                    self.visualizationWidget.progressBarSet(100*self.evaluatedProjectionsCount/float(total))
888                    if self.isOptimizationCanceled(): break
889            bestPlacements.append(max(tempPerms))
890
891        if self.__class__.__name__ != "orngMosaic":
892            self.setStatusBarText("")
893            self.visualizationWidget.progressBarFinished()
894
895        bestPlacements.sort()
896        bestPlacements.reverse()
897        return bestPlacements
898
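    # Example (a sketch): asking for the best layout of a set of attributes.
    # Each returned entry is (accuracy * 100, attribute order, value order),
    # best placement first; the attribute names below are hypothetical:
    #
    #     placements = mosaic.findOptimalAttributeOrder(["a", "b"], optimizeValueOrder = 1)
    #     score, attrOrder, valueOrder = placements[0]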
899    #
900    # MISC FUNCTIONS
901    #
902
903    # use bisection to find correct index
904    def findTargetIndex(self, score):
905        top = 0; bottom = len(self.results)
906
907        while (bottom-top) > 1:
908            mid  = (bottom + top)/2
909            if max(score, self.results[mid][SCORE]) == score: bottom = mid
910            else: top = mid
911
912        if len(self.results) == 0: return 0
913        if max(score, self.results[top][SCORE]) == score:
914            return top
915        else:
916            return bottom
917
    # insert a new result given the projection score, the list of attributes, the target index, the try index and optional extra info
919    def insertItem(self, score, attrList, index, tryIndex, extraInfo = []):
920        self.results.insert(index, (score, attrList, tryIndex, extraInfo))
921        self.shownResults.insert(index, (score, attrList, tryIndex, extraInfo))
922
923   
924    # from a list of attributes build a nice string with attribute names
925    def buildAttrString(self, attrList):
926        if len(attrList) == 0: return ""
927
928        strList = attrList[0]
929        for attr in attrList[1:]:
930            strList += ", " + attr
931        return strList
932
933
934    # select only those examples that have the specified attribute values
935    # also remove examples that have missing values at those attributes
936    def getDataSubset(self, attrList, attrValues):
937        temp = self.data.selectref(dict([(attrList[i], attrValues[i]) for i in range(len(attrList))]))
938        filter = orange.Filter_isDefined(domain=self.data.domain)
939        for v in self.data.domain.variables:
940            filter.check[v] = v.name in attrList
941        return filter(temp)
942
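    # Example (a sketch): selecting the examples that fall into one cell of the
    # mosaic, i.e. one value for each attribute (names below are hypothetical):
    #
    #     cell = mosaic.getDataSubset(["a", "b"], ["a1", "b2"])
    #     len(cell)    # examples with a = a1 and b = b2 and neither value missing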
943
944    #
945    # LOADING, SAVING, .....
946    #
947    # save evaluated projections into a file
948    def save(self, filename):
949        dict = {}
950        for attr in self.saveSettingsList:
951            dict[attr] = self.__dict__[attr]
952        dict["dataCheckSum"] = self.data.checksum()
953
954        file = open(filename, "wt")
955        file.write("%s\n" % (str(dict)))
956        for (score, attrList, tryIndex, extraInfo) in self.shownResults:
957            file.write("(%.4f, %s, %d)\n" % (score, attrList, tryIndex))
958        file.flush()
959        file.close()
960
961    def load(self, filename, ignoreCheckSum = 0):
962        self.clearResults()
963        file = open(filename, "rt")
964        settings = eval(file.readline()[:-1])
965
966        if not ignoreCheckSum and settings.has_key("dataCheckSum") and settings["dataCheckSum"] != self.data.checksum():
967            if self.__class__.__name__ != "orngMosaic":
968                from PyQt4.QtGui import QMessageBox
969                if QMessageBox.information(self, 'VizRank', 'The current data set has a different checksum than the data set that was used to evaluate visualizations in this file.\nDo you want to continue loading anyway, or cancel?','Continue','Cancel', '', 0,1):
970                    file.close()
971                    return
972            else:
973                print "dataset checksum does not agree with the checksum in the projections file. loading anyway"
974
975        #self.setSettings(settings)
976        for key in settings.keys():
977            setattr(self, key, settings[key])
978
979        ind = 0
980        for line in file.xreadlines():
981            (score, attrList, tryIndex) = eval(line)
982            self.insertItem(score, attrList, ind, tryIndex)
983            ind+=1
984        file.close()
985
986        return ind
987
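    # Example (a sketch): persisting and restoring an evaluated list of
    # projections.  The file name is arbitrary; load() returns the number of
    # projections read:
    #
    #     mosaic.save("zoo.mproj")
    #     ...
    #     count = mosaic.load("zoo.mproj", ignoreCheckSum = 1)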
988
989# #############################################################################
990# definition of tree of mosaics
991class MosaicTreeNode:
992    def __init__(self, parent, attrs):
993        self.children = {}
994        self.parent = parent
995        self.attrs = attrs
996        self.branches = {}        # links to other MosaicTreeNode instances that represent branches. Keys are attribute values, e.g. ([0,1,3], [1,2,2])
997        self.branchSelector = None
998        self.selectionIndices = None
999
1000
1001# #############################################################################
# learner that builds a MosaicTreeClassifier
1003class MosaicTreeLearner(orange.Learner):
1004    def __init__(self, mosaic = None, statusFunct = None):
1005        if not mosaic:
1006            mosaic = orngMosaic()
1007        self.mosaic = mosaic
1008        self.statusFunct = statusFunct
1009        #self.mosaic.qualityMeasure = MDL        # always use MDL for estimating quality of projections - this also stops building tree if no combination of attributes produces an improvement
1010        self.name = self.mosaic.learnerName
1011
1012    def __call__(self, examples, weightID = 0):
1013        return MosaicTreeClassifier(self.mosaic, examples, self.statusFunct)
1014
1015
1016# #############################################################################
1017# class that builds a tree of mosaics that can be used as a classifier
1018class MosaicTreeClassifier(orange.Classifier):
1019    def __init__(self, mosaic, data, statusFunct = None):
1020        self.mosaic = mosaic
1021
1022        # discretize domain if necessary
1023        mosaic.setData(data)
1024        data = mosaic.data
1025
1026        stop = orange.TreeStopCriteria_common()
1027        stop.minExamples = 5
1028        self.mosaicTree = None
1029
1030        treeLearner = orange.TreeLearner()
1031        treeLearner.split = SplitConstructor_MosaicMeasure(mosaic, statusFunct)
1032        treeLearner.stop = stop
1033        tree = treeLearner(data)
1034        if tree.tree and tree.tree.branchSelector:
1035            self.mosaicTree = self.createTreeNodes(tree.tree, data, None, [1]*len(data))
1036        if statusFunct:
1037            if self.mosaicTree:
1038                statusFunct("Mosaic tree was built successfully.")
1039            else:
1040                statusFunct("No tree was generated.")
1041
1042    def createTreeNodes(self, node, data, parentTreeNode, selectionIndices):
1043        treeNode = MosaicTreeNode(parentTreeNode, node.branchSelector.attrList)
1044        treeNode.branchSelector = node.branchSelector
1045        treeNode.selectionIndices = selectionIndices
1046
1047        if node.branches:    # if internal node
1048            for i in range(len(node.branches)):
1049                if not node.branches[i] or not node.branches[i].branchSelector:
1050                    continue        # if we are at a leaf
1051
1052                selectedAttrValues = node.branchDescriptions[i]
1053                pp = orange.Preprocessor_take()
1054                pp.values[node.branchSelector.classVar] = selectedAttrValues
1055                selectedIndices = list(pp.selectionVector(data.select([node.branchSelector.classVar])))
1056                selectedData = data.selectref(selectedIndices)
1057
1058                treeNode.branches[selectedAttrValues] = self.createTreeNodes(node.branches[i], selectedData, treeNode, selectedIndices)
1059
1060        return treeNode
1061
1062
    # classify an example by walking down the mosaic tree; in a leaf, classify it to the prevailing class of that node
1064    def __call__(self, example, returnType = orange.GetBoth):
1065        currNode = self.mosaicTree
1066        while currNode:
1067            val = currNode.branchSelector.classVar.getValueFrom(example).value
1068            if currNode.branches.has_key(val):
1069                currNode = currNode.branches[val]
1070            else:
1071                return currNode.branchSelector.classifyExample(example, returnType)        # we are in the leaf of the mosaic tree. classify to the prevailing class
1072
1073
# a split constructor that evaluates different projections and proposes the best projection as the best "attribute" to split on
1075class SplitConstructor_MosaicMeasure(orange.TreeSplitConstructor):
1076    def __init__(self, mosaic, statusFunct = None):
1077        self.mosaic = mosaic
1078        self.statusFunct = statusFunct
1079        self.nodeCount = 0
1080        self.measure = MeasureAttribute_MDL()
1081
1082    def updateStatus(self, evaluatingProjections):
1083        if self.statusFunct:
1084            s = "%sCurrent tree has %d nodes" % (evaluatingProjections and "Please wait, evaluating projections. " or "", self.nodeCount)
1085            self.statusFunct(s)
1086
1087    def __call__(self, gen, weightID, contingencies, apriori, candidates, nodeClassifier):
1088        self.mosaic.setData(gen)
1089        self.updateStatus(1)
1090        if self.mosaic.cancelTreeBuilding:
1091            return None
1092        self.mosaic.evaluateProjections()
1093        if self.mosaic.cancelTreeBuilding or len(self.mosaic.results) == 0:       # or self.mosaic.results[0][0] <= 0:     # if no results or score <=0 then stop building
1094            self.updateStatus(0)
1095            return None
1096
1097        score, attrList, tryIndex, extraInfo = self.mosaic.results[0]
1098
1099        newFeature = mergeAttrValues(gen, attrList, self.measure, removeUnusedValues = 0)
1100        dist = orange.Distribution(newFeature, gen).values()
1101        if max(dist) == sum(dist):    # if all examples belong to one attribute value then this is obviously a useless attribute and we should stop building
1102            self.updateStatus(0)
1103            return None
1104
1105        self.nodeCount += 1
1106        self.updateStatus(0)
1107        return (CartesianClassifier(newFeature, attrList, gen), newFeature.values, None, score)
1108
1109
1110class CartesianClassifier(orange.Classifier):
1111    def __init__(self, var, attrList, data):
1112        self.classVar = var
1113        self.attrList = attrList
1114        self.data = data
1115        self.valueMapping = dict(self.createValueDict(attrList, []))
1116
1117        self.values = {}    # dict of "3-1+8-9+2-2" -> [(3,1), (8,9), (2,2)]
1118        #for classVal in self.classVar.values:       # we cannot use this because when combining discretized attributes, classVals are c1, c2, ... and not 3-1+...
1119        #    self.values[classVal] = filter(None, [self.valueMapping.get(val, None) for val in classVal.split("+")])
1120        for val in self.classVar.values:
1121            self.values[val] = []
1122        for ind, combination in enumerate(LimitedCounter([len(data.domain[attr].values) for attr in self.attrList])):
1123            self.values[self.classVar.getValueFrom.lookupTable[ind].value].append(tuple([data.domain[self.attrList[attrInd]].values[attrValInd] for attrInd, attrValInd in enumerate(combination)]))
1124
1125
1126    # create a mapping from "0-1-1-4" back to [0, 1, 1, 4]
1127    def createValueDict(self, attrList, valueList):
1128        if attrList == []: return valueList
1129
1130        attrValues = self.data.domain[attrList[0]].values
1131        if valueList == []:
1132            return self.createValueDict(attrList[1:], [(val, (val,)) for val in attrValues])
1133        else:
1134            newValueList = []
1135            for val in attrValues:
1136                newValueList += [(pre+"-"+val, vals+(val,)) for (pre, vals) in valueList]
1137            return self.createValueDict(attrList[1:], newValueList)
1138
1139
    # determine to which class the example would be classified based on this combination of attributes
1141    # i.e. get the majority class for this cartesian product of attributes
1142    def classifyExample(self, ex, what = orange.Classifier.GetValue):
1143        val = self.classVar.getValueFrom(ex).value
1144        classDist = orange.ContingencyAttrClass(self.classVar, self.data)[val]
1145        classValue = classDist.keys()[classDist.values().index(max(classDist.values()))]
1146        if what == orange.Classifier.GetValue:
1147            return orange.Value(self.data.domain.classVar, classValue)
1148        elif what == orange.Classifier.GetProbabilities:
1149            return classDist
1150        else:
1151            return (orange.Value(self.data.domain.classVar, classValue), classDist)
1152
1153
    # classify the example ex based on self.data
1155    def __call__(self, ex, what = orange.Classifier.GetValue):
1156        if 1 in [ex[attr].isSpecial() for attr in self.attrList]:
1157            return orange.Value("?")
1158        val = self.classVar.getValueFrom(ex)
1159        if what == orange.Classifier.GetValue:
1160            return val
1161        probs = orange.DiscDistribution(self.classVar)
1162        probs[val] = 1.0
1163        if what == orange.Classifier.GetProbabilities:
1164            return probs
1165        else:
1166            return (val, probs)
1167
1168
# simple module self-test
1170if __name__=="__main__":
1171    data = orange.ExampleTable(r"E:\Development\Orange Datasets\UCI\zoo.tab")
1172    a = mergeAttrValues(data, ["milk", "legs"], MeasureAttribute_MDL())
1173
1174    mosaic = orngMosaic()
1175    mosaic.setData(data)
1176    mosaic.qualityMeasure = DISTANCE_MEASURE
1177    mosaic.evaluateProjections()
1178
1179    learner = MosaicTreeLearner(mosaic)
1180    classifier = learner(data)
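    # A few extra lines (a sketch) inspecting what was built above; they assume
    # the evaluation found at least one projection:
    if mosaic.results:
        print "best projection:", mosaic.results[0][SCORE], mosaic.results[0][ATTR_LIST]
    print "mosaic tree built:", classifier.mosaicTree is not None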