source: orange/Orange/misc/visfuncts.py @ 9725:6c16952df555

Revision 9725:6c16952df555, 38.9 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

Updated imports of c extensions.

Line 
1import orange, random
2from Orange.feature import scoring
3from Orange.misc import progress_bar_milestones
4from Orange import statc
5import copy
6from math import ceil
7##import numpy
8##from LinearAlgebra import *
9
10_differentClassPermutationsDict = {}
11_projectionListDict = {}
12
13
14# ##########################################################################################
15# MOSAIC OPTIMIZATION FUNCTION
16# ##########################################################################################
17
18_possibleSplits = {}
19_possibleSplits[1] = [[[0]]]
20_possibleSplits[2] = [[[0], [1]]]
21_possibleSplits[3] = [[[0], [1,2]], [[0,1],[2]], [[0,2], [1]], [[0], [1], [2]]]
22_possibleSplits[4] = [[[0], [1,2,3]], [[1], [0,2,3]], [[2], [0,1,3]], [[3], [0,1,2]],
23    [[0,1], [2,3]], [[0,2], [1,3]], [[0,3], [1,2]], [[0,1,2], [3]], [[0,1,3], [2]], [[0,2,3], [1]], [[1,2,3], [0]],
24    [[0], [1], [2,3]], [[0], [2], [1,3]], [[0], [3], [1,2]], [[1], [2], [0,3]], [[1], [3], [0,2]], [[2], [3], [0,1]], [[0], [1], [2], [3]]]
25
26# get possible splits of attributes in attrs into subgroups , e.g.: ["a","b","c"] -> [[['a'], ['b', 'c']], [['a', 'b'], ['c']], [['a', 'c'], ['b']], [['a'], ['b'], ['c']]]
27def getPossibleSplits(attrs):
28    if len(attrs) > 4: return []
29    retSplit = []
30    for split in _possibleSplits[len(attrs)]:
31        parts = []
32        for arr in split:
33            a = [attrs[i] for i in arr]
34            parts.append(a)
35        retSplit.append(parts)
36    return retSplit
37
38# ##########################################################################################
39# MISC FUNCTIONS
40# ##########################################################################################
41
42# take a number and return a formated string, eg: 2341232 -> "2,341,232"
43def createStringFromNumber(num):
44    s = str(num)
45    arr = range(len(s)-2)[:0:-3]
46    for i in arr:
47        s = s[:i] + "," + s[i:]
48    return s
49
50
51# factoriela
52def fact(i):
53    ret = 1
54    for j in range(2, i+1): ret*= j
55    return ret
56
57# get a sublist of permutations of list. Return only those permutations that cannot be attained by rotating one other permutation.
58# used in radviz where we have to try different placements of attributes
59def generateDifferentPermutations(list):
60    if len(list) == 0: return []
61    temp = permutations(list[:-1])
62    newPerms = {}
63    for p in temp:
64        if not newPerms.has_key(tuple(p[::-1])):
65            newPerms[tuple(p)] = p
66    lastVal = list[-1]
67    return [p + [lastVal] for p in newPerms.values()]
68
69
70# compute permutations of elements in list
71def permutations(list):
72    if list==[]:
73        return [[]]
74    else:
75        return [ [list[i]] + p for i in range(len(list)) for p in permutations(list[:i] + list[i+1:])]
76
77# return number of combinations where we select "select" from "total"
78def combinationsCount(select, total):
79    return fact(total)/ (fact(total-select)*fact(select))
80
81# get the actual combinations of items
82def combinations(items, count):
83    if count > len(items): return []
84    answer = []
85    indices = range(count)
86    if indices == []: return [[]]   # if count = 0 return empty list
87
88    indices[-1] = indices[-1] - 1
89    while 1:
90        limit = len(items) - 1
91        i = count - 1
92        while i >= 0 and indices[i] == limit:
93            i = i - 1
94            limit = limit - 1
95        if i < 0: break
96
97        val = indices[i]
98        for i in xrange( i, count ):
99            val = val + 1
100            indices[i] = val
101        temp = []
102        for i in indices:
103            temp.append( items[i] )
104        answer.append( temp )
105    return answer
106
107# ##########################################################################################
108# VIZRANK FUNCTIONS
109# ##########################################################################################
110
111# randomly switch two elements in a list. repeat nrOfTimes.
112def switchTwoElements(list, nrOfTimes = 1):
113    if len(list) < 2: return list
114    for i in range(nrOfTimes):
115        index1 = random.randint(0, len(list)-1)
116        index2 = random.randint(0, len(list)-1)
117        if index1 == index2:
118            index2 = (index1 + 1) % len(list)
119        exVal = list[index1]
120        list[index1] = list[index2]
121        list[index2] = exVal
122    return list
123
124# give a list of elements (e.g. [1,2,3,4,5,6,7]) and number of groups/classes (e.g. 2) and the number of switches you want to perform
125# we assume that the the first len(list)/groupCount elements belong to one group, the second to second group and so on. We only want
126# to switch elements inside a group, not elements between groups
127def switchTwoElementsInGroups(list, groupCount, nrOfTimes = 1):
128    # find group boundaries
129    count = len(list)
130    boundaries = []
131    index = 0
132    while groupCount > 0:
133        boundaries.append(int(ceil(count/float(groupCount))))
134        groupCount -= 1
135        count -= boundaries[-1]
136    currSum = 0
137    for i in range(len(boundaries)):
138        boundaries[i] = boundaries[i] + currSum
139        currSum = boundaries[i]
140
141    parts = []
142    boundaries = [0] + boundaries
143    for i in range(len(boundaries)-1):
144        parts.append(list[boundaries[i] : boundaries[i+1]])
145
146    # take parts, and for each part switch a few elements
147    retList = []
148    while parts != []:
149        count = int(ceil(nrOfTimes/float(len(parts))))
150        nrOfTimes -= count
151        part = parts.pop(random.randint(0, len(parts)-1))
152        retList += switchTwoElements(part, count)
153
154    return retList
155
156
157
158# ##########################################################################################
159# MEASURES FOR EVALUATING INTERESTINGNESS OF ATTRIBUTES
160# ##########################################################################################
161
162# class that measures correlation between continuous class value and continuous attribute
163class MeasureCorrelation:
164    def __init__(self):
165        pass
166
167    def __call__(self, attr, data):
168        return self.MeasureAttribute_info(attr, data)
169
170    def MeasureAttribute_info(self, attr, data):
171        table = data.select([attr, data.domain.classVar])
172        table = orange.Preprocessor_dropMissing(table)
173        a1 = [table[k][0].value for k in range(len(table))]
174        a2 = [table[k][1].value for k in range(len(table))]
175
176        val, prob = statc.pearsonr(a1, a2)
177        return val
178
179
180# fisher discriminant implemented to be used as orange.MeasureAttribute
181class MeasureFisherDiscriminant:
182    def __init__(self):
183        self.dataset = None
184        self.attrInfo = {}
185        self.stats = []
186
187    def __call__(self, attr, data):
188        return self.MeasureAttribute_info(attr, data)
189
190    def MeasureAttribute_info(self, attr, data):
191        # if basic statistics is not computed for this dataset -> compute it
192        if not (self.stats and self.dataset == data):
193            self.stats = {}
194            self.dataset = data
195
196            arr = [0] * len(data.domain.attributes)
197            for val in data.domain.classVar.values:
198                data2 = data.select({data.domain.classVar: val})
199                bas = orange.DomainBasicAttrStat(data2)
200                self.stats[val] = bas
201
202            for i in range(len(self.stats.keys())):
203                statI = self.stats[self.stats.keys()[i]]
204                if len(statI) == 0: continue
205                for j in range(i+1, len(self.stats.keys())):
206                    statJ = self.stats[self.stats.keys()[j]]
207                    if len(statJ) == 0: continue
208                    for attribute in range(len(data.domain.attributes)):
209                        if data.domain.attributes[attribute].varType != orange.VarTypes.Continuous: continue
210                        bottom = (statI[attribute].n * statI[attribute].dev + statJ[attribute].n * statJ[attribute].dev)
211                        if bottom == 0.0: bottom = 0.001
212                        val = abs(statI[attribute].avg - statJ[attribute].avg) * (statI[attribute].n + statJ[attribute].n)/bottom
213                        arr[attribute] += val
214
215            # normalize values in arr so that the largest value will be 1 and others will be proportionally smaller
216            largest = max(arr)
217            if largest != 0:
218                arr = [val/largest for val in arr]
219
220            for i in range(len(data.domain.attributes)):
221                self.attrInfo[data.domain.attributes[i].name] = arr[i]
222
223        return self.attrInfo[data.domain[attr].name]
224
225
226# signal to noise measure implemented as orange.MeasureAttribute
227class S2NMeasure:
228    def __init__(self):
229        self.attrInfo = {}
230        self.data = None
231
232    def __call__(self, attr, data):
233        # if the data changed clear the attribute values
234        if data != self.data:
235            self.attrInfo = {}
236            self.data = data
237
238        if self.attrInfo == {}:
239            classVar = data.domain.classVar
240            datas = [data.select({data.domain.classVar.name: [val]}) for val in data.domain.classVar.values]
241            stats = [orange.DomainBasicAttrStat(d) for d in datas]
242            cls = range(len(stats))
243            clsCount = len(stats)
244            for i in range(len(stats[0])):
245                if stats[0][i] == None: continue
246                temp = 0.0
247                for j in cls:
248                    for k in range(j+1, clsCount):
249                        if (stats[j][i].dev + stats[k][i].dev) > 0:
250                            temp += abs((stats[j][i].avg - stats[k][i].avg) / (stats[j][i].dev + stats[k][i].dev))
251                self.attrInfo[data.domain.attributes[i].name] = temp
252
253        if self.attrInfo.has_key(data.domain[attr].name):
254            return self.attrInfo[data.domain[attr].name]
255        else:
256            return -1
257
258
259# same class as above, just that we can use it to evaluate attribute for each class value
260class S2NMeasureMix(S2NMeasure):
261    def __init__(self):
262        S2NMeasure.__init__(self)
263        self.attrInfoMix = {}
264        self.dataMix = None
265        self.sortedAttrList = []
266
267    def __call__(self, attribute, data):
268
269        # if the data changed clear the attribute values
270        if data != self.dataMix:
271            self.attrInfoMix = {}
272            self.attrInfo = {}
273            self.dataMix = data
274
275        if self.attrInfoMix == {}:
276            attrs = range(len(data.domain.attributes))
277            classVar = data.domain.classVar
278            #shortData = data.select(attrs + [classVar])
279            datas = [data.select({classVar.name: [val]}) for val in classVar.values]
280            statistics = [orange.DomainBasicAttrStat(d) for d in datas]
281
282            cls = []
283            for classVarIndex, c in enumerate(classVar.values):   # for each class value compute how good is each attribute for discriminating this class value against all other
284                attrValsList = []
285                newData = mergeClassValues(data, c)
286                for attrIndex in range(len(attrs)):
287                    if data.domain[attrIndex].varType == orange.VarTypes.Discrete:      # ignore discrete attributes
288                        continue
289                    val = S2NMeasure.__call__(self, attrs[attrIndex], newData)
290                    if statistics[0][attrIndex] == None:
291                        attrValsList.append((0,attrs[attrIndex]))
292                    else:
293                        aves = [stat[attrIndex].avg for stat in statistics]
294                        if max(aves) != aves[classVarIndex] :
295                            val = -val
296                        attrValsList.append((val, attrs[attrIndex]))
297                attrValsList.sort()
298                attrValsList = [element[1] for element in attrValsList]     # remove the value
299                attrValsList.reverse()
300                cls.append(attrValsList)
301
302            attrPositionsDict = dict([(attr, []) for attr in cls[0]])
303            for arr in cls:
304                for i in range(len(arr)):
305                    attrPositionsDict[arr[i]].append(i)
306
307            numClasses = len(classVar.values)
308            currPos = [0 for i in range(numClasses)]
309            self.sortedAttrList = []
310            ableToAdd = 1
311            while ableToAdd:    # sometimes some attributes are duplicated. in such cases we will add only one instance of such attribute to the list
312                ableToAdd = 0
313                for i in range(numClasses):
314                    pos = currPos[i]
315                    while pos < len(cls[i]) and cls[i][pos] == None:
316                        pos += 1
317                    currPos[i] = pos + 1
318                    if pos >= len(cls[i]):
319                        continue
320                    ableToAdd = 1
321
322                    attr = cls[i][pos]
323                    self.sortedAttrList.append(attr)
324                    attrPositions = attrPositionsDict[attr]     # get indices in cls where attribute attr is placed
325                    for j in range(numClasses):
326                        cls[j][attrPositions[j]] = None
327
328            count = len(self.sortedAttrList)
329            for (i, attr) in enumerate(self.sortedAttrList):
330                self.attrInfoMix[data.domain[attr].name] = count-i
331
332        if self.attrInfoMix.has_key(data.domain[attribute].name):
333            return self.attrInfoMix[data.domain[attribute].name]
334        else:
335            return -1
336
337# ##########################################################################################
338# ##########################################################################################
339
340def mergeClassValues(data, value):
341    selection = orange.EnumVariable("Selection", values = ["0", "1"])
342
343    selectedClassesStr = [value]
344    nonSelectedClassesStr = []
345    for val in data.domain.classVar.values:
346        if val not in selectedClassesStr: nonSelectedClassesStr.append(val)
347
348    shortData1 = data.select({data.domain.classVar.name: selectedClassesStr})
349    shortData2 = data.select({data.domain.classVar.name: nonSelectedClassesStr})
350    d1 = orange.Domain(shortData1.domain.attributes + [selection])
351    selection.getValueFrom = lambda ex, what: orange.Value(selection, "0")
352    data1 = orange.ExampleTable(d1, shortData1)
353
354    selection.getValueFrom = lambda ex, what: orange.Value(selection, "1")
355    data2 = orange.ExampleTable(d1, shortData2)
356    data1.extend(data2)
357    return data1
358
359
360# almost equal to evaluateAttributesByEachClassValue function with one exception. if the class value that we want to discriminate
361# has a lower average value than the other average class values then we multiply quality of this attribute with -1
362# this way we get the attributes that have the highes expression and are also good at discrimination
363def findAttributeGroupsForRadviz(data, measure):
364    if isinstance(measure, S2NMeasureMix):
365        measure(data.domain.attributes[0].name, data)                       # just call measure to compute quality of all attributes
366        attrNames = [data.domain[ind].name for ind in measure.sortedAttrList]
367        numClasses = len(data.domain.classVar.values)
368        cls = [attrNames[i::numClasses] for i in range(numClasses)]
369    else:
370        attrVals = [(measure(attr.name, data), attr.name) for attr in data.domain.attributes]
371        attrVals.sort()
372        attrVals.reverse()
373        attrNames = [attrVals[i][1] for i in range(len(attrVals))]  # remove quality values of the attributes
374        cls = None
375
376    return attrNames, cls
377
378
379# for each class value test how good does the measure discriminate between this class value and all the other values merged
380# return a list of lists. Each list contains tuples (val, attr), where val is the attribute quality for attr at separating one class value from the others
381def evaluateAttributesByEachClassValue(data, measure, attrs):
382    cls = []
383    for c in data.domain.classVar.values:   # for each class value compute how good is each attribute for discriminating this class value against all other
384        v = []
385        data2 = mergeClassValues(data, c)
386        for attr in attrs:
387            v.append((measure(attr, data2), attr))
388        v.sort()
389        cls.append(v)
390    return cls
391
392# used by VizRank to evaluate attributes
393def evaluateAttributesDiscClass(data, contMeasure, discMeasure):
394    attrs = []
395    #corr = MeasureCorrelation()
396    for attr in data.domain.attributes:
397        #if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((corr(attr.name, data), attr.name))
398        if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((1, attr.name))
399        elif data.domain.classVar.varType == orange.VarTypes.Continuous:            attrs.append((0.0, attr.name))
400        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
401        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
402        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
403        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
404
405    if discMeasure or contMeasure:
406        attrs.sort()
407        attrs.reverse()
408
409    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
410
411def evaluateAttributesContClass(data, contMeasure, discMeasure):
412    attrs = []
413    for attr in data.domain.attributes:
414        attrs.append((1, attr.name))
415    #        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
416    #        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
417    #        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
418    #        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
419
420    if discMeasure or contMeasure:
421        attrs.sort()
422        attrs.reverse()
423
424    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
425
426
427def evaluateAttributesNoClass(data, contMeasure, discMeasure):
428    attrs = []
429    for attr in data.domain.attributes:
430        attrs.append((1, attr.name))
431    #        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
432    #        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
433    #        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
434    #        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
435
436    if discMeasure or contMeasure:
437        attrs.sort()
438        attrs.reverse()
439
440    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
441
442
443
444
445# ##############################################################################################
446# ##############################################################################################
447
448
449# SELECT ATTRIBUTES ##########################
450def selectAttributes(data, attrContOrder, attrDiscOrder, projections = None):
451    if data.domain.classVar == None or data.domain.classVar.varType != orange.VarTypes.Discrete:
452        return ([attr.name for attr in data.domain.attributes], [], 0)
453
454    shown = [data.domain.classVar.name]; hidden = []; maxIndex = 0    # initialize outputs
455
456    # # both are RELIEF
457    if attrContOrder == "ReliefF" and attrDiscOrder == "ReliefF":
458        attrVals = scoring.score_all(data, orange.MeasureAttribute_relief())
459        s,h = getTopAttrs(attrVals, 0.95)
460        return (shown + s, hidden + h, 0)
461
462    # # both are NONE
463    elif attrContOrder == "None" and attrDiscOrder == "None":
464        for item in data.domain.attributes:    shown.append(item.name)
465        return (shown, hidden, 0)
466
467
468    # disc and cont attribute list
469    discAttrs = []; contAttrs = []
470    for attr in data.domain.attributes:
471        if attr.varType == orange.VarTypes.Continuous: contAttrs.append(attr.name)
472        elif attr.varType == orange.VarTypes.Discrete: discAttrs.append(attr.name)
473
474
475    ###############################
476    # sort continuous attributes
477    if attrContOrder == "None":
478        shown += contAttrs
479    elif attrContOrder in ["ReliefF", "Fisher discriminant", "Signal to Noise", "Signal to Noise For Each Class"]:
480        if attrContOrder == "ReliefF":               measure = orange.MeasureAttribute_relief(k=10, m=50)
481        elif attrContOrder == "Fisher discriminant": measure = MeasureFisherDiscriminant()
482        elif attrContOrder == "Signal to Noise":     measure = S2NMeasure()
483        else:                                        measure = S2NMeasureMix()
484
485        dataNew = data.select(contAttrs + [data.domain.classVar])
486        attrVals = scoring.score_all(dataNew, measure)
487        s,h = getTopAttrs(attrVals, 0.95)
488        shown += s
489        hidden += h
490    else:
491        print "Unknown value for attribute order: ", attrContOrder
492
493    # ###############################
494    # sort discrete attributes
495    if attrDiscOrder == "None":
496        shown += discAttrs
497    elif attrDiscOrder == "GainRatio" or attrDiscOrder == "Gini" or attrDiscOrder == "ReliefF":
498        if attrDiscOrder == "GainRatio":   measure = orange.MeasureAttribute_gainRatio()
499        elif attrDiscOrder == "Gini":       measure = orange.MeasureAttribute_gini()
500        else:                               measure = orange.MeasureAttribute_relief()
501
502        dataNew = data.select(discAttrs + [data.domain.classVar])
503        attrVals = scoring.score_all(dataNew, measure)
504        s,h = getTopAttrs(attrVals, 0.95)
505        shown += s; hidden += h
506
507    elif attrDiscOrder == "Oblivious decision graphs":
508        #shown.append(data.domain.classVar.name)
509        attrs = getFunctionalList(data)
510        for item in attrs:
511            shown.append(item)
512        for attr in data.domain.attributes:
513            if attr.name not in shown and attr.varType == orange.VarTypes.Discrete:
514                hidden.append(attr.name)
515    else:
516        print "Unknown value for attribute order: ", attrDiscOrder
517
518    return (shown, hidden, maxIndex)
519
520
521def getTopAttrs(results, maxSum = 0.95, onlyPositive = 1):
522    s = []; h = []
523    sum = 0
524    for (attr, val) in results:
525        if not onlyPositive or val > 0: sum += val
526    tempSum = 0
527    for (attr, val) in results:
528        if tempSum < maxSum*sum: s.append(attr)
529        else: h.append(attr)
530        tempSum += val
531    return (s, h)
532
533
534# ##########################################################################################
535# PARALLEL COORDINATES FUNCTIONS
536# ##########################################################################################
537
538# find interesting attribute order for parallel coordinates
539# attrInfo = [(val1, attr1, attr2), .... ]
540def optimizeAttributeOrder(attrInfo, numberOfAttributes, optimizationDlg, app):
541    while (attrInfo != []):
542        proj = []
543        projVal = []
544        canAddAttribute = 1
545        while canAddAttribute:
546            if not optimizationDlg.canContinueOptimization(): return
547            app.processEvents()        # allow processing of other events
548
549            if len(proj) == 0:
550                proj = [attrInfo[0][1], attrInfo[0][2]]
551                projVal = [attrInfo[0][0]]
552            elif len(proj) == numberOfAttributes:
553                canAddAttribute = 0     # time to add the projection to the list
554            else:
555                proj, projVal, success = addBestToCurrentProj(proj, projVal, attrInfo)
556                if not success: canAddAttribute = 0 # there are no more attributes that can be added to this projection
557
558        if len(proj) == numberOfAttributes:
559            # I (Ales) commented this out because fixItersectingPairs can enter an endless cycle
560            #proj, projVal = fixIntersectingPairs(proj, projVal, attrInfo)
561            for i in range(len(proj)-1):
562                removeAttributePair(proj[i], proj[i+1], attrInfo)
563            optimizationDlg.addProjection(sum(projVal)/len(projVal), proj)
564        else:
565            for i in range(len(proj)-1):
566                removeAttributePair(proj[i], proj[i+1], attrInfo)
567
568
569# try rotating subsequences of proj to increase value of attribute order
570def fixIntersectingPairs(proj, projVal, attrInfo):
571    changed = 1
572    allprojections = set()
573    while changed:
574        changed = 0
575        for i in range(len(projVal)-1):
576            if changed: continue
577            for j in range(i+2, len(projVal)-1):
578                if changed: continue
579                val1, exists1 = getAttributePairValue(proj[i], proj[j], attrInfo)
580                val2, exists2 = getAttributePairValue(proj[i+1], proj[j+1], attrInfo)
581                if exists1 and exists2 and (val1 + val2 > projVal[i] + projVal[j]):
582                    projVal[i] = val1
583                    projVal[j] = val2
584                    rev = proj[i+1:j+1]
585                    rev.reverse()
586                    tempProj = proj[:i+1] + rev + proj[j+1:]
587                    proj = tempProj
588                    changed = 1     # we rotated the projection. start checking from the begining
589
590        tproj = tuple(proj)
591        assert(tproj not in allprojections)
592        allprojections.add(tproj)
593
594    return proj, projVal
595
596# return value for attribute pair (val, attr1, attr2) if exists. if not, return 0
597def getAttributePairValue(attr1, attr2, attrInfo):
598    for (val, a1, a2) in attrInfo:
599        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1): return (val, 1)
600    return (0, 0)
601
602# remove attribute pair (val, attr1, attr2) from attrInfo
603def removeAttributePair(attr1, attr2, attrInfo):
604    for (val, a1, a2) in attrInfo:
605        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1):
606            attrInfo.remove((val, a1, a2))
607            return
608    print "failed to remove attribute pair", attr1, attr2
609
610
611def addBestToCurrentProj(proj, projVal, attrInfo):
612    for (val, a1, a2) in attrInfo:
613        if (a1 == proj[0] and a2 not in proj) or (a2 == proj[0] and a1 not in proj) or (a1 == proj[-1] and a2 not in proj) or (a2 == proj[-1] and a1 not in proj):
614            if a1 == proj[0]: return ([a2] + proj, [val] + projVal, 1)
615            elif a2 == proj[0]: return ([a1] + proj, [val] + projVal, 1)
616            elif a1 == proj[-1]: return (proj + [a2], projVal + [val], 1)
617            else                       : return (proj + [a1], projVal + [val], 1)
618
619    """
620    for (val, a1, a2) in attrInfo:
621        if a1 in proj and a2 in proj: continue
622        if a1 not in proj and a2 not in proj: continue
623       
624        if (a1 not in proj) and (a2 in proj): placed = a2; place = a1
625        else:                                  placed = a1; place = a2
626           
627        ind = proj.index(placed)
628        if ind > 0:
629            val2, exists = getAttributePairValue(place, proj[ind-1], attrInfo)
630            if exists:
631                proj.insert(ind, place)
632                projVal[ind-1] = val2
633                projVal.insert(ind-1, val)
634                return (proj, projVal, 1)
635        if ind < len(proj)-1:
636            val2, exists = getAttributePairValue(place, proj[ind+1], attrInfo)
637            if exists:
638                proj.insert(ind+1, place)
639                projVal[ind] = val
640                projVal.insert(ind, val2)
641                return (proj, projVal, 1)
642    """
643    return proj, projVal, 0
644
645def computeCorrelationBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
646    correlations = []
647    attrListLen = len(attrList)
648    iterCount = attrListLen * (attrListLen - 1) / 2
649    iter = 0
650    milestones = progress_bar_milestones(iterCount)
651    for i in range(len(attrList)):
652        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
653            continue
654        for j in range(i+1, len(attrList)):
655            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
656                continue
657            val = abs(computeCorrelation(data, attrList[i], attrList[j]))
658            if val >= minCorrelation:
659                correlations.append((val, attrList[i], attrList[j]))
660            iter += 1
661            if progressCallback and iter in milestones:
662                progressCallback(100.0 * iter / iterCount)
663
664    return sorted(correlations, reverse=True)
665
666
667def computeCorrelationInsideClassesBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
668    if not data.domain.classVar or data.domain.classVar.varType == orange.VarTypes.Continuous:
669        return []
670    correlations = []
671    attrListLen = len(attrList)
672    iterCount = attrListLen * (attrListLen - 1) / 2
673    iter = 0
674    milestones = progress_bar_milestones(iterCount)
675    for i in range(len(attrList)):
676        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
677            continue
678        for j in range(i+1, len(attrList)):
679            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
680                continue
681            corr, corrs, lengths = computeCorrelationInsideClasses(data, attrList[i], attrList[j])
682            if corr >= minCorrelation:
683                correlations.append((corr, attrList[i], attrList[j]))
684            iter += 1
685            if progressCallback and iter in milestones:
686                progressCallback(100.0 * iter / iterCount)
687
688    return sorted(correlations, reverse=True)
689
690
691def computeCorrelationInsideClasses(data, attr1, attr2):
692    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
693    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
694
695    table = data.select([attr1, attr2, data.domain.classVar])
696    table = orange.Preprocessor_dropMissing(table)
697    lengths = []; corrs = []
698    for val in table.domain.classVar.values:
699        tab = table.filter({table.domain.classVar: val})
700        a1 = [tab[k][attr1].value for k in range(len(tab))]
701        a2 = [tab[k][attr2].value for k in range(len(tab))]
702        if len(a1) == 0: continue
703        val, prob = statc.pearsonr(a1, a2)
704        lengths.append(len(a1))
705        corrs.append(val)
706    corr = 0
707    for ind in range(len(corrs)): corr += abs(corrs[ind])*lengths[ind]
708    corr /= sum(lengths)
709    return corr, corrs, lengths
710
711# compute correlation between two continuous attributes
712def computeCorrelation(data, attr1, attr2):
713    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
714    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
715
716    table = data.select([attr1, attr2])
717    table = orange.Preprocessor_dropMissing(table)
718    a1 = [table[k][attr1].value for k in range(len(table))]
719    a2 = [table[k][attr2].value for k in range(len(table))]
720
721    try:
722        val, prob = statc.pearsonr(a1, a2)
723    except:
724        val = 0.0    # possibly invalid a1 or a2
725
726    return val
727
728# ##########################################################################################
729# ##########################################################################################
730
731
732# ##########################################################################################
733# #### FUNCTIONS FOR CALCULATING ATTRIBUTE ORDER USING Oblivious decision graphs
734# ##########################################################################################
735def replaceAttributes(index1, index2, merged, data):
736    attrs = list(data.domain)
737    attrs.remove(data.domain[index1])
738    attrs.remove(data.domain[index2])
739    domain = orange.Domain(attrs+ [merged])
740    return data.select(domain)
741
742
743def getFunctionalList(data):
744    import orngCI
745
746    bestQual = -10000000
747    bestAttr = -1
748    testAttrs = []
749
750    dataShort = orange.Preprocessor_dropMissing(data)
751    # remove continuous attributes from data
752    disc = []
753    for i in range(len(dataShort.domain.attributes)):
754        # keep only discrete attributes that have more than one value
755        if dataShort.domain.attributes[i].varType == orange.VarTypes.Discrete and len(dataShort.domain.attributes[i].values) > 1: disc.append(dataShort.domain.attributes[i].name)
756    if disc == []: return []
757    discData = dataShort.select(disc + [dataShort.domain.classVar.name])
758
759    remover = orngCI.AttributeRedundanciesRemover(noMinimization = 1)
760    newData = remover(discData, weight = 0)
761
762    for attr in newData.domain.attributes: testAttrs.append(attr.name)
763
764    # compute the best attribute combination
765    for i in range(len(newData.domain.attributes)):
766        vals, qual = orngCI.FeatureByMinComplexity(newData, [newData.domain.attributes[i], newData.domain.classVar])
767        if qual > bestQual:
768            bestQual = qual
769            bestAttr = newData.domain.attributes[i].name
770            mergedVals = vals
771            mergedVals.name = newData.domain.classVar.name
772
773    if bestAttr == -1: return []
774    outList = [bestAttr]
775    newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
776    testAttrs.remove(bestAttr)
777
778    while (testAttrs != []):
779        bestQual = -10000000
780        for attrName in testAttrs:
781            vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, attrName])
782            if qual > bestQual:
783                bestqual = qual
784                bestAttr = attrName
785
786        vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, bestAttr])
787        mergedVals = vals
788        mergedVals.name = newData.domain.classVar.name
789        newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
790        outList.append(bestAttr)
791        testAttrs.remove(bestAttr)
792
793    # new attributes have "'" at the end of their names. we have to remove that in ored to identify them in the old domain
794    for index in range(len(outList)):
795        if outList[index][-1] == "'": outList[index] = outList[index][:-1]
796    return outList
797
798
799# ##########################################################################################
800# POLYVIZ FUNCTIONS USED IN VIZRANK.
801# ##########################################################################################
802
803# input: array where items are arrays, e.g.: [[1,2,3], [1,3,2], [2,1,3], [2,3,1], [3,1,2], [3,2,1]]
804# returns: array, where some items are removed. it rotates items in each array and checks if the item is already in the array. it also tries to flip the items ([::-1])
805# for the above case, it would return [[1,2,3]]
806# used for radviz
807def removeRotationDuplicates(arr, removeFlipDuplicates = 1):
808    final = []
809    projDict = {}
810    for a in arr:
811        if type(a[0]) == int:
812            key = tuple([a[i]-a[i+1] for i in range(-1, len(a)-1)])
813            rkey = tuple([a[i+1]-a[i] for i in range(-1, len(a)-1)])
814            o = min(a)      # offset. needed in cases when we get in arr: [0,1,2] and [1,2,3]. otherwise it would remove the second element because it would look the same as the first
815        else:
816            key = tuple([(a[i][0]-a[i+1][0], a[i][1]-a[i+1][1]) for i in range(-1, len(a)-1)])
817            rkey = tuple([(a[i+1][0]-a[i][0], a[i+1][1]-a[i][1]) for i in range(-1, len(a)-1)])
818            o = 0
819
820        rotations = [key[i:] + key[:i] for i in range(len(key))]
821        if 1 in [projDict.has_key((o,key)) for key in rotations]:
822            continue
823
824        rotations = [rkey[i:] + rkey[:i] for i in range(len(rkey))]
825        if removeFlipDuplicates and 1 in [projDict.has_key((o,key)) for key in rotations]:
826            continue
827
828        projDict[(o,key)] = a
829
830        """
831        found = 0
832        kkey = copy.copy(key)
833       
834        for i in range(len(kkey)):
835            kkey = tuple(list(kkey[1:]) + [kkey[0]])
836            if projDict.has_key((o,kkey)):
837                found = 1
838                break
839
840        # try also if there is a flipped duplicate
841        if not found and removeFlipDuplicates:
842            for i in range(len(rkey)):
843                rkey = tuple(list(rkey[1:]) + [rkey[0]])
844                if projDict.has_key((o,rkey)):
845                    found = 1
846                    break
847        if not found: projDict[(o,key)] = a
848        """
849    return projDict.values()
850
851# create possible combinations with the given set of numbers in arr
852def createMixCombinations(arrs, removeFlipDuplicates):
853
854    def addProjs(projs, count, i):
855        ret = []
856        perms = permutations(range(count))
857        for perm in perms:
858            c = copy.copy(projs)
859            add = [(i, p) for p in perm]
860            c = [p + add for p in c]
861            ret += c
862        return ret
863
864    projs = [[]]
865    for i in range(len(arrs)):
866        projs = addProjs(projs, arrs[i], i)
867    return removeRotationDuplicates(projs, removeFlipDuplicates)
868
869
870# get a list of possible projections if we have numClasses and want to use maxProjLen attributes
871# removeFlipDuplicates tries to flip the attributes in the projection and removes it if the projection already exists
872# removeFlipDuplicates = 1 for radviz and =0 for polyviz
873def createProjections(numClasses, maxProjLen, removeFlipDuplicates = 1):
874    if _projectionListDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
875        return _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)]
876
877    # create array of arrays of lengths, e.g. [3,3,2] that will tell that we want 3 attrs from the 1st class, 3 from 2nd and 2 from 3rd
878    if maxProjLen % numClasses != 0:
879        cs = combinations(range(numClasses), maxProjLen % numClasses)
880        equal = [int(maxProjLen / numClasses) for i in range(numClasses)]
881        lens = [copy.copy(equal) for comb in cs]     # make array of arrays
882        for i, comb in enumerate(cs):
883            for val in comb: lens[i][val] += 1
884    else:
885        lens = [[int(maxProjLen / numClasses) for i in range(numClasses)]]
886
887    combs = []
888    seen = []
889    for l in lens:
890        withoutZeros = filter(None, l)          # if numClasses > maxProjLen then some values in l are 0. we have to remove this zeros and check if we have already added such combinations.
891        if withoutZeros not in seen:
892            tempCombs = createMixCombinations(withoutZeros, removeFlipDuplicates)
893            combs += tempCombs
894            seen.append(withoutZeros)
895
896    if _differentClassPermutationsDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
897        perms = _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)]
898    else:
899        if numClasses <= maxProjLen: perms = permutations(range(numClasses))
900        else:                        perms = combinations(range(numClasses), maxProjLen)
901        perms = removeRotationDuplicates(perms, removeFlipDuplicates)
902        _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)] = perms
903
904    final = []
905    for perm in perms:
906        final += [[(perm[i], j) for (i,j) in comb] for comb in combs]
907
908    _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)] = final
909    return final
910
911
912if __name__=="__main__":
913    """
914    print "possible splits of ['a','b','c'] are ", getPossibleSplits(["a","b","c"])
915    print "possible combinations of 2 elements of the array [1,2,3,4] are ", combinations([1,2,3,4],2)
916    print "permutations of [1,2,3] are ", [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]
917    print "array with two randomly switched elements of [1,2,3,4,5] is", switchTwoElements([1,2,3,4,5])
918
919    data = orange.ExampleTable(r"E:\Development\Python23\Lib\site-packages\Orange\Datasets\microarray\cancer diagnostics\leukemia_tran.tab")
920    a = data.toNumpy("ac")[0]
921    c = S2NMeasure()
922    c(data.domain.attributes[0].name, data)
923    """
924    final = createProjections(8,4)
925    print final
Note: See TracBrowser for help on using the repository browser.