source: orange/orange/orngVisFuncts.py @ 8042:ffcb93bc9028

Revision 8042:ffcb93bc9028, 39.2 KB checked in by markotoplak, 3 years ago (diff)

Hierarchical clustering: also catch RuntimeError when importing matplotlib (or the documentation could not be built on server).

Line 
1import orange, orngFSS, statc, random
2from orngMisc import progressBarMilestones
3import copy
4from math import ceil
5##import numpy
6##from LinearAlgebra import *
7
8_differentClassPermutationsDict = {}
9_projectionListDict = {}
10
11
12# ##########################################################################################
13# MOSAIC OPTIMIZATION FUNCTION
14# ##########################################################################################
15
16_possibleSplits = {}
17_possibleSplits[1] = [[[0]]]
18_possibleSplits[2] = [[[0], [1]]]
19_possibleSplits[3] = [[[0], [1,2]], [[0,1],[2]], [[0,2], [1]], [[0], [1], [2]]]
20_possibleSplits[4] = [[[0], [1,2,3]], [[1], [0,2,3]], [[2], [0,1,3]], [[3], [0,1,2]],
21    [[0,1], [2,3]], [[0,2], [1,3]], [[0,3], [1,2]], [[0,1,2], [3]], [[0,1,3], [2]], [[0,2,3], [1]], [[1,2,3], [0]],
22    [[0], [1], [2,3]], [[0], [2], [1,3]], [[0], [3], [1,2]], [[1], [2], [0,3]], [[1], [3], [0,2]], [[2], [3], [0,1]], [[0], [1], [2], [3]]]
23
24# get possible splits of attributes in attrs into subgroups , e.g.: ["a","b","c"] -> [[['a'], ['b', 'c']], [['a', 'b'], ['c']], [['a', 'c'], ['b']], [['a'], ['b'], ['c']]]
25def getPossibleSplits(attrs):
26    if len(attrs) > 4: return []
27    retSplit = []
28    for split in _possibleSplits[len(attrs)]:
29        parts = []
30        for arr in split:
31            a = [attrs[i] for i in arr]
32            parts.append(a)
33        retSplit.append(parts)
34    return retSplit
35
36# ##########################################################################################
37# MISC FUNCTIONS
38# ##########################################################################################
39   
40# take a number and return a formated string, eg: 2341232 -> "2,341,232"
41def createStringFromNumber(num):
42    s = str(num)
43    arr = range(len(s)-2)[:0:-3]
44    for i in arr:
45        s = s[:i] + "," + s[i:]
46    return s
47       
48
49# factoriela
50def fact(i):
51        ret = 1
52        for j in range(2, i+1): ret*= j
53        return ret
54
55# get a sublist of permutations of list. Return only those permutations that cannot be attained by rotating one other permutation.
56# used in radviz where we have to try different placements of attributes
57def generateDifferentPermutations(list):
58    if len(list) == 0: return []
59    temp = permutations(list[:-1])
60    newPerms = {}
61    for p in temp:
62        if not newPerms.has_key(tuple(p[::-1])):
63            newPerms[tuple(p)] = p
64    lastVal = list[-1]
65    return [p + [lastVal] for p in newPerms.values()]
66   
67
68# compute permutations of elements in list
69def permutations(list):
70    if list==[]:
71        return [[]]
72    else:
73        return [ [list[i]] + p for i in range(len(list)) for p in permutations(list[:i] + list[i+1:])]
74
75# return number of combinations where we select "select" from "total"
76def combinationsCount(select, total):
77    return fact(total)/ (fact(total-select)*fact(select))
78
79# get the actual combinations of items
80def combinations(items, count):
81    if count > len(items): return []
82    answer = []
83    indices = range(count)
84    if indices == []: return [[]]   # if count = 0 return empty list
85   
86    indices[-1] = indices[-1] - 1
87    while 1:
88        limit = len(items) - 1
89        i = count - 1
90        while i >= 0 and indices[i] == limit:
91            i = i - 1
92            limit = limit - 1
93        if i < 0: break
94
95        val = indices[i]
96        for i in xrange( i, count ):
97            val = val + 1
98            indices[i] = val
99        temp = []
100        for i in indices:
101            temp.append( items[i] )
102        answer.append( temp )
103    return answer
104
105# ##########################################################################################
106# VIZRANK FUNCTIONS
107# ##########################################################################################
108
109# randomly switch two elements in a list. repeat nrOfTimes.
110def switchTwoElements(list, nrOfTimes = 1):
111    if len(list) < 2: return list
112    for i in range(nrOfTimes):
113        index1 = random.randint(0, len(list)-1)
114        index2 = random.randint(0, len(list)-1)
115        if index1 == index2:
116            index2 = (index1 + 1) % len(list)
117        exVal = list[index1]
118        list[index1] = list[index2]
119        list[index2] = exVal
120    return list
121
122# give a list of elements (e.g. [1,2,3,4,5,6,7]) and number of groups/classes (e.g. 2) and the number of switches you want to perform
123# we assume that the the first len(list)/groupCount elements belong to one group, the second to second group and so on. We only want
124# to switch elements inside a group, not elements between groups
125def switchTwoElementsInGroups(list, groupCount, nrOfTimes = 1):
126    # find group boundaries
127    count = len(list)
128    boundaries = []
129    index = 0
130    while groupCount > 0:
131        boundaries.append(int(ceil(count/float(groupCount))))
132        groupCount -= 1
133        count -= boundaries[-1]
134    currSum = 0
135    for i in range(len(boundaries)):
136        boundaries[i] = boundaries[i] + currSum
137        currSum = boundaries[i]
138
139    parts = []
140    boundaries = [0] + boundaries
141    for i in range(len(boundaries)-1):
142        parts.append(list[boundaries[i] : boundaries[i+1]])
143
144    # take parts, and for each part switch a few elements
145    retList = []
146    while parts != []:
147        count = int(ceil(nrOfTimes/float(len(parts))))
148        nrOfTimes -= count
149        part = parts.pop(random.randint(0, len(parts)-1))
150        retList += switchTwoElements(part, count)
151
152    return retList
153
154
155
156# ##########################################################################################
157# MEASURES FOR EVALUATING INTERESTINGNESS OF ATTRIBUTES
158# ##########################################################################################
159
160# class that measures correlation between continuous class value and continuous attribute
161class MeasureCorrelation:
162    def __init__(self):
163        pass
164
165    def __call__(self, attr, data):
166        return self.MeasureAttribute_info(attr, data)
167
168    def MeasureAttribute_info(self, attr, data):
169        table = data.select([attr, data.domain.classVar])
170        table = orange.Preprocessor_dropMissing(table)
171        a1 = [table[k][0].value for k in range(len(table))]
172        a2 = [table[k][1].value for k in range(len(table))]
173       
174        val, prob = statc.pearsonr(a1, a2)
175        return val
176       
177
178# fisher discriminant implemented to be used as orange.MeasureAttribute
179class MeasureFisherDiscriminant:
180    def __init__(self):
181        self.dataset = None
182        self.attrInfo = {}
183        self.stats = []
184
185    def __call__(self, attr, data):
186        return self.MeasureAttribute_info(attr, data)
187
188    def MeasureAttribute_info(self, attr, data):
189        # if basic statistics is not computed for this dataset -> compute it
190        if not (self.stats and self.dataset == data):
191            self.stats = {}
192            self.dataset = data
193
194            arr = [0] * len(data.domain.attributes)
195            for val in data.domain.classVar.values:
196                data2 = data.select({data.domain.classVar: val})
197                bas = orange.DomainBasicAttrStat(data2)
198                self.stats[val] = bas
199
200            for i in range(len(self.stats.keys())):
201                statI = self.stats[self.stats.keys()[i]]
202                if len(statI) == 0: continue
203                for j in range(i+1, len(self.stats.keys())):
204                    statJ = self.stats[self.stats.keys()[j]]
205                    if len(statJ) == 0: continue
206                    for attribute in range(len(data.domain.attributes)):
207                        if data.domain.attributes[attribute].varType != orange.VarTypes.Continuous: continue
208                        bottom = (statI[attribute].n * statI[attribute].dev + statJ[attribute].n * statJ[attribute].dev)
209                        if bottom == 0.0: bottom = 0.001
210                        val = abs(statI[attribute].avg - statJ[attribute].avg) * (statI[attribute].n + statJ[attribute].n)/bottom
211                        arr[attribute] += val
212
213            # normalize values in arr so that the largest value will be 1 and others will be proportionally smaller
214            largest = max(arr)
215            if largest != 0:
216                arr = [val/largest for val in arr]
217
218            for i in range(len(data.domain.attributes)):
219                self.attrInfo[data.domain.attributes[i].name] = arr[i]
220
221        return self.attrInfo[data.domain[attr].name]
222
223
224# signal to noise measure implemented as orange.MeasureAttribute
225class S2NMeasure:
226    def __init__(self):
227        self.attrInfo = {}
228        self.data = None
229
230    def __call__(self, attr, data):
231        # if the data changed clear the attribute values
232        if data != self.data:
233            self.attrInfo = {}
234            self.data = data
235
236        if self.attrInfo == {}:
237            classVar = data.domain.classVar
238            datas = [data.select({data.domain.classVar.name: [val]}) for val in data.domain.classVar.values]
239            stats = [orange.DomainBasicAttrStat(d) for d in datas]
240            cls = range(len(stats))
241            clsCount = len(stats)
242            for i in range(len(stats[0])):
243                if stats[0][i] == None: continue
244                temp = 0.0
245                for j in cls:
246                    for k in range(j+1, clsCount):
247                        if (stats[j][i].dev + stats[k][i].dev) > 0:
248                            temp += abs((stats[j][i].avg - stats[k][i].avg) / (stats[j][i].dev + stats[k][i].dev))
249                self.attrInfo[data.domain.attributes[i].name] = temp
250               
251        if self.attrInfo.has_key(data.domain[attr].name):
252            return self.attrInfo[data.domain[attr].name]
253        else:
254            return -1
255
256
257# same class as above, just that we can use it to evaluate attribute for each class value
258class S2NMeasureMix(S2NMeasure):
259    def __init__(self):
260        S2NMeasure.__init__(self)
261        self.attrInfoMix = {}
262        self.dataMix = None
263        self.sortedAttrList = []
264       
265    def __call__(self, attribute, data):
266               
267        # if the data changed clear the attribute values
268        if data != self.dataMix:
269            self.attrInfoMix = {}
270            self.attrInfo = {}
271            self.dataMix = data
272
273        if self.attrInfoMix == {}:
274            attrs = range(len(data.domain.attributes))
275            classVar = data.domain.classVar
276            #shortData = data.select(attrs + [classVar])
277            datas = [data.select({classVar.name: [val]}) for val in classVar.values]
278            statistics = [orange.DomainBasicAttrStat(d) for d in datas]
279
280            cls = []
281            for classVarIndex, c in enumerate(classVar.values):   # for each class value compute how good is each attribute for discriminating this class value against all other
282                attrValsList = []
283                newData = mergeClassValues(data, c)
284                for attrIndex in range(len(attrs)):
285                    if data.domain[attrIndex].varType == orange.VarTypes.Discrete:      # ignore discrete attributes
286                        continue   
287                    val = S2NMeasure.__call__(self, attrs[attrIndex], newData)
288                    if statistics[0][attrIndex] == None:
289                        attrValsList.append((0,attrs[attrIndex]))
290                    else:
291                        aves = [stat[attrIndex].avg for stat in statistics]
292                        if max(aves) != aves[classVarIndex] :
293                            val = -val
294                        attrValsList.append((val, attrs[attrIndex]))
295                attrValsList.sort()
296                attrValsList = [element[1] for element in attrValsList]     # remove the value
297                attrValsList.reverse()
298                cls.append(attrValsList)
299
300            attrPositionsDict = dict([(attr, []) for attr in cls[0]])
301            for arr in cls:
302                for i in range(len(arr)):
303                    attrPositionsDict[arr[i]].append(i)
304           
305            numClasses = len(classVar.values)
306            currPos = [0 for i in range(numClasses)]
307            self.sortedAttrList = []
308            ableToAdd = 1
309            while ableToAdd:    # sometimes some attributes are duplicated. in such cases we will add only one instance of such attribute to the list
310                ableToAdd = 0
311                for i in range(numClasses):
312                    pos = currPos[i]
313                    while pos < len(cls[i]) and cls[i][pos] == None:
314                        pos += 1
315                    currPos[i] = pos + 1
316                    if pos >= len(cls[i]):
317                        continue
318                    ableToAdd = 1
319                   
320                    attr = cls[i][pos]
321                    self.sortedAttrList.append(attr)
322                    attrPositions = attrPositionsDict[attr]     # get indices in cls where attribute attr is placed
323                    for j in range(numClasses):
324                        cls[j][attrPositions[j]] = None
325                   
326            count = len(self.sortedAttrList)
327            for (i, attr) in enumerate(self.sortedAttrList):
328                self.attrInfoMix[data.domain[attr].name] = count-i
329
330        if self.attrInfoMix.has_key(data.domain[attribute].name):
331            return self.attrInfoMix[data.domain[attribute].name]
332        else:
333            return -1
334   
335# ##########################################################################################
336# ##########################################################################################
337
338def mergeClassValues(data, value):
339    selection = orange.EnumVariable("Selection", values = ["0", "1"])
340
341    selectedClassesStr = [value]
342    nonSelectedClassesStr = []
343    for val in data.domain.classVar.values:
344        if val not in selectedClassesStr: nonSelectedClassesStr.append(val)
345               
346    shortData1 = data.select({data.domain.classVar.name: selectedClassesStr})
347    shortData2 = data.select({data.domain.classVar.name: nonSelectedClassesStr})
348    d1 = orange.Domain(shortData1.domain.attributes + [selection])
349    selection.getValueFrom = lambda ex, what: orange.Value(selection, "0")
350    data1 = orange.ExampleTable(d1, shortData1)
351
352    selection.getValueFrom = lambda ex, what: orange.Value(selection, "1")
353    data2 = orange.ExampleTable(d1, shortData2)
354    data1.extend(data2)
355    return data1
356
357
358# almost equal to evaluateAttributesByEachClassValue function with one exception. if the class value that we want to discriminate
359# has a lower average value than the other average class values then we multiply quality of this attribute with -1
360# this way we get the attributes that have the highes expression and are also good at discrimination
361def findAttributeGroupsForRadviz(data, measure):
362    if isinstance(measure, S2NMeasureMix):
363        measure(data.domain.attributes[0].name, data)                       # just call measure to compute quality of all attributes
364        attrNames = [data.domain[ind].name for ind in measure.sortedAttrList]
365        numClasses = len(data.domain.classVar.values)
366        cls = [attrNames[i::numClasses] for i in range(numClasses)]
367    else:
368        attrVals = [(measure(attr.name, data), attr.name) for attr in data.domain.attributes]
369        attrVals.sort()
370        attrVals.reverse()
371        attrNames = [attrVals[i][1] for i in range(len(attrVals))]  # remove quality values of the attributes
372        cls = None
373
374    return attrNames, cls
375   
376   
377# for each class value test how good does the measure discriminate between this class value and all the other values merged
378# return a list of lists. Each list contains tuples (val, attr), where val is the attribute quality for attr at separating one class value from the others
379def evaluateAttributesByEachClassValue(data, measure, attrs):
380    cls = []
381    for c in data.domain.classVar.values:   # for each class value compute how good is each attribute for discriminating this class value against all other
382        v = []
383        data2 = mergeClassValues(data, c)
384        for attr in attrs:
385            v.append((measure(attr, data2), attr))
386        v.sort()
387        cls.append(v)
388    return cls
389
390# used by VizRank to evaluate attributes
391def evaluateAttributesDiscClass(data, contMeasure, discMeasure):
392    attrs = []
393    #corr = MeasureCorrelation()
394    for attr in data.domain.attributes:
395        #if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((corr(attr.name, data), attr.name))
396        if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((1, attr.name))
397        elif data.domain.classVar.varType == orange.VarTypes.Continuous:            attrs.append((0.0, attr.name))
398        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
399        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
400        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
401        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
402
403    if discMeasure or contMeasure:
404        attrs.sort()
405        attrs.reverse()
406
407    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
408
409def evaluateAttributesContClass(data, contMeasure, discMeasure):
410    attrs = []
411    for attr in data.domain.attributes:
412        attrs.append((1, attr.name))
413#        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
414#        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
415#        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
416#        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
417
418    if discMeasure or contMeasure:
419        attrs.sort()
420        attrs.reverse()
421
422    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
423       
424
425def evaluateAttributesNoClass(data, contMeasure, discMeasure):
426    attrs = []
427    for attr in data.domain.attributes:
428        attrs.append((1, attr.name))
429#        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
430#        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
431#        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
432#        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
433
434    if discMeasure or contMeasure:
435        attrs.sort()
436        attrs.reverse()
437
438    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
439
440
441 
442   
443# ##############################################################################################
444# ##############################################################################################
445
446
447# SELECT ATTRIBUTES ##########################
448def selectAttributes(data, attrContOrder, attrDiscOrder, projections = None):
449    if data.domain.classVar == None or data.domain.classVar.varType != orange.VarTypes.Discrete:
450        return ([attr.name for attr in data.domain.attributes], [], 0)
451
452    shown = [data.domain.classVar.name]; hidden = []; maxIndex = 0    # initialize outputs
453
454    # # both are RELIEF
455    if attrContOrder == "ReliefF" and attrDiscOrder == "ReliefF":
456        attrVals = orngFSS.attMeasure(data, orange.MeasureAttribute_relief())
457        s,h = getTopAttrs(attrVals, 0.95)
458        return (shown + s, hidden + h, 0)
459
460    # # both are NONE
461    elif attrContOrder == "None" and attrDiscOrder == "None":
462        for item in data.domain.attributes:    shown.append(item.name)
463        return (shown, hidden, 0)
464
465
466    # disc and cont attribute list
467    discAttrs = []; contAttrs = []
468    for attr in data.domain.attributes:
469        if attr.varType == orange.VarTypes.Continuous: contAttrs.append(attr.name)
470        elif attr.varType == orange.VarTypes.Discrete: discAttrs.append(attr.name)
471       
472
473    ###############################
474    # sort continuous attributes
475    if attrContOrder == "None":
476        shown += contAttrs
477    elif attrContOrder in ["ReliefF", "Fisher discriminant", "Signal to Noise", "Signal to Noise For Each Class"]:
478        if attrContOrder == "ReliefF":               measure = orange.MeasureAttribute_relief(k=10, m=50)
479        elif attrContOrder == "Fisher discriminant": measure = MeasureFisherDiscriminant()
480        elif attrContOrder == "Signal to Noise":     measure = S2NMeasure()
481        else:                                        measure = S2NMeasureMix()
482
483        dataNew = data.select(contAttrs + [data.domain.classVar])
484        attrVals = orngFSS.attMeasure(dataNew, measure)
485        s,h = getTopAttrs(attrVals, 0.95)
486        shown += s
487        hidden += h
488    else:
489        print "Unknown value for attribute order: ", attrContOrder
490
491    # ###############################
492    # sort discrete attributes
493    if attrDiscOrder == "None":
494        shown += discAttrs
495    elif attrDiscOrder == "GainRatio" or attrDiscOrder == "Gini" or attrDiscOrder == "ReliefF":
496        if attrDiscOrder == "GainRatio":   measure = orange.MeasureAttribute_gainRatio()
497        elif attrDiscOrder == "Gini":       measure = orange.MeasureAttribute_gini()
498        else:                               measure = orange.MeasureAttribute_relief()
499
500        dataNew = data.select(discAttrs + [data.domain.classVar])
501        attrVals = orngFSS.attMeasure(dataNew, measure)
502        s,h = getTopAttrs(attrVals, 0.95)
503        shown += s; hidden += h
504
505    elif attrDiscOrder == "Oblivious decision graphs":
506            #shown.append(data.domain.classVar.name)
507            attrs = getFunctionalList(data)
508            for item in attrs:
509                shown.append(item)
510            for attr in data.domain.attributes:
511                if attr.name not in shown and attr.varType == orange.VarTypes.Discrete:
512                    hidden.append(attr.name)
513    else:
514        print "Unknown value for attribute order: ", attrDiscOrder
515
516    return (shown, hidden, maxIndex)
517
518
519def getTopAttrs(results, maxSum = 0.95, onlyPositive = 1):
520    s = []; h = []
521    sum = 0
522    for (attr, val) in results:
523        if not onlyPositive or val > 0: sum += val
524    tempSum = 0   
525    for (attr, val) in results:
526        if tempSum < maxSum*sum: s.append(attr)
527        else: h.append(attr)
528        tempSum += val
529    return (s, h)
530
531
532# ##########################################################################################
533# PARALLEL COORDINATES FUNCTIONS
534# ##########################################################################################
535
536# find interesting attribute order for parallel coordinates
537# attrInfo = [(val1, attr1, attr2), .... ]
538def optimizeAttributeOrder(attrInfo, numberOfAttributes, optimizationDlg, app):
539    while (attrInfo != []):
540        proj = []
541        projVal = []
542        canAddAttribute = 1
543        while canAddAttribute:
544            if not optimizationDlg.canContinueOptimization(): return
545            app.processEvents()        # allow processing of other events
546   
547            if len(proj) == 0:
548                proj = [attrInfo[0][1], attrInfo[0][2]]
549                projVal = [attrInfo[0][0]]
550            elif len(proj) == numberOfAttributes:
551                canAddAttribute = 0     # time to add the projection to the list
552            else:
553                proj, projVal, success = addBestToCurrentProj(proj, projVal, attrInfo)
554                if not success: canAddAttribute = 0 # there are no more attributes that can be added to this projection
555
556        if len(proj) == numberOfAttributes:
557            # I (Ales) commented this out because fixItersectingPairs can enter an endless cycle
558            #proj, projVal = fixIntersectingPairs(proj, projVal, attrInfo)
559            for i in range(len(proj)-1):
560                removeAttributePair(proj[i], proj[i+1], attrInfo)
561            optimizationDlg.addProjection(sum(projVal)/len(projVal), proj)
562        else:
563            for i in range(len(proj)-1):
564                removeAttributePair(proj[i], proj[i+1], attrInfo)
565
566
567# try rotating subsequences of proj to increase value of attribute order
568def fixIntersectingPairs(proj, projVal, attrInfo):
569    changed = 1
570    allprojections = set()
571    while changed:
572        changed = 0
573        for i in range(len(projVal)-1):
574            if changed: continue
575            for j in range(i+2, len(projVal)-1):
576                if changed: continue
577                val1, exists1 = getAttributePairValue(proj[i], proj[j], attrInfo)
578                val2, exists2 = getAttributePairValue(proj[i+1], proj[j+1], attrInfo)
579                if exists1 and exists2 and (val1 + val2 > projVal[i] + projVal[j]):
580                    projVal[i] = val1
581                    projVal[j] = val2
582                    rev = proj[i+1:j+1]
583                    rev.reverse()
584                    tempProj = proj[:i+1] + rev + proj[j+1:]
585                    proj = tempProj
586                    changed = 1     # we rotated the projection. start checking from the begining
587       
588        tproj = tuple(proj) 
589        assert(tproj not in allprojections)
590        allprojections.add(tproj)
591               
592    return proj, projVal
593
594# return value for attribute pair (val, attr1, attr2) if exists. if not, return 0
595def getAttributePairValue(attr1, attr2, attrInfo):
596    for (val, a1, a2) in attrInfo:
597        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1): return (val, 1)
598    return (0, 0)
599
600# remove attribute pair (val, attr1, attr2) from attrInfo
601def removeAttributePair(attr1, attr2, attrInfo):
602    for (val, a1, a2) in attrInfo:
603        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1):
604            attrInfo.remove((val, a1, a2))
605            return
606    print "failed to remove attribute pair", attr1, attr2
607
608
609def addBestToCurrentProj(proj, projVal, attrInfo):
610    for (val, a1, a2) in attrInfo:
611        if (a1 == proj[0] and a2 not in proj) or (a2 == proj[0] and a1 not in proj) or (a1 == proj[-1] and a2 not in proj) or (a2 == proj[-1] and a1 not in proj):
612            if a1 == proj[0]: return ([a2] + proj, [val] + projVal, 1)
613            elif a2 == proj[0]: return ([a1] + proj, [val] + projVal, 1)
614            elif a1 == proj[-1]: return (proj + [a2], projVal + [val], 1)
615            else                       : return (proj + [a1], projVal + [val], 1)
616
617    """
618    for (val, a1, a2) in attrInfo:
619        if a1 in proj and a2 in proj: continue
620        if a1 not in proj and a2 not in proj: continue
621       
622        if (a1 not in proj) and (a2 in proj): placed = a2; place = a1
623        else:                                  placed = a1; place = a2
624           
625        ind = proj.index(placed)
626        if ind > 0:
627            val2, exists = getAttributePairValue(place, proj[ind-1], attrInfo)
628            if exists:
629                proj.insert(ind, place)
630                projVal[ind-1] = val2
631                projVal.insert(ind-1, val)
632                return (proj, projVal, 1)
633        if ind < len(proj)-1:
634            val2, exists = getAttributePairValue(place, proj[ind+1], attrInfo)
635            if exists:
636                proj.insert(ind+1, place)
637                projVal[ind] = val
638                projVal.insert(ind, val2)
639                return (proj, projVal, 1)
640    """
641    return proj, projVal, 0
642
643def computeCorrelationBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
644    correlations = []
645    attrListLen = len(attrList)
646    iterCount = attrListLen * (attrListLen - 1) / 2
647    iter = 0
648    milestones = progressBarMilestones(iterCount)
649    for i in range(len(attrList)):
650        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
651            continue
652        for j in range(i+1, len(attrList)):
653            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
654                continue
655            val = abs(computeCorrelation(data, attrList[i], attrList[j]))
656            if val >= minCorrelation:
657                correlations.append((val, attrList[i], attrList[j]))
658            iter += 1
659            if progressCallback and iter in milestones:
660                progressCallback(100.0 * iter / iterCount)
661               
662    return sorted(correlations, reverse=True)
663
664
665def computeCorrelationInsideClassesBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
666    if not data.domain.classVar or data.domain.classVar.varType == orange.VarTypes.Continuous:
667        return []
668    correlations = []
669    attrListLen = len(attrList)
670    iterCount = attrListLen * (attrListLen - 1) / 2
671    iter = 0
672    milestones = progressBarMilestones(iterCount)
673    for i in range(len(attrList)):
674        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
675            continue
676        for j in range(i+1, len(attrList)):
677            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
678                continue
679            corr, corrs, lengths = computeCorrelationInsideClasses(data, attrList[i], attrList[j])
680            if corr >= minCorrelation:
681                correlations.append((corr, attrList[i], attrList[j]))
682            iter += 1
683            if progressCallback and iter in milestones:
684                progressCallback(100.0 * iter / iterCount)
685               
686    return sorted(correlations, reverse=True)
687
688
689def computeCorrelationInsideClasses(data, attr1, attr2):
690    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
691    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
692
693    table = data.select([attr1, attr2, data.domain.classVar])
694    table = orange.Preprocessor_dropMissing(table)
695    lengths = []; corrs = []
696    for val in table.domain.classVar.values:
697        tab = table.filter({table.domain.classVar: val})
698        a1 = [tab[k][attr1].value for k in range(len(tab))]
699        a2 = [tab[k][attr2].value for k in range(len(tab))]
700        if len(a1) == 0: continue
701        val, prob = statc.pearsonr(a1, a2)
702        lengths.append(len(a1))
703        corrs.append(val)
704    corr = 0
705    for ind in range(len(corrs)): corr += abs(corrs[ind])*lengths[ind]
706    corr /= sum(lengths)
707    return corr, corrs, lengths
708
709# compute correlation between two continuous attributes
710def computeCorrelation(data, attr1, attr2):
711    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
712    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
713
714    table = data.select([attr1, attr2])
715    table = orange.Preprocessor_dropMissing(table)
716    a1 = [table[k][attr1].value for k in range(len(table))]
717    a2 = [table[k][attr2].value for k in range(len(table))]
718   
719    try:
720        val, prob = statc.pearsonr(a1, a2)
721    except:
722        val = 0.0    # possibly invalid a1 or a2
723       
724    return val
725
726# ##########################################################################################
727# ##########################################################################################
728
729
730# ##########################################################################################
731# #### FUNCTIONS FOR CALCULATING ATTRIBUTE ORDER USING Oblivious decision graphs
732# ##########################################################################################
733def replaceAttributes(index1, index2, merged, data):
734    attrs = list(data.domain)
735    attrs.remove(data.domain[index1])
736    attrs.remove(data.domain[index2])
737    domain = orange.Domain(attrs+ [merged])
738    return data.select(domain)
739
740
741def getFunctionalList(data):
742    import orngCI
743   
744    bestQual = -10000000
745    bestAttr = -1
746    testAttrs = []
747
748    dataShort = orange.Preprocessor_dropMissing(data)
749    # remove continuous attributes from data
750    disc = []
751    for i in range(len(dataShort.domain.attributes)):
752        # keep only discrete attributes that have more than one value
753        if dataShort.domain.attributes[i].varType == orange.VarTypes.Discrete and len(dataShort.domain.attributes[i].values) > 1: disc.append(dataShort.domain.attributes[i].name)
754    if disc == []: return []
755    discData = dataShort.select(disc + [dataShort.domain.classVar.name])
756
757    remover = orngCI.AttributeRedundanciesRemover(noMinimization = 1)
758    newData = remover(discData, weight = 0)
759
760    for attr in newData.domain.attributes: testAttrs.append(attr.name)
761
762    # compute the best attribute combination
763    for i in range(len(newData.domain.attributes)):
764        vals, qual = orngCI.FeatureByMinComplexity(newData, [newData.domain.attributes[i], newData.domain.classVar])
765        if qual > bestQual:
766            bestQual = qual
767            bestAttr = newData.domain.attributes[i].name
768            mergedVals = vals
769            mergedVals.name = newData.domain.classVar.name
770
771    if bestAttr == -1: return []
772    outList = [bestAttr]
773    newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
774    testAttrs.remove(bestAttr)
775   
776    while (testAttrs != []):
777        bestQual = -10000000
778        for attrName in testAttrs:
779            vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, attrName])
780            if qual > bestQual:
781                bestqual = qual
782                bestAttr = attrName
783
784        vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, bestAttr])
785        mergedVals = vals
786        mergedVals.name = newData.domain.classVar.name
787        newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
788        outList.append(bestAttr)
789        testAttrs.remove(bestAttr)
790
791    # new attributes have "'" at the end of their names. we have to remove that in ored to identify them in the old domain
792    for index in range(len(outList)):
793        if outList[index][-1] == "'": outList[index] = outList[index][:-1]
794    return outList
795
796
797# ##########################################################################################
798# POLYVIZ FUNCTIONS USED IN VIZRANK.
799# ##########################################################################################
800
801# input: array where items are arrays, e.g.: [[1,2,3], [1,3,2], [2,1,3], [2,3,1], [3,1,2], [3,2,1]]
802# returns: array, where some items are removed. it rotates items in each array and checks if the item is already in the array. it also tries to flip the items ([::-1])
803# for the above case, it would return [[1,2,3]]
804# used for radviz
805def removeRotationDuplicates(arr, removeFlipDuplicates = 1):
806    final = []
807    projDict = {}
808    for a in arr:
809        if type(a[0]) == int:
810            key = tuple([a[i]-a[i+1] for i in range(-1, len(a)-1)])
811            rkey = tuple([a[i+1]-a[i] for i in range(-1, len(a)-1)])
812            o = min(a)      # offset. needed in cases when we get in arr: [0,1,2] and [1,2,3]. otherwise it would remove the second element because it would look the same as the first
813        else:
814            key = tuple([(a[i][0]-a[i+1][0], a[i][1]-a[i+1][1]) for i in range(-1, len(a)-1)])
815            rkey = tuple([(a[i+1][0]-a[i][0], a[i+1][1]-a[i][1]) for i in range(-1, len(a)-1)])
816            o = 0
817
818        rotations = [key[i:] + key[:i] for i in range(len(key))]
819        if 1 in [projDict.has_key((o,key)) for key in rotations]:
820            continue
821
822        rotations = [rkey[i:] + rkey[:i] for i in range(len(rkey))]
823        if removeFlipDuplicates and 1 in [projDict.has_key((o,key)) for key in rotations]:
824            continue
825
826        projDict[(o,key)] = a   
827
828        """
829        found = 0
830        kkey = copy.copy(key)
831       
832        for i in range(len(kkey)):
833            kkey = tuple(list(kkey[1:]) + [kkey[0]])
834            if projDict.has_key((o,kkey)):
835                found = 1
836                break
837
838        # try also if there is a flipped duplicate
839        if not found and removeFlipDuplicates:
840            for i in range(len(rkey)):
841                rkey = tuple(list(rkey[1:]) + [rkey[0]])
842                if projDict.has_key((o,rkey)):
843                    found = 1
844                    break
845        if not found: projDict[(o,key)] = a
846        """
847    return projDict.values()
848       
849# create possible combinations with the given set of numbers in arr
850def createMixCombinations(arrs, removeFlipDuplicates):
851
852    def addProjs(projs, count, i):
853        ret = []
854        perms = permutations(range(count))
855        for perm in perms:
856            c = copy.copy(projs)
857            add = [(i, p) for p in perm]
858            c = [p + add for p in c]
859            ret += c
860        return ret
861
862    projs = [[]]
863    for i in range(len(arrs)):
864        projs = addProjs(projs, arrs[i], i)
865    return removeRotationDuplicates(projs, removeFlipDuplicates)
866
867
868# get a list of possible projections if we have numClasses and want to use maxProjLen attributes
869# removeFlipDuplicates tries to flip the attributes in the projection and removes it if the projection already exists
870# removeFlipDuplicates = 1 for radviz and =0 for polyviz
871def createProjections(numClasses, maxProjLen, removeFlipDuplicates = 1):
872    if _projectionListDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
873        return _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)]
874
875    # create array of arrays of lengths, e.g. [3,3,2] that will tell that we want 3 attrs from the 1st class, 3 from 2nd and 2 from 3rd
876    if maxProjLen % numClasses != 0:
877        cs = combinations(range(numClasses), maxProjLen % numClasses)
878        equal = [int(maxProjLen / numClasses) for i in range(numClasses)]
879        lens = [copy.copy(equal) for comb in cs]     # make array of arrays
880        for i, comb in enumerate(cs):
881            for val in comb: lens[i][val] += 1
882    else:
883        lens = [[int(maxProjLen / numClasses) for i in range(numClasses)]]
884       
885    combs = []
886    seen = []
887    for l in lens:
888        withoutZeros = filter(None, l)          # if numClasses > maxProjLen then some values in l are 0. we have to remove this zeros and check if we have already added such combinations.
889        if withoutZeros not in seen:
890            tempCombs = createMixCombinations(withoutZeros, removeFlipDuplicates)
891            combs += tempCombs
892            seen.append(withoutZeros)
893
894    if _differentClassPermutationsDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
895        perms = _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)]
896    else:
897        if numClasses <= maxProjLen: perms = permutations(range(numClasses))
898        else:                        perms = combinations(range(numClasses), maxProjLen)
899        perms = removeRotationDuplicates(perms, removeFlipDuplicates)
900        _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)] = perms
901           
902    final = []
903    for perm in perms:
904        final += [[(perm[i], j) for (i,j) in comb] for comb in combs]
905
906    _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)] = final
907    return final
908
909
910if __name__=="__main__":
911    """
912    print "possible splits of ['a','b','c'] are ", getPossibleSplits(["a","b","c"])
913    print "possible combinations of 2 elements of the array [1,2,3,4] are ", combinations([1,2,3,4],2)
914    print "permutations of [1,2,3] are ", [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]
915    print "array with two randomly switched elements of [1,2,3,4,5] is", switchTwoElements([1,2,3,4,5])
916
917    data = orange.ExampleTable(r"E:\Development\Python23\Lib\site-packages\Orange\Datasets\microarray\cancer diagnostics\leukemia_tran.tab")
918    a = data.toNumpy("ac")[0]
919    c = S2NMeasure()
920    c(data.domain.attributes[0].name, data)
921    """
922    final = createProjections(8,4)
923    print final
Note: See TracBrowser for help on using the repository browser.