source: orange/Orange/misc/visfuncts.py @ 9706:19e411118955

Revision 9706:19e411118955, 38.9 KB checked in by anze <anze.staric@…>, 2 years ago (diff)

Copied orngVisFuncts and fixed dependencies.

Line 
1import orange, statc, random
2from Orange.feature import scoring
3from Orange.misc import progress_bar_milestones
4import copy
5from math import ceil
6##import numpy
7##from LinearAlgebra import *
8
9_differentClassPermutationsDict = {}
10_projectionListDict = {}
11
12
13# ##########################################################################################
14# MOSAIC OPTIMIZATION FUNCTION
15# ##########################################################################################
16
17_possibleSplits = {}
18_possibleSplits[1] = [[[0]]]
19_possibleSplits[2] = [[[0], [1]]]
20_possibleSplits[3] = [[[0], [1,2]], [[0,1],[2]], [[0,2], [1]], [[0], [1], [2]]]
21_possibleSplits[4] = [[[0], [1,2,3]], [[1], [0,2,3]], [[2], [0,1,3]], [[3], [0,1,2]],
22    [[0,1], [2,3]], [[0,2], [1,3]], [[0,3], [1,2]], [[0,1,2], [3]], [[0,1,3], [2]], [[0,2,3], [1]], [[1,2,3], [0]],
23    [[0], [1], [2,3]], [[0], [2], [1,3]], [[0], [3], [1,2]], [[1], [2], [0,3]], [[1], [3], [0,2]], [[2], [3], [0,1]], [[0], [1], [2], [3]]]
24
25# get possible splits of attributes in attrs into subgroups , e.g.: ["a","b","c"] -> [[['a'], ['b', 'c']], [['a', 'b'], ['c']], [['a', 'c'], ['b']], [['a'], ['b'], ['c']]]
26def getPossibleSplits(attrs):
27    if len(attrs) > 4: return []
28    retSplit = []
29    for split in _possibleSplits[len(attrs)]:
30        parts = []
31        for arr in split:
32            a = [attrs[i] for i in arr]
33            parts.append(a)
34        retSplit.append(parts)
35    return retSplit
36
37# ##########################################################################################
38# MISC FUNCTIONS
39# ##########################################################################################
40
41# take a number and return a formated string, eg: 2341232 -> "2,341,232"
42def createStringFromNumber(num):
43    s = str(num)
44    arr = range(len(s)-2)[:0:-3]
45    for i in arr:
46        s = s[:i] + "," + s[i:]
47    return s
48
49
50# factoriela
51def fact(i):
52    ret = 1
53    for j in range(2, i+1): ret*= j
54    return ret
55
56# get a sublist of permutations of list. Return only those permutations that cannot be attained by rotating one other permutation.
57# used in radviz where we have to try different placements of attributes
58def generateDifferentPermutations(list):
59    if len(list) == 0: return []
60    temp = permutations(list[:-1])
61    newPerms = {}
62    for p in temp:
63        if not newPerms.has_key(tuple(p[::-1])):
64            newPerms[tuple(p)] = p
65    lastVal = list[-1]
66    return [p + [lastVal] for p in newPerms.values()]
67
68
69# compute permutations of elements in list
70def permutations(list):
71    if list==[]:
72        return [[]]
73    else:
74        return [ [list[i]] + p for i in range(len(list)) for p in permutations(list[:i] + list[i+1:])]
75
76# return number of combinations where we select "select" from "total"
77def combinationsCount(select, total):
78    return fact(total)/ (fact(total-select)*fact(select))
79
80# get the actual combinations of items
81def combinations(items, count):
82    if count > len(items): return []
83    answer = []
84    indices = range(count)
85    if indices == []: return [[]]   # if count = 0 return empty list
86
87    indices[-1] = indices[-1] - 1
88    while 1:
89        limit = len(items) - 1
90        i = count - 1
91        while i >= 0 and indices[i] == limit:
92            i = i - 1
93            limit = limit - 1
94        if i < 0: break
95
96        val = indices[i]
97        for i in xrange( i, count ):
98            val = val + 1
99            indices[i] = val
100        temp = []
101        for i in indices:
102            temp.append( items[i] )
103        answer.append( temp )
104    return answer
105
106# ##########################################################################################
107# VIZRANK FUNCTIONS
108# ##########################################################################################
109
110# randomly switch two elements in a list. repeat nrOfTimes.
111def switchTwoElements(list, nrOfTimes = 1):
112    if len(list) < 2: return list
113    for i in range(nrOfTimes):
114        index1 = random.randint(0, len(list)-1)
115        index2 = random.randint(0, len(list)-1)
116        if index1 == index2:
117            index2 = (index1 + 1) % len(list)
118        exVal = list[index1]
119        list[index1] = list[index2]
120        list[index2] = exVal
121    return list
122
123# give a list of elements (e.g. [1,2,3,4,5,6,7]) and number of groups/classes (e.g. 2) and the number of switches you want to perform
124# we assume that the the first len(list)/groupCount elements belong to one group, the second to second group and so on. We only want
125# to switch elements inside a group, not elements between groups
126def switchTwoElementsInGroups(list, groupCount, nrOfTimes = 1):
127    # find group boundaries
128    count = len(list)
129    boundaries = []
130    index = 0
131    while groupCount > 0:
132        boundaries.append(int(ceil(count/float(groupCount))))
133        groupCount -= 1
134        count -= boundaries[-1]
135    currSum = 0
136    for i in range(len(boundaries)):
137        boundaries[i] = boundaries[i] + currSum
138        currSum = boundaries[i]
139
140    parts = []
141    boundaries = [0] + boundaries
142    for i in range(len(boundaries)-1):
143        parts.append(list[boundaries[i] : boundaries[i+1]])
144
145    # take parts, and for each part switch a few elements
146    retList = []
147    while parts != []:
148        count = int(ceil(nrOfTimes/float(len(parts))))
149        nrOfTimes -= count
150        part = parts.pop(random.randint(0, len(parts)-1))
151        retList += switchTwoElements(part, count)
152
153    return retList
154
155
156
157# ##########################################################################################
158# MEASURES FOR EVALUATING INTERESTINGNESS OF ATTRIBUTES
159# ##########################################################################################
160
161# class that measures correlation between continuous class value and continuous attribute
162class MeasureCorrelation:
163    def __init__(self):
164        pass
165
166    def __call__(self, attr, data):
167        return self.MeasureAttribute_info(attr, data)
168
169    def MeasureAttribute_info(self, attr, data):
170        table = data.select([attr, data.domain.classVar])
171        table = orange.Preprocessor_dropMissing(table)
172        a1 = [table[k][0].value for k in range(len(table))]
173        a2 = [table[k][1].value for k in range(len(table))]
174
175        val, prob = statc.pearsonr(a1, a2)
176        return val
177
178
179# fisher discriminant implemented to be used as orange.MeasureAttribute
180class MeasureFisherDiscriminant:
181    def __init__(self):
182        self.dataset = None
183        self.attrInfo = {}
184        self.stats = []
185
186    def __call__(self, attr, data):
187        return self.MeasureAttribute_info(attr, data)
188
189    def MeasureAttribute_info(self, attr, data):
190        # if basic statistics is not computed for this dataset -> compute it
191        if not (self.stats and self.dataset == data):
192            self.stats = {}
193            self.dataset = data
194
195            arr = [0] * len(data.domain.attributes)
196            for val in data.domain.classVar.values:
197                data2 = data.select({data.domain.classVar: val})
198                bas = orange.DomainBasicAttrStat(data2)
199                self.stats[val] = bas
200
201            for i in range(len(self.stats.keys())):
202                statI = self.stats[self.stats.keys()[i]]
203                if len(statI) == 0: continue
204                for j in range(i+1, len(self.stats.keys())):
205                    statJ = self.stats[self.stats.keys()[j]]
206                    if len(statJ) == 0: continue
207                    for attribute in range(len(data.domain.attributes)):
208                        if data.domain.attributes[attribute].varType != orange.VarTypes.Continuous: continue
209                        bottom = (statI[attribute].n * statI[attribute].dev + statJ[attribute].n * statJ[attribute].dev)
210                        if bottom == 0.0: bottom = 0.001
211                        val = abs(statI[attribute].avg - statJ[attribute].avg) * (statI[attribute].n + statJ[attribute].n)/bottom
212                        arr[attribute] += val
213
214            # normalize values in arr so that the largest value will be 1 and others will be proportionally smaller
215            largest = max(arr)
216            if largest != 0:
217                arr = [val/largest for val in arr]
218
219            for i in range(len(data.domain.attributes)):
220                self.attrInfo[data.domain.attributes[i].name] = arr[i]
221
222        return self.attrInfo[data.domain[attr].name]
223
224
225# signal to noise measure implemented as orange.MeasureAttribute
226class S2NMeasure:
227    def __init__(self):
228        self.attrInfo = {}
229        self.data = None
230
231    def __call__(self, attr, data):
232        # if the data changed clear the attribute values
233        if data != self.data:
234            self.attrInfo = {}
235            self.data = data
236
237        if self.attrInfo == {}:
238            classVar = data.domain.classVar
239            datas = [data.select({data.domain.classVar.name: [val]}) for val in data.domain.classVar.values]
240            stats = [orange.DomainBasicAttrStat(d) for d in datas]
241            cls = range(len(stats))
242            clsCount = len(stats)
243            for i in range(len(stats[0])):
244                if stats[0][i] == None: continue
245                temp = 0.0
246                for j in cls:
247                    for k in range(j+1, clsCount):
248                        if (stats[j][i].dev + stats[k][i].dev) > 0:
249                            temp += abs((stats[j][i].avg - stats[k][i].avg) / (stats[j][i].dev + stats[k][i].dev))
250                self.attrInfo[data.domain.attributes[i].name] = temp
251
252        if self.attrInfo.has_key(data.domain[attr].name):
253            return self.attrInfo[data.domain[attr].name]
254        else:
255            return -1
256
257
258# same class as above, just that we can use it to evaluate attribute for each class value
259class S2NMeasureMix(S2NMeasure):
260    def __init__(self):
261        S2NMeasure.__init__(self)
262        self.attrInfoMix = {}
263        self.dataMix = None
264        self.sortedAttrList = []
265
266    def __call__(self, attribute, data):
267
268        # if the data changed clear the attribute values
269        if data != self.dataMix:
270            self.attrInfoMix = {}
271            self.attrInfo = {}
272            self.dataMix = data
273
274        if self.attrInfoMix == {}:
275            attrs = range(len(data.domain.attributes))
276            classVar = data.domain.classVar
277            #shortData = data.select(attrs + [classVar])
278            datas = [data.select({classVar.name: [val]}) for val in classVar.values]
279            statistics = [orange.DomainBasicAttrStat(d) for d in datas]
280
281            cls = []
282            for classVarIndex, c in enumerate(classVar.values):   # for each class value compute how good is each attribute for discriminating this class value against all other
283                attrValsList = []
284                newData = mergeClassValues(data, c)
285                for attrIndex in range(len(attrs)):
286                    if data.domain[attrIndex].varType == orange.VarTypes.Discrete:      # ignore discrete attributes
287                        continue
288                    val = S2NMeasure.__call__(self, attrs[attrIndex], newData)
289                    if statistics[0][attrIndex] == None:
290                        attrValsList.append((0,attrs[attrIndex]))
291                    else:
292                        aves = [stat[attrIndex].avg for stat in statistics]
293                        if max(aves) != aves[classVarIndex] :
294                            val = -val
295                        attrValsList.append((val, attrs[attrIndex]))
296                attrValsList.sort()
297                attrValsList = [element[1] for element in attrValsList]     # remove the value
298                attrValsList.reverse()
299                cls.append(attrValsList)
300
301            attrPositionsDict = dict([(attr, []) for attr in cls[0]])
302            for arr in cls:
303                for i in range(len(arr)):
304                    attrPositionsDict[arr[i]].append(i)
305
306            numClasses = len(classVar.values)
307            currPos = [0 for i in range(numClasses)]
308            self.sortedAttrList = []
309            ableToAdd = 1
310            while ableToAdd:    # sometimes some attributes are duplicated. in such cases we will add only one instance of such attribute to the list
311                ableToAdd = 0
312                for i in range(numClasses):
313                    pos = currPos[i]
314                    while pos < len(cls[i]) and cls[i][pos] == None:
315                        pos += 1
316                    currPos[i] = pos + 1
317                    if pos >= len(cls[i]):
318                        continue
319                    ableToAdd = 1
320
321                    attr = cls[i][pos]
322                    self.sortedAttrList.append(attr)
323                    attrPositions = attrPositionsDict[attr]     # get indices in cls where attribute attr is placed
324                    for j in range(numClasses):
325                        cls[j][attrPositions[j]] = None
326
327            count = len(self.sortedAttrList)
328            for (i, attr) in enumerate(self.sortedAttrList):
329                self.attrInfoMix[data.domain[attr].name] = count-i
330
331        if self.attrInfoMix.has_key(data.domain[attribute].name):
332            return self.attrInfoMix[data.domain[attribute].name]
333        else:
334            return -1
335
336# ##########################################################################################
337# ##########################################################################################
338
339def mergeClassValues(data, value):
340    selection = orange.EnumVariable("Selection", values = ["0", "1"])
341
342    selectedClassesStr = [value]
343    nonSelectedClassesStr = []
344    for val in data.domain.classVar.values:
345        if val not in selectedClassesStr: nonSelectedClassesStr.append(val)
346
347    shortData1 = data.select({data.domain.classVar.name: selectedClassesStr})
348    shortData2 = data.select({data.domain.classVar.name: nonSelectedClassesStr})
349    d1 = orange.Domain(shortData1.domain.attributes + [selection])
350    selection.getValueFrom = lambda ex, what: orange.Value(selection, "0")
351    data1 = orange.ExampleTable(d1, shortData1)
352
353    selection.getValueFrom = lambda ex, what: orange.Value(selection, "1")
354    data2 = orange.ExampleTable(d1, shortData2)
355    data1.extend(data2)
356    return data1
357
358
359# almost equal to evaluateAttributesByEachClassValue function with one exception. if the class value that we want to discriminate
360# has a lower average value than the other average class values then we multiply quality of this attribute with -1
361# this way we get the attributes that have the highes expression and are also good at discrimination
362def findAttributeGroupsForRadviz(data, measure):
363    if isinstance(measure, S2NMeasureMix):
364        measure(data.domain.attributes[0].name, data)                       # just call measure to compute quality of all attributes
365        attrNames = [data.domain[ind].name for ind in measure.sortedAttrList]
366        numClasses = len(data.domain.classVar.values)
367        cls = [attrNames[i::numClasses] for i in range(numClasses)]
368    else:
369        attrVals = [(measure(attr.name, data), attr.name) for attr in data.domain.attributes]
370        attrVals.sort()
371        attrVals.reverse()
372        attrNames = [attrVals[i][1] for i in range(len(attrVals))]  # remove quality values of the attributes
373        cls = None
374
375    return attrNames, cls
376
377
378# for each class value test how good does the measure discriminate between this class value and all the other values merged
379# return a list of lists. Each list contains tuples (val, attr), where val is the attribute quality for attr at separating one class value from the others
380def evaluateAttributesByEachClassValue(data, measure, attrs):
381    cls = []
382    for c in data.domain.classVar.values:   # for each class value compute how good is each attribute for discriminating this class value against all other
383        v = []
384        data2 = mergeClassValues(data, c)
385        for attr in attrs:
386            v.append((measure(attr, data2), attr))
387        v.sort()
388        cls.append(v)
389    return cls
390
391# used by VizRank to evaluate attributes
392def evaluateAttributesDiscClass(data, contMeasure, discMeasure):
393    attrs = []
394    #corr = MeasureCorrelation()
395    for attr in data.domain.attributes:
396        #if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((corr(attr.name, data), attr.name))
397        if data.domain.classVar.varType == orange.VarTypes.Continuous and attr.varType == orange.VarTypes.Continuous: attrs.append((1, attr.name))
398        elif data.domain.classVar.varType == orange.VarTypes.Continuous:            attrs.append((0.0, attr.name))
399        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
400        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
401        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
402        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
403
404    if discMeasure or contMeasure:
405        attrs.sort()
406        attrs.reverse()
407
408    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
409
410def evaluateAttributesContClass(data, contMeasure, discMeasure):
411    attrs = []
412    for attr in data.domain.attributes:
413        attrs.append((1, attr.name))
414    #        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
415    #        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
416    #        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
417    #        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
418
419    if discMeasure or contMeasure:
420        attrs.sort()
421        attrs.reverse()
422
423    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
424
425
426def evaluateAttributesNoClass(data, contMeasure, discMeasure):
427    attrs = []
428    for attr in data.domain.attributes:
429        attrs.append((1, attr.name))
430    #        elif discMeasure == None and attr.varType == orange.VarTypes.Discrete:      attrs.append((0.0, attr.name))
431    #        elif contMeasure == None and attr.varType == orange.VarTypes.Continuous:    attrs.append((0.0, attr.name))
432    #        elif attr.varType == orange.VarTypes.Continuous:                            attrs.append((contMeasure(attr.name, data), attr.name))
433    #        else:                                                                       attrs.append((discMeasure(attr.name, data), attr.name))
434
435    if discMeasure or contMeasure:
436        attrs.sort()
437        attrs.reverse()
438
439    return [attr for (val, attr) in attrs]  # return only the ordered list of attributes and not also their values
440
441
442
443
444# ##############################################################################################
445# ##############################################################################################
446
447
448# SELECT ATTRIBUTES ##########################
449def selectAttributes(data, attrContOrder, attrDiscOrder, projections = None):
450    if data.domain.classVar == None or data.domain.classVar.varType != orange.VarTypes.Discrete:
451        return ([attr.name for attr in data.domain.attributes], [], 0)
452
453    shown = [data.domain.classVar.name]; hidden = []; maxIndex = 0    # initialize outputs
454
455    # # both are RELIEF
456    if attrContOrder == "ReliefF" and attrDiscOrder == "ReliefF":
457        attrVals = scoring.score_all(data, orange.MeasureAttribute_relief())
458        s,h = getTopAttrs(attrVals, 0.95)
459        return (shown + s, hidden + h, 0)
460
461    # # both are NONE
462    elif attrContOrder == "None" and attrDiscOrder == "None":
463        for item in data.domain.attributes:    shown.append(item.name)
464        return (shown, hidden, 0)
465
466
467    # disc and cont attribute list
468    discAttrs = []; contAttrs = []
469    for attr in data.domain.attributes:
470        if attr.varType == orange.VarTypes.Continuous: contAttrs.append(attr.name)
471        elif attr.varType == orange.VarTypes.Discrete: discAttrs.append(attr.name)
472
473
474    ###############################
475    # sort continuous attributes
476    if attrContOrder == "None":
477        shown += contAttrs
478    elif attrContOrder in ["ReliefF", "Fisher discriminant", "Signal to Noise", "Signal to Noise For Each Class"]:
479        if attrContOrder == "ReliefF":               measure = orange.MeasureAttribute_relief(k=10, m=50)
480        elif attrContOrder == "Fisher discriminant": measure = MeasureFisherDiscriminant()
481        elif attrContOrder == "Signal to Noise":     measure = S2NMeasure()
482        else:                                        measure = S2NMeasureMix()
483
484        dataNew = data.select(contAttrs + [data.domain.classVar])
485        attrVals = scoring.score_all(dataNew, measure)
486        s,h = getTopAttrs(attrVals, 0.95)
487        shown += s
488        hidden += h
489    else:
490        print "Unknown value for attribute order: ", attrContOrder
491
492    # ###############################
493    # sort discrete attributes
494    if attrDiscOrder == "None":
495        shown += discAttrs
496    elif attrDiscOrder == "GainRatio" or attrDiscOrder == "Gini" or attrDiscOrder == "ReliefF":
497        if attrDiscOrder == "GainRatio":   measure = orange.MeasureAttribute_gainRatio()
498        elif attrDiscOrder == "Gini":       measure = orange.MeasureAttribute_gini()
499        else:                               measure = orange.MeasureAttribute_relief()
500
501        dataNew = data.select(discAttrs + [data.domain.classVar])
502        attrVals = scoring.score_all(dataNew, measure)
503        s,h = getTopAttrs(attrVals, 0.95)
504        shown += s; hidden += h
505
506    elif attrDiscOrder == "Oblivious decision graphs":
507        #shown.append(data.domain.classVar.name)
508        attrs = getFunctionalList(data)
509        for item in attrs:
510            shown.append(item)
511        for attr in data.domain.attributes:
512            if attr.name not in shown and attr.varType == orange.VarTypes.Discrete:
513                hidden.append(attr.name)
514    else:
515        print "Unknown value for attribute order: ", attrDiscOrder
516
517    return (shown, hidden, maxIndex)
518
519
520def getTopAttrs(results, maxSum = 0.95, onlyPositive = 1):
521    s = []; h = []
522    sum = 0
523    for (attr, val) in results:
524        if not onlyPositive or val > 0: sum += val
525    tempSum = 0
526    for (attr, val) in results:
527        if tempSum < maxSum*sum: s.append(attr)
528        else: h.append(attr)
529        tempSum += val
530    return (s, h)
531
532
533# ##########################################################################################
534# PARALLEL COORDINATES FUNCTIONS
535# ##########################################################################################
536
537# find interesting attribute order for parallel coordinates
538# attrInfo = [(val1, attr1, attr2), .... ]
539def optimizeAttributeOrder(attrInfo, numberOfAttributes, optimizationDlg, app):
540    while (attrInfo != []):
541        proj = []
542        projVal = []
543        canAddAttribute = 1
544        while canAddAttribute:
545            if not optimizationDlg.canContinueOptimization(): return
546            app.processEvents()        # allow processing of other events
547
548            if len(proj) == 0:
549                proj = [attrInfo[0][1], attrInfo[0][2]]
550                projVal = [attrInfo[0][0]]
551            elif len(proj) == numberOfAttributes:
552                canAddAttribute = 0     # time to add the projection to the list
553            else:
554                proj, projVal, success = addBestToCurrentProj(proj, projVal, attrInfo)
555                if not success: canAddAttribute = 0 # there are no more attributes that can be added to this projection
556
557        if len(proj) == numberOfAttributes:
558            # I (Ales) commented this out because fixItersectingPairs can enter an endless cycle
559            #proj, projVal = fixIntersectingPairs(proj, projVal, attrInfo)
560            for i in range(len(proj)-1):
561                removeAttributePair(proj[i], proj[i+1], attrInfo)
562            optimizationDlg.addProjection(sum(projVal)/len(projVal), proj)
563        else:
564            for i in range(len(proj)-1):
565                removeAttributePair(proj[i], proj[i+1], attrInfo)
566
567
568# try rotating subsequences of proj to increase value of attribute order
569def fixIntersectingPairs(proj, projVal, attrInfo):
570    changed = 1
571    allprojections = set()
572    while changed:
573        changed = 0
574        for i in range(len(projVal)-1):
575            if changed: continue
576            for j in range(i+2, len(projVal)-1):
577                if changed: continue
578                val1, exists1 = getAttributePairValue(proj[i], proj[j], attrInfo)
579                val2, exists2 = getAttributePairValue(proj[i+1], proj[j+1], attrInfo)
580                if exists1 and exists2 and (val1 + val2 > projVal[i] + projVal[j]):
581                    projVal[i] = val1
582                    projVal[j] = val2
583                    rev = proj[i+1:j+1]
584                    rev.reverse()
585                    tempProj = proj[:i+1] + rev + proj[j+1:]
586                    proj = tempProj
587                    changed = 1     # we rotated the projection. start checking from the begining
588
589        tproj = tuple(proj)
590        assert(tproj not in allprojections)
591        allprojections.add(tproj)
592
593    return proj, projVal
594
595# return value for attribute pair (val, attr1, attr2) if exists. if not, return 0
596def getAttributePairValue(attr1, attr2, attrInfo):
597    for (val, a1, a2) in attrInfo:
598        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1): return (val, 1)
599    return (0, 0)
600
601# remove attribute pair (val, attr1, attr2) from attrInfo
602def removeAttributePair(attr1, attr2, attrInfo):
603    for (val, a1, a2) in attrInfo:
604        if (attr1 == a1 and attr2 == a2) or (attr1 == a2 and attr2 == a1):
605            attrInfo.remove((val, a1, a2))
606            return
607    print "failed to remove attribute pair", attr1, attr2
608
609
610def addBestToCurrentProj(proj, projVal, attrInfo):
611    for (val, a1, a2) in attrInfo:
612        if (a1 == proj[0] and a2 not in proj) or (a2 == proj[0] and a1 not in proj) or (a1 == proj[-1] and a2 not in proj) or (a2 == proj[-1] and a1 not in proj):
613            if a1 == proj[0]: return ([a2] + proj, [val] + projVal, 1)
614            elif a2 == proj[0]: return ([a1] + proj, [val] + projVal, 1)
615            elif a1 == proj[-1]: return (proj + [a2], projVal + [val], 1)
616            else                       : return (proj + [a1], projVal + [val], 1)
617
618    """
619    for (val, a1, a2) in attrInfo:
620        if a1 in proj and a2 in proj: continue
621        if a1 not in proj and a2 not in proj: continue
622       
623        if (a1 not in proj) and (a2 in proj): placed = a2; place = a1
624        else:                                  placed = a1; place = a2
625           
626        ind = proj.index(placed)
627        if ind > 0:
628            val2, exists = getAttributePairValue(place, proj[ind-1], attrInfo)
629            if exists:
630                proj.insert(ind, place)
631                projVal[ind-1] = val2
632                projVal.insert(ind-1, val)
633                return (proj, projVal, 1)
634        if ind < len(proj)-1:
635            val2, exists = getAttributePairValue(place, proj[ind+1], attrInfo)
636            if exists:
637                proj.insert(ind+1, place)
638                projVal[ind] = val
639                projVal.insert(ind, val2)
640                return (proj, projVal, 1)
641    """
642    return proj, projVal, 0
643
644def computeCorrelationBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
645    correlations = []
646    attrListLen = len(attrList)
647    iterCount = attrListLen * (attrListLen - 1) / 2
648    iter = 0
649    milestones = progress_bar_milestones(iterCount)
650    for i in range(len(attrList)):
651        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
652            continue
653        for j in range(i+1, len(attrList)):
654            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
655                continue
656            val = abs(computeCorrelation(data, attrList[i], attrList[j]))
657            if val >= minCorrelation:
658                correlations.append((val, attrList[i], attrList[j]))
659            iter += 1
660            if progressCallback and iter in milestones:
661                progressCallback(100.0 * iter / iterCount)
662
663    return sorted(correlations, reverse=True)
664
665
666def computeCorrelationInsideClassesBetweenAttributes(data, attrList, minCorrelation = 0.0, progressCallback=None):
667    if not data.domain.classVar or data.domain.classVar.varType == orange.VarTypes.Continuous:
668        return []
669    correlations = []
670    attrListLen = len(attrList)
671    iterCount = attrListLen * (attrListLen - 1) / 2
672    iter = 0
673    milestones = progress_bar_milestones(iterCount)
674    for i in range(len(attrList)):
675        if data.domain.attributes[i].varType != orange.VarTypes.Continuous:
676            continue
677        for j in range(i+1, len(attrList)):
678            if data.domain.attributes[j].varType != orange.VarTypes.Continuous:
679                continue
680            corr, corrs, lengths = computeCorrelationInsideClasses(data, attrList[i], attrList[j])
681            if corr >= minCorrelation:
682                correlations.append((corr, attrList[i], attrList[j]))
683            iter += 1
684            if progressCallback and iter in milestones:
685                progressCallback(100.0 * iter / iterCount)
686
687    return sorted(correlations, reverse=True)
688
689
690def computeCorrelationInsideClasses(data, attr1, attr2):
691    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
692    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
693
694    table = data.select([attr1, attr2, data.domain.classVar])
695    table = orange.Preprocessor_dropMissing(table)
696    lengths = []; corrs = []
697    for val in table.domain.classVar.values:
698        tab = table.filter({table.domain.classVar: val})
699        a1 = [tab[k][attr1].value for k in range(len(tab))]
700        a2 = [tab[k][attr2].value for k in range(len(tab))]
701        if len(a1) == 0: continue
702        val, prob = statc.pearsonr(a1, a2)
703        lengths.append(len(a1))
704        corrs.append(val)
705    corr = 0
706    for ind in range(len(corrs)): corr += abs(corrs[ind])*lengths[ind]
707    corr /= sum(lengths)
708    return corr, corrs, lengths
709
710# compute correlation between two continuous attributes
711def computeCorrelation(data, attr1, attr2):
712    if data.domain[attr1].varType != orange.VarTypes.Continuous: return None
713    if data.domain[attr2].varType != orange.VarTypes.Continuous: return None
714
715    table = data.select([attr1, attr2])
716    table = orange.Preprocessor_dropMissing(table)
717    a1 = [table[k][attr1].value for k in range(len(table))]
718    a2 = [table[k][attr2].value for k in range(len(table))]
719
720    try:
721        val, prob = statc.pearsonr(a1, a2)
722    except:
723        val = 0.0    # possibly invalid a1 or a2
724
725    return val
726
727# ##########################################################################################
728# ##########################################################################################
729
730
731# ##########################################################################################
732# #### FUNCTIONS FOR CALCULATING ATTRIBUTE ORDER USING Oblivious decision graphs
733# ##########################################################################################
734def replaceAttributes(index1, index2, merged, data):
735    attrs = list(data.domain)
736    attrs.remove(data.domain[index1])
737    attrs.remove(data.domain[index2])
738    domain = orange.Domain(attrs+ [merged])
739    return data.select(domain)
740
741
742def getFunctionalList(data):
743    import orngCI
744
745    bestQual = -10000000
746    bestAttr = -1
747    testAttrs = []
748
749    dataShort = orange.Preprocessor_dropMissing(data)
750    # remove continuous attributes from data
751    disc = []
752    for i in range(len(dataShort.domain.attributes)):
753        # keep only discrete attributes that have more than one value
754        if dataShort.domain.attributes[i].varType == orange.VarTypes.Discrete and len(dataShort.domain.attributes[i].values) > 1: disc.append(dataShort.domain.attributes[i].name)
755    if disc == []: return []
756    discData = dataShort.select(disc + [dataShort.domain.classVar.name])
757
758    remover = orngCI.AttributeRedundanciesRemover(noMinimization = 1)
759    newData = remover(discData, weight = 0)
760
761    for attr in newData.domain.attributes: testAttrs.append(attr.name)
762
763    # compute the best attribute combination
764    for i in range(len(newData.domain.attributes)):
765        vals, qual = orngCI.FeatureByMinComplexity(newData, [newData.domain.attributes[i], newData.domain.classVar])
766        if qual > bestQual:
767            bestQual = qual
768            bestAttr = newData.domain.attributes[i].name
769            mergedVals = vals
770            mergedVals.name = newData.domain.classVar.name
771
772    if bestAttr == -1: return []
773    outList = [bestAttr]
774    newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
775    testAttrs.remove(bestAttr)
776
777    while (testAttrs != []):
778        bestQual = -10000000
779        for attrName in testAttrs:
780            vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, attrName])
781            if qual > bestQual:
782                bestqual = qual
783                bestAttr = attrName
784
785        vals, qual = orngCI.FeatureByMinComplexity(newData, [mergedVals, bestAttr])
786        mergedVals = vals
787        mergedVals.name = newData.domain.classVar.name
788        newData = replaceAttributes(bestAttr, newData.domain.classVar, mergedVals, newData)
789        outList.append(bestAttr)
790        testAttrs.remove(bestAttr)
791
792    # new attributes have "'" at the end of their names. we have to remove that in ored to identify them in the old domain
793    for index in range(len(outList)):
794        if outList[index][-1] == "'": outList[index] = outList[index][:-1]
795    return outList
796
797
798# ##########################################################################################
799# POLYVIZ FUNCTIONS USED IN VIZRANK.
800# ##########################################################################################
801
802# input: array where items are arrays, e.g.: [[1,2,3], [1,3,2], [2,1,3], [2,3,1], [3,1,2], [3,2,1]]
803# returns: array, where some items are removed. it rotates items in each array and checks if the item is already in the array. it also tries to flip the items ([::-1])
804# for the above case, it would return [[1,2,3]]
805# used for radviz
806def removeRotationDuplicates(arr, removeFlipDuplicates = 1):
807    final = []
808    projDict = {}
809    for a in arr:
810        if type(a[0]) == int:
811            key = tuple([a[i]-a[i+1] for i in range(-1, len(a)-1)])
812            rkey = tuple([a[i+1]-a[i] for i in range(-1, len(a)-1)])
813            o = min(a)      # offset. needed in cases when we get in arr: [0,1,2] and [1,2,3]. otherwise it would remove the second element because it would look the same as the first
814        else:
815            key = tuple([(a[i][0]-a[i+1][0], a[i][1]-a[i+1][1]) for i in range(-1, len(a)-1)])
816            rkey = tuple([(a[i+1][0]-a[i][0], a[i+1][1]-a[i][1]) for i in range(-1, len(a)-1)])
817            o = 0
818
819        rotations = [key[i:] + key[:i] for i in range(len(key))]
820        if 1 in [projDict.has_key((o,key)) for key in rotations]:
821            continue
822
823        rotations = [rkey[i:] + rkey[:i] for i in range(len(rkey))]
824        if removeFlipDuplicates and 1 in [projDict.has_key((o,key)) for key in rotations]:
825            continue
826
827        projDict[(o,key)] = a
828
829        """
830        found = 0
831        kkey = copy.copy(key)
832       
833        for i in range(len(kkey)):
834            kkey = tuple(list(kkey[1:]) + [kkey[0]])
835            if projDict.has_key((o,kkey)):
836                found = 1
837                break
838
839        # try also if there is a flipped duplicate
840        if not found and removeFlipDuplicates:
841            for i in range(len(rkey)):
842                rkey = tuple(list(rkey[1:]) + [rkey[0]])
843                if projDict.has_key((o,rkey)):
844                    found = 1
845                    break
846        if not found: projDict[(o,key)] = a
847        """
848    return projDict.values()
849
850# create possible combinations with the given set of numbers in arr
851def createMixCombinations(arrs, removeFlipDuplicates):
852
853    def addProjs(projs, count, i):
854        ret = []
855        perms = permutations(range(count))
856        for perm in perms:
857            c = copy.copy(projs)
858            add = [(i, p) for p in perm]
859            c = [p + add for p in c]
860            ret += c
861        return ret
862
863    projs = [[]]
864    for i in range(len(arrs)):
865        projs = addProjs(projs, arrs[i], i)
866    return removeRotationDuplicates(projs, removeFlipDuplicates)
867
868
869# get a list of possible projections if we have numClasses and want to use maxProjLen attributes
870# removeFlipDuplicates tries to flip the attributes in the projection and removes it if the projection already exists
871# removeFlipDuplicates = 1 for radviz and =0 for polyviz
872def createProjections(numClasses, maxProjLen, removeFlipDuplicates = 1):
873    if _projectionListDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
874        return _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)]
875
876    # create array of arrays of lengths, e.g. [3,3,2] that will tell that we want 3 attrs from the 1st class, 3 from 2nd and 2 from 3rd
877    if maxProjLen % numClasses != 0:
878        cs = combinations(range(numClasses), maxProjLen % numClasses)
879        equal = [int(maxProjLen / numClasses) for i in range(numClasses)]
880        lens = [copy.copy(equal) for comb in cs]     # make array of arrays
881        for i, comb in enumerate(cs):
882            for val in comb: lens[i][val] += 1
883    else:
884        lens = [[int(maxProjLen / numClasses) for i in range(numClasses)]]
885
886    combs = []
887    seen = []
888    for l in lens:
889        withoutZeros = filter(None, l)          # if numClasses > maxProjLen then some values in l are 0. we have to remove this zeros and check if we have already added such combinations.
890        if withoutZeros not in seen:
891            tempCombs = createMixCombinations(withoutZeros, removeFlipDuplicates)
892            combs += tempCombs
893            seen.append(withoutZeros)
894
895    if _differentClassPermutationsDict.has_key((numClasses, maxProjLen, removeFlipDuplicates)):
896        perms = _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)]
897    else:
898        if numClasses <= maxProjLen: perms = permutations(range(numClasses))
899        else:                        perms = combinations(range(numClasses), maxProjLen)
900        perms = removeRotationDuplicates(perms, removeFlipDuplicates)
901        _differentClassPermutationsDict[(numClasses, maxProjLen, removeFlipDuplicates)] = perms
902
903    final = []
904    for perm in perms:
905        final += [[(perm[i], j) for (i,j) in comb] for comb in combs]
906
907    _projectionListDict[(numClasses, maxProjLen, removeFlipDuplicates)] = final
908    return final
909
910
911if __name__=="__main__":
912    """
913    print "possible splits of ['a','b','c'] are ", getPossibleSplits(["a","b","c"])
914    print "possible combinations of 2 elements of the array [1,2,3,4] are ", combinations([1,2,3,4],2)
915    print "permutations of [1,2,3] are ", [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]
916    print "array with two randomly switched elements of [1,2,3,4,5] is", switchTwoElements([1,2,3,4,5])
917
918    data = orange.ExampleTable(r"E:\Development\Python23\Lib\site-packages\Orange\Datasets\microarray\cancer diagnostics\leukemia_tran.tab")
919    a = data.toNumpy("ac")[0]
920    c = S2NMeasure()
921    c(data.domain.attributes[0].name, data)
922    """
923    final = createProjections(8,4)
924    print final
Note: See TracBrowser for help on using the repository browser.