source: orange-bioinformatics/_bioinformatics/obiGsea.py @ 1706:fee5567c52af

Revision 1706:fee5567c52af, 32.7 KB checked in by markotoplak, 20 months ago (diff)

Make the datasets with joined class values picklable.

Line 
1from __future__ import absolute_import
2
3from collections import defaultdict
4import random
5import time
6
7import numpy
8
9import orange
10import Orange
11
12from . import obiGeneSets
13from .obiExpression import *
14
15"""
16Gene set enrichment analysis.
17
18Author: Marko Toplak
19"""
20
21def iset(data):
22    """
23    Is data orange.ExampleTable?
24    """
25    return isinstance(data, orange.ExampleTable)
26
27def issequencens(x):
28    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and is not string."
29    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
30
31def mean(l):
32    return float(sum(l))/len(l)
33
34def rankingFromOrangeMeas(meas):
35    """
36    Creates a function that sequentally ranks all attributes and returns
37    results in a list. Ranking function is build out of
38    orange.MeasureAttribute.
39    """
40    return lambda d: [ meas(i,d) for i in range(len(d.domain.attributes)) ]
41
42def orderedPointersCorr(lcor):
43    """
44    Return a list of integers: indexes in original
45    lcor. Elements in the list are ordered by
46    their lcor[i] value. Higher correlations first.
47    """
48    ordered = [ (i,a) for i,a in enumerate(lcor) ] #original pos + correlation
49    ordered.sort(lambda x,y: cmp(y[1],x[1])) #sort by correlation, descending
50    ordered = nth(ordered, 0) #contains positions in the original list
51    return ordered
52
53def enrichmentScoreRanked(subset, lcor, ordered, p=1.0, rev2=None):
54    """
55    Input data and subset.
56   
57    subset: list of attribute indices of the input data belonging
58        to the same set.
59    lcor: correlations with class for each attribute in a list.
60
61    Returns enrichment score on given data.
62
63    This implementation efficiently handles "sparse" genesets (that
64    cover only a small subset of all genes in the dataset).
65    """
66
67    #print lcor
68
69    subset = set(subset)
70
71    if rev2 == None:
72        def rev(l):
73            return numpy.argsort(l)
74        rev2 = rev(ordered)
75
76    #add if gene is not in the subset
77    notInA = -(1. / (len(lcor)-len(subset)))
78    #base for addition if gene is in the subset
79    cors = [ abs(lcor[i])**p for i in subset ]
80    sumcors = sum(cors)
81
82    #this should not happen
83    if sumcors == 0.0:
84        return (0.0, None)
85   
86    inAb = 1./sumcors
87
88    ess = [0.0]
89   
90    map = {}
91    for i in subset:
92        orderedpos = rev2[i]
93        map[orderedpos] = inAb*abs(lcor[i]**p)
94       
95    last = 0
96
97    maxSum = minSum = csum = 0.0
98
99    for a,b in sorted(map.items()):
100        diff = a-last
101        csum += notInA*diff
102        last = a+1
103       
104        if csum < minSum:
105            minSum = csum
106       
107        csum += b
108
109        if csum > maxSum:
110            maxSum = csum
111
112    #finish it
113    diff = (len(ordered))-last
114    csum += notInA*diff
115
116    if csum < minSum:
117        minSum = csum
118
119    #print "MY", (maxSum if abs(maxSum) > abs(minSum) else minSum)
120
121    """
122    #BY DEFINITION
123    print "subset", subset
124
125    for i in ordered:
126        ess.append(ess[-1] + \
127            (inAb*abs(lcor[i]**p) if i in subset else notInA)
128        )
129        if i in subset:
130            print ess[-2], ess[-1]
131            print i, (inAb*abs(lcor[i]**p))
132
133    maxEs = max(ess)
134    minEs = min(ess)
135   
136    print "REAL", (maxEs if abs(maxEs) > abs(minEs) else minEs, ess[1:])
137
138    """
139    return (maxSum if abs(maxSum) > abs(minSum) else minSum, [])
140
141#from mOrngData
142def shuffleAttribute(data, attribute, locations):
143    """
144    Destructive!
145    """
146    attribute = data.domain[attribute]
147    l = [None]*len(data)
148    for i in range(len(data)):
149        l[locations[i]] = data[i][attribute]
150    for i in range(len(data)):
151        data[i][attribute] = l[i]
152
153def shuffleClass(datai, rands=0):
154    """
155    Returns a dataset with values of class attribute randomly shuffled.
156    If multiple dataset are on input shuffle them all with the same random seed.
157    """
158    def shuffleOne(data):
159        rand = random.Random(rands)
160        d2 = orange.ExampleTable(data.domain, data)
161        locations = range(len(data))
162        rand.shuffle(locations)
163        shuffleAttribute(d2, d2.domain.classVar, locations)
164        return d2
165
166    if iset(datai):
167        return shuffleOne(datai)
168    else:
169        return [ shuffleOne(data) for data in datai ]
170
171def shuffleList(l, rand=random.Random(0)):
172    """
173    Returns a copy of a shuffled input list.
174    """
175    import copy
176    l2 = copy.copy(l)
177    rand.shuffle(l2)
178    return l2
179
180def shuffleAttributes(data, rand=random.Random(0)):
181    """
182    Returns a dataset with a new attribute order.
183    """
184    natts = shuffleList(list(data.domain.attributes), rand)
185    dom2 = orange.Domain(natts, data.domain.classVar)
186    d2 = orange.ExampleTable(dom2, data)
187    return d2
188
189def gseapval(es, esnull):
190    """
191    From article (PNAS):
192    estimate nominal p-value for S from esnull by using the positive
193    or negative portion of the distribution corresponding to the sign
194    of the observed ES(S).
195    """
196   
197    try:
198        if es < 0:
199            return float(len([ a for a in esnull if a <= es ]))/ \
200                len([ a for a in esnull if a < 0])   
201        else: 
202            return float(len([ a for a in esnull if a >= es ]))/ \
203                len([ a for a in esnull if a >= 0])
204    except:
205        return 1.0
206
207
208def enrichmentScore(data, subset, rankingf):
209    """
210    Returns enrichment score and running enrichment score.
211    """
212    lcor = rankingf(data)
213    ordered = orderedPointersCorr(lcor)
214    es,l = enrichmentScoreRanked(subset, lcor, ordered)
215    return es,l
216
217def gseaE(data, subsets, rankingf=None, \
218        n=100, permutation="class", **kwargs):
219    """
220    Run GSEA algorithm on an example table.
221
222    data: orange example table.
223    subsets: list of distinct subsets of data.
224    rankingf: function that returns correlation to class of each
225        variable.
226    n: number of random permutations to sample null distribution.
227    permutation: "class" for permutating class, else permutate attribute
228        order.
229
230    """
231
232    if not rankingf:
233        rankingf=rankingFromOrangeMeas(MA_signalToNoise())
234
235    enrichmentScores = []
236 
237    lcor = rankingf(data)
238    #print lcor
239
240    ordered = orderedPointersCorr(lcor)
241
242    def rev(l):
243        return numpy.argsort(l)
244
245    rev2 = rev(ordered)
246
247    for subset in subsets:
248        es = enrichmentScoreRanked(subset, lcor, ordered, rev2=rev2)[0]
249        enrichmentScores.append(es)
250
251    runOptCallbacks(kwargs)
252
253    #print "PERMUTATION", permutation
254
255    enrichmentNulls = [ [] for a in range(len(subsets)) ]
256
257    for i in range(n):
258
259        if permutation == "class":
260            d2 = shuffleClass(data, 2000+i) #fixed permutation
261            r2 = rankingf(d2)
262
263        else:
264            r2 = shuffleList(lcor, random.Random(2000+i))
265
266        ordered2 = orderedPointersCorr(r2)
267        rev22 = rev(ordered2)
268        for si,subset in enumerate(subsets):
269            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
270            enrichmentNulls[si].append(esn)
271
272        runOptCallbacks(kwargs)
273
274    return gseaSignificance(enrichmentScores, enrichmentNulls)
275
276
277def runOptCallbacks(rargs):
278    if "callback" in rargs:
279        try:
280            [ a() for a in rargs["callback"] ]
281        except:
282            rargs["callback"]()
283           
284
285def gseaR(rankings, subsets, n=100, **kwargs):
286    """
287    """
288
289    if "permutation" in kwargs:
290        if kwargs["permutation"] == "class":
291            raise Exception("Only gene permutation possible")
292
293    enrichmentScores = []
294 
295    ordered = orderedPointersCorr(rankings)
296   
297    def rev(l):
298        return numpy.argsort(l)
299
300    rev2 = rev(ordered)
301
302    for subset in subsets:
303
304        es = enrichmentScoreRanked(subset, rankings, ordered, rev2=rev2)[0]
305        enrichmentScores.append(es)
306   
307    runOptCallbacks(kwargs)
308
309    enrichmentNulls = [ [] for a in range(len(subsets)) ]
310
311    for i in range(n):
312       
313        r2 = shuffleList(rankings, random.Random(2000+i))
314        ordered2 = orderedPointersCorr(r2)
315        rev22 = rev(ordered2)
316
317        for si,subset in enumerate(subsets):
318
319            esn = enrichmentScoreRanked(subset, r2, ordered2, rev2=rev22)[0]
320            enrichmentNulls[si].append(esn)
321
322        runOptCallbacks(kwargs)
323
324    return gseaSignificance(enrichmentScores, enrichmentNulls)
325
326
327def gseaSignificance(enrichmentScores, enrichmentNulls):
328
329    #print enrichmentScores
330
331    import time
332
333    tb1 = time.time()
334
335    enrichmentPVals = []
336    nEnrichmentScores = []
337    nEnrichmentNulls = []
338
339    for i in range(len(enrichmentScores)):
340        es = enrichmentScores[i]
341        enrNull = enrichmentNulls[i]
342        #print es, enrNull
343
344        enrichmentPVals.append(gseapval(es, enrNull))
345
346        #normalize the ES(S,pi) and the observed ES(S), separetely rescaling
347        #the positive and negative scores by divident by the mean of the
348        #ES(S,pi)
349
350        #print es, enrNull
351
352        def normalize(s):
353            try:
354                if s == 0:
355                    return 0.0
356                if s >= 0:
357                    meanPos = mean([a for a in enrNull if a >= 0])
358                    #print s, meanPos
359                    return s/meanPos
360                else:
361                    meanNeg = mean([a for a in enrNull if a < 0])
362                    #print s, meanNeg
363                    return -s/meanNeg
364            except:
365                return 0.0 #return if according mean value is uncalculable
366
367
368        nes = normalize(es)
369        nEnrichmentScores.append(nes)
370       
371        nenrNull = [ normalize(s) for s in enrNull ]
372        nEnrichmentNulls.append(nenrNull)
373 
374
375    #print "First part", time.time() - tb1
376
377    #FDR computation
378    #create a histogram of all NES(S,pi) over all S and pi
379    vals = reduce(lambda x,y: x+y, nEnrichmentNulls, [])
380
381
382    def shorten(l, p=10000):
383        """
384        Take each len(l)/p element, if len(l)/p >= 2.
385        """
386        e = len(l)/p
387        if e <= 1:
388            return l
389        else:
390            return [ l[i] for i in xrange(0, len(l), e) ]
391
392    #vals = shorten(vals) -> this can speed up second part. is it relevant TODO?
393
394    """
395    Use this null distribution to compute an FDR q value, for a given NES(S) =
396    NES* >= 0. The FDR is the ratio of the percantage of all (S,pi) with
397    NES(S,pi) >= 0, whose NES(S,pi) >= NES*, divided by the percentage of
398    observed S wih NES(S) >= 0, whose NES(S) >= NES*, and similarly if NES(S)
399    = NES* <= 0.
400    """
401
402    nvals = numpy.array(sorted(vals))
403    nnes = numpy.array(sorted(nEnrichmentScores))
404
405    #print "LEN VALS", len(vals), len(nEnrichmentScores)
406
407    fdrs = []
408
409    import operator
410
411    for i in range(len(enrichmentScores)):
412
413        nes = nEnrichmentScores[i]
414
415        """
416        #Strighfoward but slow implementation follows in comments.
417        #Useful as code description.
418       
419        if nes >= 0:
420            op0 = operator.ge
421            opn = operator.ge
422        else:
423            op0 = operator.lt
424            opn = operator.le
425
426        allPos = [a for a in vals if op0(a,0)]
427        allHigherAndPos = [a for a in allPos if opn(a,nes) ]
428
429        nesPos = [a for a in nEnrichmentScores if op0(a,0) ]
430        nesHigherAndPos = [a for a in nesPos if opn(a,nes) ]
431
432        top = len(allHigherAndPos)/float(len(allPos)) #p value
433        down = len(nesHigherAndPos)/float(len(nesPos))
434       
435        l1 = [ len(allPos), len(allHigherAndPos), len(nesPos), len(nesHigherAndPos)]
436
437        allPos = allHigherAndPos = nesPos =  nesHigherAndPos = 1
438
439        """
440
441        #this could be speed up twice with the same accuracy!
442        if nes >= 0:
443            allPos = int(len(vals) - numpy.searchsorted(nvals, 0, side="left"))
444            allHigherAndPos = int(len(vals) - numpy.searchsorted(nvals, nes, side="left"))
445            nesPos = len(nnes) - int(numpy.searchsorted(nnes, 0, side="left"))
446            nesHigherAndPos = len(nnes) - int(numpy.searchsorted(nnes, nes, side="left"))
447        else:
448            allPos = int(numpy.searchsorted(nvals, 0, side="left"))
449            allHigherAndPos = int(numpy.searchsorted(nvals, nes, side="right"))
450            nesPos = int(numpy.searchsorted(nnes, 0, side="left"))
451            nesHigherAndPos = int(numpy.searchsorted(nnes, nes, side="right"))
452           
453        """
454        #Comparing results
455        l2 = [ allPos, allHigherAndPos, nesPos, nesHigherAndPos ]
456        diffs = [ l1[i]-l2[i] for i in range(len(l1)) ]
457        sumd = sum( [ abs(a) for a in diffs ] )
458        if sumd > 0:
459            print nes > 0
460            print "orig", l1
461            print "modi", l2
462        """
463
464        try:
465            top = allHigherAndPos/float(allPos) #p value
466            down = nesHigherAndPos/float(nesPos)
467
468            fdrs.append(top/down)
469        except:
470            fdrs.append(1000000000.0)
471   
472    #print "Whole part", time.time() - tb1
473
474    return zip(enrichmentScores, nEnrichmentScores, enrichmentPVals, fdrs)
475
476from . import obiGene
477
478def nth(l,n): return [ a[n] for a in l ]
479
480def itOrFirst(data):
481    """ Returns input if input is of type ExampleTable, else returns first
482    element of the input list """
483    if iset(data):
484        return data
485    else:
486        return data[0]
487
488def wrap_in_list(data):
489    """ Wraps orange.ExampleTable in a list """
490    if iset(data):
491        return [ data ]
492    else:
493        return data
494
495class transform_class(object):
496   
497    def __init__(self, cv, mapval, class_values, nclass):
498        self.cv = cv
499        self.mapval = mapval
500        self.class_values = class_values
501        self.nclass = nclass
502   
503    def __call__(self, ex, *args, **kwargs):
504        """
505        Removes unnecessary class values and joins them according
506        to function input.
507        """
508        if ex[self.cv] in self.class_values:
509            return self.nclass(self.mapval[str(ex[self.cv].value)])
510        else:
511            return "?"
512
513def takeClasses(datai, classValues=None):
514    """
515    Function joins class groups specified in an input pair
516    classValues. Each element of the pair is a list of class
517    values to be joined to first or second class. Group
518    classes in two new class values.
519    If class values are not specified, take all the classes.
520
521    Input data can be a single data set or a list of data sets
522    with the same domain.
523
524    Returns transformed data sets / data sets.
525    """
526
527    cv = itOrFirst(datai).domain.classVar
528    nclassvalues = None
529
530    if cv and len(itOrFirst(datai)) > 1:
531        oldcvals = [ a for a in cv.values ]
532       
533        if not classValues:
534            classValues = oldcvals
535
536        toJoin = []
537
538        for vals in classValues:
539            if issequencens(vals):
540                toJoin.append(list(vals))
541            else:
542                toJoin.append([vals])
543
544        classValues = reduce(lambda x,y: x+y, toJoin)
545        classValues = [ str(a) for a in classValues ] # ok class values
546
547        #dictionary of old class -> new class
548        mapval = {}
549        nclassvalues = [] # need to preserver order
550
551        for joinvals in toJoin:
552            joinvalsn = "+".join([ str(val) for val in sorted(joinvals) ])
553            nclassvalues.append(joinvalsn)
554
555            for val in joinvals:
556                mapval[str(val)] = joinvalsn
557
558        #take only examples with classValues classes
559        nclass = orange.EnumVariable(cv.name, values=nclassvalues)
560
561        tco = transform_class(cv=cv,mapval=mapval,class_values=classValues, nclass=nclass)
562
563        nclass.get_value_from = tco
564
565        ndom = orange.Domain(itOrFirst(datai).domain.attributes, nclass)
566
567        def removeAndTransformClasses(data):
568            ndata = Orange.data.Table(ndom, data)
569            ndata = Orange.data.Table(ndom, [ex for ex in ndata if ex[-1].value != "?"])
570            return ndata
571
572        if iset(datai):
573            datai = removeAndTransformClasses(datai)
574        else:
575            datai = [ removeAndTransformClasses(data) for data in datai ]
576
577    return datai
578
579def removeBadAttributes(datai, atLeast=3):
580    """
581    Removes attributes which would obscure GSEA analysis.
582
583    Attributes need to be continuous, they need to have
584    at least one value. Remove other attributes.
585
586    For the attribute to be valid, it needs to have at least
587    [atLeast] values for every class value.
588
589    Return transformed data set / data sets and ignored attributes.
590    """
591
592    def attrOk(a, data):
593        """
594        Attribute is ok if it is continouous and if containg
595        at least atLest not unknown values.
596        """
597
598        a = data.domain.attributes.index(a)
599
600        #can't
601        if data.domain.attributes[a].varType != orange.VarTypes.Continuous:
602            return False
603
604        if len(data) == 1:
605
606            vals = [ex[a].value for ex in data if not ex[a].isSpecial()]
607            if len(vals) < 1:
608                return False 
609       
610        if len(data) > 1 and data.domain.classVar and atLeast > 0:
611           
612            dc = dict( (v, 0) for v in data.domain.classVar.values )
613
614            for ex in data:
615                if not ex[a].isSpecial():
616                    dc[ex[-1].value] += 1
617
618            minl = min(dc.values())
619
620            if minl < atLeast:
621                #print "Less than atLeast"
622                return False
623
624        return True
625   
626
627    def notOkAttributes(data):
628        ignored = []
629        for a in data.domain.attributes:
630            if not attrOk(a, data):
631                #print "Removing", a
632                ignored.append(a)
633        return ignored
634   
635    ignored = []
636    if iset(datai):
637        ignored = set(notOkAttributes(datai))
638    else:
639        #ignore any attribute which is has less than atLeast values for each class
640        #ignored = set(reduce(lambda x,y: x+y, [ notOkAttributes(data) for data in datai ]))
641
642        #remove any attribute, which is ok in less than half of the dataset
643        ignored = []
644        for a in itOrFirst(datai).domain.attributes:
645            attrOks = sum([ attrOk(a, data) for data in datai ])
646            if attrOks < len(datai)/2:
647                ignored.append(a)
648
649
650    natts = [ a for a in itOrFirst(datai).domain.attributes if a not in ignored ]
651    #print ignored, natts, set(ignored) & set(natts)
652
653    ndom = orange.Domain(natts, itOrFirst(datai).domain.classVar)
654
655    datao = None
656    if iset(datai):
657        datao = orange.ExampleTable(ndom, datai)
658    else:
659        datao = [ orange.ExampleTable(ndom, data) for data in datai ]
660
661    return datao, ignored
662
663def keepOnlyMeanAttrs(datai, atLeast=3, classValues=None):
664    """
665    Attributes need to be continuous, they need to have
666    at least one value.
667
668    In order of attribute to be valid, it needs to have at least
669    [atLeast] values for every class value.
670
671    Keep only specified classes - group them in two values.
672    """   
673    datai = takeClasses(datai, classValues=classValues)
674    return removeBadAttributes(datai, atLeast=atLeast)
675
676def data_single_meas_column(data):
677    """
678    Returns true if data seems to be in one column
679    (float variables) only. This column should contain
680    the rankings
681    """
682    columns = [a for a in data.domain] +  [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
683    floatvars = [ a for a in columns if a.varType == orange.VarTypes.Continuous ]
684    if len(floatvars) == 1:
685        return True
686    else:
687        return False
688
689def transform_data(data, phenVar, geneVar):
690    """
691    if we have log2ratio in a single value column, transpose the matrix
692    i.e. we have a single column with a continous variable. first
693    string variable then becomes the gene name
694
695    The goal is to have different phenotypes annotated with a class,
696    and names of genes as attribute names.
697
698    If phenVar is False, then we can work, then the input already
699    consists of scores of differential expressions
700
701    If we have a single column, transpose it.
702    If phenVar is one of the groups, transpose the matrix.
703    """
704
705    def prepare_data(data, phenVar=None, geneVar=None):
706
707        def rorq(a, name):
708            """ Group annatation or question mark. """
709            try: 
710                return a.attributes[name]
711            except: 
712                return '?'
713   
714        #use class as phenotype by default, if it is present,
715        #if not, do not use any phenotype!
716        if phenVar == None: 
717            if not data.domain.classVar:
718                phenVar = False
719            else:
720                phenVar = data.domain.classVar
721
722
723        #TODO validate phenVar and geneVar?
724        #TODO autodetection of groups?
725
726        #transpose is not needed if phenVar is classVar or phenVar is False
727        #and there is only one sample
728        if phenVar == data.domain.classVar or \
729            (phenVar == False and len(data) == 1):
730
731            if geneVar == None: #if not specified, set as true in this stage
732                geneVar = True
733
734            floatvars = [ a for a in data.domain.attributes \
735                if a.varType == orange.VarTypes.Continuous ]
736
737            #rename attributes without touching the original variable
738            if geneVar != True:
739                fl2 = []
740
741                for a in floatvars:
742                    na = orange.FloatVariable(name=rorq(a, geneVar))
743                    na.getValueFrom = lambda e, rw: e[a]
744                    fl2.append(na)
745
746                floatvars = fl2
747
748            dom = orange.Domain(floatvars, phenVar)
749            return orange.ExampleTable(dom, data)
750
751        elif phenVar == False or phenVar != data.domain.classVar:
752
753            cands = allgroups(data)
754            pv = False
755            if phenVar != False:
756                pv = orange.EnumVariable(name="phenotype", 
757                    values=list(cands[phenVar]))
758
759            #take the only string attribute as a gene name
760            gc = gene_cands(data, False)
761            if geneVar == None:
762                if len(gc) == 1:
763                    geneVar = gc[0]
764                else:
765                    geneNamesUnspecifiedError()
766           
767            latts = [ orange.FloatVariable(name=ex[geneVar].value) \
768                for ex in data ]
769
770            domain = orange.Domain(latts, pv)
771
772            examples = []
773            for at in data.domain.attributes:
774                if at.varType == orange.VarTypes.Continuous:
775                    vals = [ ex[at].value for ex in data ]
776                    if pv != False: #add class value
777                        vals.append(rorq(at, phenVar))
778                    examples.append(orange.Example(domain, vals))
779
780            return orange.ExampleTable(domain, examples)
781        else:
782            wrongInputsError()
783
784    #transform all example tables
785    single = iset(data)
786    transposed = [ prepare_data(d, phenVar, geneVar) for d in wrap_in_list(data) ]
787
788    if single:
789        return transposed[0]
790    else:
791        return transposed
792
793
794def allgroups(data):
795    """
796    Return all phenotype descriptors of attributes with their values.
797    """
798    sd = defaultdict(set)
799    for attr in data.domain.attributes:
800        for key, value in attr.attributes.items():
801            sd[key].add(value)
802    return sd
803
804def already_have_correlations(data):
805    return len(data) == 1 or data_single_meas_column(data)
806
807def need_to_transpose_single(data):
808    if len(data) == 1:
809        return False
810    else:
811        return True
812
813def phenotype_cands(data):
814    """
815    Return all phenotype candidate descriptors in a list of tuples
816    (variable, values). Candidates are class variable, if it exists and
817    attributes dictionaries of attributes.
818    Phenotype candidates must contain at least two differend values.
819    """
820    if already_have_correlations(data):
821        return [ (False, set()) ]
822    else:
823        cv = []
824        if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
825            cv.append((data.domain.classVar, set(data.domain.classVar.values)))
826        cands = cv + sorted(allgroups(data).items())
827        return filter(lambda x: len(x[1]) >= 2, cands)
828
829def gene_cands(data, correct):
830    """
831    Returns all valid gene descriptors with regards to the choosen
832    phenotype variable.
833    Return variable descriptor for variables, name of the group for
834    descriptions in attr.attributes and True for the usage
835    of attribute names.
836    Correct is True, if the example table has genes as attributes.
837    """
838    if correct:
839        #gene names could be in attributes or as gene names (marker True)
840        return [True] + nth(sorted(allgroups(data)),0)
841    else:
842        #gene names are values of some string attribute
843        columns = [a for a in data.domain] +  \
844            [ data.domain.getmeta(a) for a in list(data.domain.getmetas()) ]
845        stringvars = [ a for a in columns if a.varType == 6 ]
846        return stringvars
847
848def is_variable(phenVar):
849    return isinstance(phenVar, orange.Variable)
850
851class GSEA(object):
852
853    def __init__(self, data, organism=None, matcher=None, classValues=None, 
854        atLeast=3, caseSensitive=False, phenVar=None, geneVar=None):
855        """
856        If the data set constains multiple measurements for a single gene,
857        all are considered. Individual constributions of such measurements
858        are not weighted down - each measurement is as important as they
859        would measure different genes.
860
861        phenVar and geneVar can ether be an orange attribute or a string.
862        If they are strings, then they describe a group.
863        """
864
865        self.genesets = {}
866        self.organism = organism
867
868        if organism != None:
869            print "WARNING: obiGsea - organism and caseSensitive parameters are deprecated. Use matcher instead."
870
871        self.gsweights = {}
872        self.namesToIndices = None
873        self.gm = matcher
874
875        data = transform_data(data, phenVar, geneVar)
876
877        data, info = keepOnlyMeanAttrs(data, classValues=classValues, atLeast=atLeast)
878
879        self.data = data
880
881        #init attrnames
882        attrnames = [ a.name for a in itOrFirst(self.data).domain.attributes ]
883
884        if self.gm == None: #build a gene matcher, if if does not exists
885            self.gm = obiGene.matcher([obiGene.GMKEGG(self.organism, ignore_case=not caseSensitive)], 
886                ignore_case=not caseSensitive, direct=True)
887            print "WARNING: gene matcher build automatically for organism: " + self.organism
888
889        self.gm.set_targets(attrnames)
890
891 
892    def addGeneset(self, genesetname, genes):
893        """
894        Add a single gene set. See addGenesets function.
895        Solely for backwards compatibility.
896        """
897        self.addGenesets({ genesetname: genes })
898
899    def addGenesets(self, genesets):
900        """
901        Adds genesets from input dictionary. Performs gene matching. Adds
902        to a self.genesets: key is genesetname, it's values are individual
903        genes and match results.
904        """
905        for g in obiGeneSets.GeneSets(genesets):
906            genes = g.genes
907            datamatch = filter(lambda x: x[1] != None, 
908                [ (gene, self.gm.umatch(gene)) for gene in genes])
909            self.genesets[g] = datamatch
910
911    def selectGenesets(self, minSize=3, maxSize=1000, minPart=0.1):
912        """ Returns a list of gene sets that have sizes in limits """
913
914        def okSizes(orig, transl):
915            """compares sizes of genesets to limitations"""
916            if len(transl) >= minSize and len(transl) <= maxSize \
917                and float(len(transl))/len(orig) >= minPart:
918                return True
919            return False
920
921        return  dict( (a,c) for a,c in self.genesets.iteritems() if okSizes(a.genes,c) )
922
923    def genesIndices(self, genes):
924        """
925        Returns in attribute indices of given genes.
926        Buffers locations dictionary.
927        """
928        if not self.namesToIndices:
929            self.namesToIndices = defaultdict(list)
930            for i,at in enumerate(itOrFirst(self.data).domain.attributes):
931                self.namesToIndices[at.name].append(i)
932        return reduce(lambda x,y:x+y, [ self.namesToIndices[gname] for gname in genes ], [])
933
934    def compute_gene_weights(self, gsweights, gsetsnum, nattributes):
935        """
936        Computes gene set weights for all specified weights.
937        Expects gene sets in form { name: [ num_attributes ] }
938        GSWeights are
939        """
940        pass
941
942    def to_gsetsnum(self, gsets):
943        """
944        Returns a dictionary of given  gene sets in gsetnums format.
945        """
946        return dict( (gs,self.genesIndices(nth(self.genesets[gs],1))) for gs in gsets)
947
948    def compute(self, minSize=3, maxSize=1000, minPart=0.1, n=100, **kwargs):
949
950        subsetsok = self.selectGenesets(minSize=minSize, maxSize=maxSize, minPart=minPart)
951
952        geneweights = None
953
954        gsetsnum = self.to_gsetsnum(subsetsok.keys())
955        gsetsnumit = gsetsnum.items() #to fix order
956
957        #gsetsnumit = gsetsnumit[:1]
958        #print gsetsnumit
959
960        if len(gsetsnum) == 0:
961            return {} # quick return if no genesets
962
963        if len(self.gsweights) > 0:
964            #set geneset
965            geneweights = [1]*len(data.domain.attributes)
966
967        if len(itOrFirst(self.data)) > 1:
968            gseal = gseaE(self.data, nth(gsetsnumit,1), n=n, geneweights=geneweights, **kwargs)
969        else:
970            rankings = [ self.data[0][at].native() for at in self.data.domain.attributes ]
971            gseal = gseaR(rankings, nth(gsetsnumit,1), n=n, **kwargs)
972
973        res = {}
974
975        for gs, gseale in zip(nth(gsetsnumit,0),gseal):
976            rdict = {}
977            rdict['es'] = gseale[0]
978            rdict['nes'] = gseale[1]
979            rdict['p'] = gseale[2]
980            rdict['fdr'] = gseale[3]
981            rdict['size'] = len(gs.genes)
982            rdict['matched_size'] = len(self.genesets[gs])
983            rdict['genes'] = nth(self.genesets[gs],1)
984            res[gs] = rdict
985
986        return res
987
988def runGSEA(data, organism=None, classValues=None, geneSets=None, n=100, 
989        permutation="class", minSize=3, maxSize=1000, minPart=0.1, atLeast=3, 
990        matcher=None, geneVar=None, phenVar=None, caseSensitive=False, 
991        **kwargs):
992    """
993    phenVar and geneVar specify the phenotype and gene variable.
994    """
995    gso = GSEA(data, organism=organism, matcher=matcher, 
996        classValues=classValues, atLeast=atLeast, caseSensitive=caseSensitive,
997        geneVar=geneVar, phenVar=phenVar)
998    gso.addGenesets(geneSets)
999    res1 = gso.compute(n=n, permutation=permutation, minSize=minSize,
1000        maxSize=maxSize, minPart=minPart, **kwargs)
1001    return res1
1002
1003def etForAttribute(datal,a):
1004    """
1005    Builds an example table for a single attribute across multiple
1006    example tables.
1007    """
1008
1009    tables = len(datal)
1010
1011    def getAttrVals(data, attr):
1012        dom2 = orange.Domain([data.domain[attr]], False)
1013        dataa = orange.ExampleTable(dom2, data)
1014        return [ a[0].native() for a in dataa ]
1015
1016    domainl = []
1017    valuesl = []
1018
1019    for id, data in enumerate(datal):
1020        v = getAttrVals(data,a)
1021        valuesl.append(v)
1022        domainl.append(orange.FloatVariable(name=("v"+str(id))))
1023
1024    classvals = getAttrVals(data, datal[0].domain.classVar)
1025    valuesl += [ classvals ]
1026
1027    dom = orange.Domain(domainl, datal[0].domain.classVar)
1028    examples = [ list(a) for a in zip(*valuesl) ]
1029
1030    datat = orange.ExampleTable(dom, examples)
1031
1032    return datat
1033
1034
1035def evaluateEtWith(fn, *args, **kwargs):
1036    """
1037    fn - evaluates example table given
1038    following arguments.
1039    """
1040
1041    def newf(datal):
1042        res = []
1043        for a in datal[0].domain.attributes:
1044            et = etForAttribute(datal, a)
1045            res.append(fn(et, *args, **kwargs))
1046        return res
1047
1048    return newf
1049
1050
1051def hierarchyOutput(results, limitGenes=50):
1052    """
1053    Transforms results for use by hierarchy output from GO.
1054
1055    limitGenes - maximum number of genes on output.
1056    """
1057    trans = []
1058   
1059    for name, res in results.items():
1060        try:
1061            second = name.split(' ')[2]
1062            name = second if second[:2] == 'GO' else name
1063        except:
1064            pass
1065       
1066        trans.append((name, abs(res["nes"]), res["matched_size"], res["size"], res["p"], min(res["fdr"], 1.0), res["genes"][:limitGenes]))
1067
1068    return trans
1069
1070if  __name__=="__main__":
1071
1072    """
1073    Old example with another measure function
1074    data = orange.ExampleTable("gene_three_lines_log.tab")
1075    data = orange.ExampleTable("sterolTalkHepa.tab")
1076    gen1 = collections(['steroltalk.gmt', ':kegg:hsa'], default=False)
1077    rankingf = rankingFromOrangeMeas(MA_anova())
1078    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1079    geneVar = gene_cands(data, False)[1]
1080    phenVar = "group"
1081    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, rankingf=rankingf, phenVar=phenVar, geneVar=geneVar)
1082    """
1083    """
1084    data = orange.ExampleTable("sterolTalkHepa.tab")
1085    gen1 = obiGeneSets.collections('steroltalk.gmt', (("KEGG",), "9606"))
1086    matcher = obiGene.matcher([obiGene.GMKEGG('hsa')])
1087    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher)
1088    """
1089    matcher = obiGene.matcher([[obiGene.GMKEGG('ddi'), obiGene.GMDicty()]])
1090    data = orange.ExampleTable("/home/marko/t_gsea1.tab")
1091    gen1 = obiGeneSets.collections((("KEGG",), "352472"))
1092    out = runGSEA(data, n=10, geneSets=gen1, permutation="gene", atLeast=3, matcher=matcher, phenVar="growth")
1093    print out
1094    print "\n".join(map(str,sorted(out.items())))
1095   
Note: See TracBrowser for help on using the repository browser.