source: orange-bioinformatics/obiGene.py @ 1183:6b51cc000d70

Revision 1183:6b51cc000d70, 26.0 KB checked in by ales_erjavec <ales.erjavec@…>, 4 years ago (diff)
  • added tax id mapping to NCBIGeneInfo
Line 
1import os
2import obiTaxonomy
3import orngServerFiles
4
5import time
6
# Path returned by orngServerFiles.localpath for the "NCBI_geneinfo" domain.
default_database_path = orngServerFiles.localpath("NCBI_geneinfo")
8
class GeneInfo(object):
    """ An object representing the NCBI information for a gene.

    An instance is built from one tab-separated line of an NCBI gene_info
    file.  The columns are exposed as attributes named after
    NCBI_GENEINFO_TAGS.  A column holding "-" is stored as None; the columns
    listed in NCBI_MULTIPLE_CARDINALITY_TAGS are split on "|" into lists
    (an empty list when the column is "-").
    """

    NCBI_GENEINFO_TAGS = ("tax_id", "gene_id", "symbol", "locus_tag", "synonyms",
                          "dbXrefs", "chromosome", "map_location", "description", "type",
                          "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority",
                          "nomenclature_status", "other_designations", "modification_date")
    NCBI_MULTIPLE_CARDINALITY_TAGS = ("synonyms", "dbXrefs", "other_designations")

    __slots__ = NCBI_GENEINFO_TAGS

    def __init__(self, line):
        """ Construct the GeneInfo object from a line in the NCBI gene_info file

        Columns missing at the end of a truncated line are treated as "-".
        """
        fields = line.split("\t")
        # Pad short lines with the "missing" marker so that every slot gets
        # assigned; an unassigned slot raises AttributeError on access,
        # which used to break repr() for truncated lines.
        fields.extend(["-"] * (len(self.__slots__) - len(fields)))
        for attr, value in zip(self.__slots__, fields):
            if value == "-":
                value = None
            if attr in GeneInfo.NCBI_MULTIPLE_CARDINALITY_TAGS:
                value = value.split("|") if value is not None else []
            setattr(self, attr, value)

    def __repr__(self):
        """ Return the record as a tab-separated gene_info line.

        NCBIGeneInfo.__setitem__ stores repr(value), so the result must stay
        parseable by __init__.
        """
        def as_column(value):
            # None, "" and [] are all written as the "missing" marker.
            if not value:
                return "-"
            if isinstance(value, list):
                return "|".join(value)
            return value
        return "\t".join(as_column(getattr(self, slot)) for slot in self.__slots__)

    def __str__(self):
        return repr(self)
43
class GeneHistory(object):
    """ One record of an NCBI gene history file.

    The tab-separated columns of the line are stored, unmodified, as the
    attributes named in NCBI_GENE_HISTORY_TAGS.
    """
    NCBI_GENE_HISTORY_TAGS = ("tax_id", "gene_id", "discontinued_gene_id", "discontinued_symbol", "discontinue_date")
    __slots__ = NCBI_GENE_HISTORY_TAGS

    def __init__(self, line):
        """ Fill the slots, in order, from the tab-separated *line*. """
        columns = line.split("\t")
        for name, column in zip(self.__slots__, columns):
            setattr(self, name, column)
50           
51           
class NCBIGeneInfo(dict):
    """ A dictionary of NCBI gene info records keyed by gene id.

    The raw tab-separated gene_info lines are what is actually stored in the
    dict; item access and the value/item accessors wrap them in GeneInfo
    objects on the fly.  Calling the object (or get_info) first resolves a
    gene name to a gene id through the gene matcher.
    """

    # Taxonomy ids that are replaced by another id before use.
    TAX_MAP = {
            "2104": "272634",  # Mycoplasma pneumoniae
            "4530": "39947",  # Oryza sativa
            "5833": "36329",  # Plasmodium falciparum
            "4932": "559292",  # Saccharomyces cerevisiae
            }

    def __init__(self, organism, genematcher=None):
        """ A dictionary like object for accessing NCBI gene info
        Arguments::
                - *organism*    Organism id
                - *genematcher* Gene matcher to use; when None a default
                                NCBI based matcher is built

        Example::
            >>> info = NCBIGeneInfo("Homo sapiens")
        """

        self.taxid = self.organism_name_search(organism)

        fname = orngServerFiles.localpath_download("NCBI_geneinfo", "gene_info.%s.db" % self.taxid)
        with open(fname, "rb") as stream:
            self.update(dict(self._parse_lines(stream.read())))

        # NOTE orig init time for gene matcher: 2.5s, new 4s: investigate the slowdown
        # NOTE matches are not the same because aliases are build a bit
        # differently (main name versus old aliases conflict!)

        self.matcher = genematcher
        if self.matcher is None:
            if self.taxid == '352472':
                self.matcher = matcher([[GMNCBI(self.taxid), GMDicty()]])
            else:
                self.matcher = matcher([GMNCBI(self.taxid)])

        #if this is done with a gene matcher, pool target names
        self.matcher.set_targets(self.keys())

    @staticmethod
    def _parse_lines(text):
        """ Return (gene_id, line) pairs for the data lines in *text*.

        Blank lines and lines starting with "#" are skipped; the gene id is
        the second tab-separated column of the line.
        """
        return [(line.split("\t", 3)[1], line)
                for line in text.split("\n")
                if line.strip() and not line.startswith("#")]

    def history(self):
        """ Return a dict of GeneHistory records keyed by discontinued gene id.

        The history file is read on first use and cached.  Loading is best
        effort: on failure a message is written to stderr and an empty dict
        is cached instead.
        """
        if getattr(self, "_history", None) is None:
            import sys
            fname = orngServerFiles.localpath_download("NCBI_geneinfo", "gene_history.%s.db" % self.taxid)
            try:
                with open(fname, "rb") as stream:
                    lines = stream.read().split("\n")
                # Skip blank/comment lines: the empty string after the final
                # newline has no third column and used to abort the load.
                self._history = dict((line.split("\t")[2], GeneHistory(line))
                                     for line in lines
                                     if line.strip() and not line.startswith("#"))
            except Exception as ex:
                sys.stderr.write("Loading NCBI gene history failed. %s\n" % (ex,))
                self._history = {}
        return self._history

    @classmethod
    def organism_version(cls, name):
        """ Return the "datetime" entry of the server file info for the
        organism's gene_info file.
        """
        oname = cls.organism_name_search(name)
        #FIXME, dirty hack to ensure file id downloaded
        orngServerFiles.localpath_download("NCBI_geneinfo", "gene_info.%s.db" % oname)
        return orngServerFiles.info("NCBI_geneinfo", "gene_info.%s.db" % oname)["datetime"]

    @classmethod
    def organism_name_search(cls, org):
        """ Resolve *org* to a single taxonomy id (after TAX_MAP translation).

        Raises obiTaxonomy.UnknownSpeciesIdentifier when nothing matches and
        obiTaxonomy.MultipleSpeciesException when the match is ambiguous.
        """
        taxids = obiTaxonomy.to_taxid(org, mapTo=cls.common_taxids())
        if not taxids:
            taxids = obiTaxonomy.search(org, onlySpecies=False)
            taxids = set(cls.common_taxids()).intersection(taxids) #onlySpecies=False needed to find correct dicty
        if len(taxids) == 0:
            raise obiTaxonomy.UnknownSpeciesIdentifier(org)
        elif len(taxids) > 1:
            raise obiTaxonomy.MultipleSpeciesException(
                ", ".join(["%s: %s" % (id, obiTaxonomy.name(id)) for id in taxids]))
        taxid = taxids.pop()
        return cls.TAX_MAP.get(taxid, taxid)

    @classmethod
    def load(cls, file):
        """ A class method that loads gene info from file

        *file* is a file name or an open file object.  The instance is built
        without going through __init__ (which expects an organism and would
        download the organism's file): its taxid is None and it uses a
        direct gene matcher targeting the loaded gene ids.
        """
        opened_here = isinstance(file, basestring)
        if opened_here:
            file = open(file, "rb")
        try:
            pairs = cls._parse_lines(file.read())
        finally:
            if opened_here:
                file.close()

        info = cls.__new__(cls)
        dict.update(info, pairs)
        info.taxid = None
        info.matcher = GMDirect()
        info.matcher.set_targets(info.keys())
        return info

    def get_info(self, gene_id, def_=None):
        """ Search and return the GeneInfo object for gene_id

        Returns *def_* when the gene can not be matched.
        """
        try:
            return self(gene_id)
        except KeyError:
            return def_

    def __call__(self, name):
        """ Search and return the GeneInfo object for gene_id

        The name is resolved with the gene matcher; when there is no unique
        match the matcher returns None and the lookup raises KeyError.
        """
        id = self.matcher.umatch(name)
        return self[id]

    def __getitem__(self, key):
        """ Return the stored line for *key* parsed into a GeneInfo. """
        return GeneInfo(dict.__getitem__(self, key))

    def __setitem__(self, key, value):
        """ Store *value* for *key*; non-string values are stored as
        repr(value) (for a GeneInfo that is its gene_info line).
        """
        if isinstance(value, str):
            dict.__setitem__(self, key, value)
        else:
            dict.__setitem__(self, key, repr(value))

    def get(self, key, def_=None):
        """ Return the GeneInfo for *key*, or *def_* if the key is missing. """
        try:
            return self[key]
        except KeyError:
            return def_

    def itervalues(self):
        """ Iterate over the records as GeneInfo objects. """
        for val in dict.itervalues(self):
            yield GeneInfo(val)

    def iteritems(self):
        """ Iterate over (gene id, GeneInfo) pairs. """
        for key, val in dict.iteritems(self):
            yield key, GeneInfo(val)

    def values(self):
        return list(self.itervalues())

    def items(self):
        return list(self.iteritems())

    @staticmethod
    def _download_gunzipped(url, file):
        """ Download the gzip file at *url* and write its decompressed
        contents to *file* (a file name or a writable file object).

        The download is spooled to a temporary file first.  A file opened
        here from a name is closed again; a file object passed in by the
        caller is left open.
        """
        import urllib2, gzip, shutil, tempfile
        opened_here = isinstance(file, basestring)
        if opened_here:
            file = open(file, "wb")
        try:
            tmpfile = tempfile.TemporaryFile()
            try:
                shutil.copyfileobj(urllib2.urlopen(url), tmpfile)
                tmpfile.seek(0)
                shutil.copyfileobj(gzip.GzipFile(None, "rb", fileobj=tmpfile), file)
            finally:
                tmpfile.close()
        finally:
            if opened_here:
                file.close()

    @staticmethod
    def get_geneinfo_from_ncbi(file, progressCallback=None):
        """ Download and decompress the NCBI gene_info dump into *file*.

        *progressCallback* is accepted for interface compatibility but is
        not used.
        """
        NCBIGeneInfo._download_gunzipped(
            "ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz", file)

    @staticmethod
    def get_gene_history_from_ncbi(file, progressCallback=None):
        """ Download and decompress the NCBI gene_history dump into *file*.

        *progressCallback* is accepted for interface compatibility but is
        not used.
        """
        NCBIGeneInfo._download_gunzipped(
            "ftp://ftp.ncbi.nih.gov/gene/DATA/gene_history.gz", file)

    @classmethod
    def common_taxids(cls):
        """ Return obiTaxonomy.common_taxids() translated through TAX_MAP. """
        mapped = [cls.TAX_MAP.get(id, id) for id in obiTaxonomy.common_taxids()]
        return [taxid for taxid in mapped if taxid]

    @classmethod
    def essential_taxids(cls):
        """ Return obiTaxonomy.essential_taxids() translated through TAX_MAP. """
        mapped = [cls.TAX_MAP.get(id, id) for id in obiTaxonomy.essential_taxids()]
        return [taxid for taxid in mapped if taxid]
212
213"""
Gene matcher.

"Database" for each organism is a list of sets of gene aliases.
217"""
218
219from collections import defaultdict
220import os
221
# Directory returned by buffer_path(); when None, buffer_path() builds a
# "gene_matcher" directory under Orange's buffer directory instead.
gene_matcher_path = None
223
def ignore_case(gs):
    """ Return a list with one set per group in gs, holding the
    lower-cased names of that group. """
    lowered = []
    for group in gs:
        lowered.append(set(name.lower() for name in group))
    return lowered
227
def create_mapping(groups, lower=False):
    """
    Map every alias to the set of indices of the groups that contain it.

    With lower=True the lower-cased alias is used as the key.  The result
    is a defaultdict(set).

    Unpickling the results of this function (binary format)
    is slower than running it.

    TIMING NOTES (from the original author):
    - lower() costs are negligible (< 10%)
    - building sets instead of lists also costs about 10 percent
    """
    index_of = defaultdict(set)

    # The two loops are deliberately kept separate instead of calling a
    # per-alias normalizing function: the extra call is relatively expensive.
    if not lower:
        for position, members in enumerate(groups):
            for name in members:
                index_of[name].add(position)
    else:
        for position, members in enumerate(groups):
            for name in members:
                index_of[name.lower()].add(position)

    return index_of
253
def join_sets(set1, set2, lower=False):
    """
    Join two lists of alias groups.  If lower is True, lower case forms of
    the aliases are compared.

    For every group g2 of set2 and every group g1 of set1 that shares at
    least one alias with it, the union g1 | g2 is added to the result.
    Groups of either input that took part in no union are appended
    unchanged (set1 leftovers first, then set2 leftovers).

    NOTE(review): the original documentation states that the operation is
    both commutative and associative; this is not verified here.
    """
    left = [set(members) for members in set1]
    right = [set(members) for members in set2]

    lookup = create_mapping(left, lower=lower)

    if lower:
        normalize = lambda name: name.lower()
    else:
        normalize = lambda name: name

    joined = []
    matched_left = set()   # indices of set1 groups used in a union
    matched_right = set()  # indices of set2 groups used in a union

    for right_index, members in enumerate(right):
        # indices of all set1 groups sharing an alias with this set2 group
        hits = [lookup[normalize(name)] for name in members
                if normalize(name) in lookup]
        crossing = reduce(set.union, hits, set())

        for left_index in crossing:
            matched_left.add(left_index)
            matched_right.add(right_index)
            joined.append(members | left[left_index])

    # groups without any match are carried over as they are
    joined.extend(left[index] for index in set(range(len(left))) - matched_left)
    joined.extend(right[index] for index in set(range(len(right))) - matched_right)

    return joined
305 
def join_sets_l(lsets, lower=False):
    """
    Fold join_sets over a list of lists of alias groups, left to right,
    starting from lsets[0].
    """
    return reduce(lambda merged, groups: join_sets(merged, groups, lower=lower),
                  lsets[1:], lsets[0])
314
class Matcher(object):
    """
    Gene matcher tries to match an input gene to some target.

    This is the abstract base: subclasses implement set_targets, match
    and (optionally) explain; umatch is built on top of match.
    """

    def set_targets(self, targets):
        """
        Set input list of gene names as targets.
        Abstract function.
        """
        raise NotImplementedError("set_targets must be implemented by a subclass")

    def match(self, gene):
        """Returns a list of matching target gene names. Abstract function."""
        raise NotImplementedError("match must be implemented by a subclass")

    def umatch(self, gene):
        """Returns an unique (only one matching target) target or None"""
        mat = self.match(gene)
        return mat[0] if len(mat) == 1 else None

    def explain(self, gene):
        """
        Returns gene matches with explanations as a list of tuples.
        Each tuple consists of a list of target genes in a set
        of aliases matched to input gene, and that set of aliases as
        the second part of the tuple.
        Abstract function.
        """
        raise NotImplementedError("explain must be implemented by a subclass")
344
def buffer_path():
    """ Returns buffer path from Orange's setting folder if not
    defined differently (in gene_matcher_path).

    The default is the "gene_matcher" directory under Orange's buffer
    directory; an attempt is made to create it.
    """
    if gene_matcher_path is not None:
        return gene_matcher_path

    import orngEnviron
    pth = os.path.join(orngEnviron.directoryNames["bufferDir"],
        "gene_matcher")
    try:
        os.makedirs(pth)
    except OSError:
        # Best effort: makedirs fails when the directory already exists.
        # Other OS errors are ignored as before and will surface when the
        # path is actually used.
        pass
    return pth
359
def auto_pickle(filename, version, func, *args, **kwargs):
    """
    Run function func with given arguments and save the results to
    a file named filename. If results for a given filename AND
    version were already saved, just read and return them.

    If version is None, any saved version is accepted.  A cache file that
    is missing, unreadable, truncated or of a different version is ignored
    and overwritten with the freshly computed result.

    NOTE: the cache is read with pickle, so filename must not point to
    untrusted data.
    """

    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    output = None
    outputOk = False

    try:
        with open(filename, 'rb') as f:
            versionF = pickle.load(f)
            if version is None or versionF == version:
                output = pickle.load(f)
                # Only mark the cache as usable once the payload itself was
                # read; a truncated file must trigger a recomputation
                # instead of returning None.
                outputOk = True
    except Exception:
        # Any problem with the cache just means it has to be rebuilt.
        pass

    if not outputOk:
        output = func(*args, **kwargs)

        #save output before returning
        with open(filename, 'wb') as f:
            pickle.dump(version, f, -1)
            pickle.dump(output, f, -1)

    return output
398
class MatcherAliases(Matcher):
    """
    Genes matcher based on a list of sets of given aliases.

    Target genes belonging to same sets of aliases as the input gene are
    returned as matching genes.

    """
    def __init__(self, aliases, ignore_case=True):
        """
        aliases - list of groups (sets) of gene aliases
        ignore_case - if True, aliases and genes are compared lower-cased
        """
        self.aliases = aliases
        self.ignore_case = ignore_case
        self.mdict = create_mapping(self.aliases, self.ignore_case)

    def to_ids(self, gene):
        """ Return ids of sets of aliases the gene belongs to
        (an empty set for an unknown gene). """
        if self.ignore_case:
            gene = gene.lower()
        # Look up with get(): mdict is a defaultdict, so indexing it would
        # insert an empty entry for every unknown gene that is queried.
        return self.mdict.get(gene, set())

    def set_targets(self, targets):
        """
        A reverse dictionary is made according to each target's membership
        in the sets of aliases.
        """
        d = defaultdict(list)
        for target in targets:
            ids = self.to_ids(target)
            if ids is not None:
                for id in ids:
                    d[id].append(target)
        self.to_targets = d

    def match(self, gene):
        """
        Input gene is first mapped to ids of sets of aliases which contain
        it. Target genes belonging to the same sets of aliases are returned
        as input's match (a list without duplicates, in no particular order).
        """
        matched = set()
        for igid in self.to_ids(gene):
            matched.update(self.to_targets.get(igid, ()))
        return list(matched)

    def explain(self, gene):
        """
        Return a list of (targets, aliases) tuples, one for each set of
        aliases that contains the input gene.
        """
        return [ (self.to_targets.get(igid, []), self.aliases[igid])
                 for igid in self.to_ids(gene) ]
446
class MatcherAliasesPickled(MatcherAliases):
    """
    Gene matchers based on sets of aliases supporting pickling should
    extend this class. Subclasses must define functions "filename",
    "create_aliases_version" and "create_aliases". Those are crucial for
    pickling of gene aliases to work.

    Loading of gene aliases is done lazily: they are loaded when they are
    needed. Loading of aliases for components of joined matchers is often
    unnecessary and is therefore avoided.
    """

    def set_aliases(self, aliases):
        self.saved_aliases = aliases

    def get_aliases(self):
        """ Return the aliases, loading them first if none are stored. """
        if not self.saved_aliases: #loads aliases if not loaded
            self.aliases = self.load_aliases()
        return self.saved_aliases

    aliases = property(get_aliases, set_aliases)

    def get_mdict(self):
        """ Creates mdict. Aliases are loaded if needed. """
        if not self.saved_mdict:
            self.saved_mdict = create_mapping(self.aliases, self.ignore_case)
        return self.saved_mdict

    def set_mdict(self, mdict):
        self.saved_mdict = mdict

    mdict = property(get_mdict, set_mdict)

    def set_targets(self, targets):
        MatcherAliases.set_targets(self, targets)

    def filename(self):
        """ Returns file name for saving aliases. Abstract function.

        A plain name is placed in buffer_path(); a tuple means that its
        first element is used as the path directly; None disables pickling.
        """
        raise NotImplementedError("filename must be implemented by a subclass")

    def create_aliases_version(self):
        """ Returns version of the source database state. Abstract function. """
        raise NotImplementedError("create_aliases_version must be implemented by a subclass")

    def create_aliases(self):
        """ Returns gene aliases. Abstract function. """
        raise NotImplementedError("create_aliases must be implemented by a subclass")

    def load_aliases(self):
        """ Return the aliases, through the pickle cache when a file name
        is available. """
        fn = self.filename()
        ver = self.create_aliases_version() #if version == None ignore it
        if fn is not None:
            if isinstance(fn, tuple): #if you pass tuple, look directly
                filename = fn[0]
            else:
                filename = os.path.join(buffer_path(), fn)
            return auto_pickle(filename, ver, self.create_aliases)
        else:
            #without a file name there is nowhere to pickle to
            return self.create_aliases()

    def __init__(self, ignore_case=True):
        # Empty values mark "not loaded yet" for the lazy properties above.
        self.aliases = []
        self.mdict = {}
        self.ignore_case = ignore_case
        self.filename() # test if valid filename can be built
514
class MatcherAliasesKEGG(MatcherAliasesPickled):
    """ Pickled gene matcher with aliases taken from obiKEGG gene data. """

    def __init__(self, organism, ignore_case=True):
        self.organism = organism
        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)

    def _organism_name(self, organism):
        """ Return obiKEGG.organism_name_search(organism). """
        import obiKEGG
        return obiKEGG.organism_name_search(organism)

    def filename(self):
        return "kegg_" + self._organism_name(self.organism)

    def create_aliases_version(self):
        import obiKEGG
        kegg_version = obiKEGG.KEGGOrganism.organism_version(self.organism)
        return kegg_version + ".1"

    def create_aliases(self):
        """ Return one set per gene: the gene name plus its alt_names. """
        # The organism name is looked up first, as in the original code;
        # its result is not used.
        self._organism_name(self.organism)
        import obiKEGG
        kegg = obiKEGG.KEGGOrganism(self.organism, genematcher=GMDirect())
        groups = []
        for name, entry in kegg.genes.items():
            groups.append(set([name]) | set(entry.alt_names))
        return groups
540
class MatcherAliasesFile(MatcherAliasesPickled):
    """
    Gene matcher whose aliases are read from an existing pickle file.

    The file is used directly (not relative to buffer_path()) and its
    stored version is not checked.  The aliases can only be opened, never
    created.
    """

    def create_aliases(self):
        """ Always fails: called only when the aliases file could not be
        read, and there is no source to rebuild the aliases from. """
        raise IOError("Gene aliases file %r could not be read and can not be created."
                      % (self.filename_,))

    def create_aliases_version(self):
        # None makes auto_pickle accept whatever version is stored.
        return None

    def filename(self):
        # A tuple tells load_aliases to use the path as given.
        return (self.filename_,)

    def __init__(self, filename, ignore_case=True):
        self.filename_ = filename
        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
555
556
class MatcherAliasesGO(MatcherAliasesPickled):
    """ Pickled gene matcher with aliases taken from obiGO annotations. """

    def __init__(self, organism, ignore_case=True):
        self.organism = organism
        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)

    def _organism_name(self, organism):
        """ Returns internal GO organism name. Used to define file name.

        Note that the *organism* argument is ignored; self.organism is
        what gets looked up.
        """
        import obiGO
        return obiGO.organism_name_search(self.organism)

    def filename(self):
        return "go_" + self._organism_name(self.organism)

    def create_aliases_version(self):
        import obiGO
        return "v2." + obiGO.Annotations.organism_version(self.organism)

    def create_aliases(self):
        """ Return the distinct alias groups, each made of a key of
        geneNamesDict together with the names it maps to. """
        import obiGO
        annotations = obiGO.Annotations(self.organism, genematcher=GMDirect())
        # Sorted tuples are hashable, so a set removes duplicate groups.
        distinct = set([tuple(sorted(set([name]) | set(genes)))
                        for name, genes in annotations.geneNamesDict.items()])
        return [set(group) for group in distinct]
582
class MatcherAliasesDictyBase(MatcherAliasesPickled):
    """ Pickled gene matcher with aliases taken from obiDicty.DictyBase. """

    def __init__(self, ignore_case=True):
        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)

    def filename(self):
        return "dictybase"

    def create_aliases_version(self):
        import obiDicty
        return "v1." + obiDicty.DictyBase.version()

    def create_aliases(self):
        """ Join the groups built from db.info (id, name and aliases) with
        the groups in db.mappings, comparing names case-insensitively. """
        import obiDicty
        db = obiDicty.DictyBase()
        from_info = []
        for id, (name, aliases, _) in db.info.items():
            from_info.append(set([id, name]) | set(aliases))
        # empty/None entries of a mapping are dropped
        from_mappings = [set(entry for entry in mapping if entry)
                         for mapping in db.mappings]
        return join_sets(from_info, from_mappings, lower=True)
603
class MatcherAliasesNCBI(MatcherAliasesPickled):
    """ Pickled gene matcher with aliases taken from NCBIGeneInfo. """

    def _organism_name(self, organism):
        """ Return the taxonomy id NCBIGeneInfo uses for *organism*. """
        return NCBIGeneInfo.organism_name_search(organism)

    def create_aliases(self):
        """ Return one set per gene holding its id, symbol, locus tag and
        synonyms (missing values are left out). """
        ncbi = NCBIGeneInfo(self.organism, genematcher=GMDirect())
        out = []
        for k in ncbi.keys():
            # Every ncbi[k] parses the stored line into a new GeneInfo,
            # so fetch the record only once per gene.
            info = ncbi[k]
            names = [k, info.symbol, info.locus_tag] + list(info.synonyms)
            out.append(set(name for name in names if name))
        return out

    def filename(self):
        return "ncbi_" + self._organism_name(self.organism)

    def create_aliases_version(self):
        return "v2." + NCBIGeneInfo.organism_version(self.organism)

    def __init__(self, organism, ignore_case=True):
        self.organism = organism
        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
625       
class MatcherAliasesAffy(MatcherAliasesPickled):
    """ Pickled gene matcher whose aliases come from the "Affy" server
    files domain (file "<organism>.pickle"). """

    def create_aliases(self):
        """ Return the aliases unpickled from the downloaded server file. """
        filename = orngServerFiles.localpath_download("Affy", self.organism + ".pickle")
        import cPickle
        # NOTE(review): this unpickles a file downloaded from the server;
        # pickle must only be used on trusted data.
        with open(filename, "rb") as f:
            return cPickle.load(f)

    def filename(self):
        return "affy_" + self.organism

    def create_aliases_version(self):
        # The download call is made first so that the file is present
        # locally before its info is read.
        orngServerFiles.localpath_download("Affy", self.organism + ".pickle")
        return orngServerFiles.info("Affy", self.organism + ".pickle")["datetime"]

    def __init__(self, organism, **kwargs):
        self.organism = organism
        MatcherAliasesPickled.__init__(self, **kwargs)
642
class MatcherAliasesPickledJoined(MatcherAliasesPickled):
    """
    Creates a new matcher by joining gene aliases from different data sets.
    Sets of aliases are joined if they contain common genes.

    The joined gene matcher can only be pickled if the source gene
    matchers are picklable.
    """

    def filename(self):
        """ Return the file names of the joined matchers glued with "__"
        (plus "icj" when ignoring case), or None if any of them can not
        provide one. """
        # do not pickle if any is unpicklable
        try:
            filenames = [ mat.filename() for mat in self.matchers ]
            if self.ignore_case:
                filenames += [ "icj" ]
            return "__".join(filenames)
        except Exception:
            return None

    def create_aliases(self):
        return join_sets_l([ mat.aliases for mat in self.matchers ], lower=self.ignore_case)

    def create_aliases_version(self):
        """ Return the combined version of the joined matchers, or None
        if it can not be built (e.g. one of them has no version). """
        try:
            return "v4_" + "__".join([ mat.create_aliases_version() for mat in self.matchers ])
        except Exception:
            return None

    def __init__(self, matchers):
        """
        Join matchers together. Groups of aliases are joined if
        they share a common name.

        Case is ignored when joining gene aliases if the joined matchers
        ignore case; all of them must agree on that setting.

        Raises ValueError if no matchers are given or if their ignore_case
        settings differ.
        """
        #FIXME: sorting of matchers to avoid multipying pickled files for
        #different orderings.
        self.matchers = matchers
        allic = set([ m.ignore_case for m in self.matchers ])
        if not allic:
            raise ValueError("At least one matcher is required for joining.")
        if len(allic) > 1:
            raise ValueError("All joined matchers must have the same ignore_case setting.")
        ignore_case = list(allic)[0]

        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
687       
class MatcherSequence(Matcher):
    """
    A chain of gene matchers.

    The matchers are tried in the order given by the user; the first one
    that produces a non-empty match for a gene decides the result.
    """

    def __init__(self, matchers):
        self.matchers = matchers

    def set_targets(self, targets):
        """ Pass the same targets on to every matcher of the chain. """
        for member in self.matchers:
            member.set_targets(targets)

    def match(self, gene):
        """ Return the first non-empty match in the chain, or []. """
        for member in self.matchers:
            found = member.match(gene)
            if found:
                return found
        return []

    def explain(self, gene):
        """ Return the explanation of the first matcher that matches the
        gene, or [] if none does. """
        for member in self.matchers:
            if member.match(gene):
                return member.explain(gene)
        return []
716
class MatcherDirect(Matcher):
    """
    Direct matching to targets.

    Every target forms its own single-element set of aliases, so a gene
    matches exactly the targets equal to it (case-insensitively when
    ignore_case is True).  set_targets must be called before match or
    explain.
    """

    def __init__(self, ignore_case=True):
        self.ignore_case = ignore_case

    def set_targets(self, targets):
        aliases = [ set([a]) for a in targets]
        self.am = MatcherAliases(aliases, ignore_case=self.ignore_case)
        self.am.set_targets(targets)

    def match(self, gene):
        return self.am.match(gene)

    def explain(self, gene):
        # Delegated like match(); without it MatcherSequence.explain fails
        # on the abstract Matcher.explain whenever this matcher is the one
        # that matches.
        return self.am.explain(gene)
732               
# Short aliases for the matcher classes defined above.
GMDirect = MatcherDirect
GMKEGG = MatcherAliasesKEGG
GMGO = MatcherAliasesGO
GMNCBI = MatcherAliasesNCBI
GMDicty = MatcherAliasesDictyBase
GMAffy = MatcherAliasesAffy
739
def issequencens(x):
    """ Return True if x supports indexing (has __getitem__) and is not
    a string. """
    if not hasattr(x, '__getitem__'):
        return False
    return not isinstance(x, basestring)
742
def matcher(matchers, direct=True, ignore_case=True):
    """
    Build a MatcherSequence from a sequence of matchers. An element that
    is itself a (non-string) sequence is replaced by the join of the
    matchers it contains.

    direct - if True, a direct matcher to targets is put first in the chain
    ignore_case - if True, the optionally added direct matcher ignores case
    """
    chain = [MatcherDirect(ignore_case=ignore_case)] if direct else []
    for item in matchers:
        if issequencens(item):
            item = MatcherAliasesPickledJoined(list(item))
        chain.append(item)
    return MatcherSequence(chain)
761
# Ad-hoc manual test of the gene matchers; it needs the project modules
# and the data files named below.
if __name__ == '__main__':

    #m1 = matcher([[GMNCBI('44689'), GMDicty()]])
    #print m1.matchers[1].aliases[:100]

    #m2 = GMNCBI('Dictyostelium discoideum')
    #print m2.aliases



    """
    gi = info(list(info)[0])
    print gi.tax_id, gi.synonyms, gi.dbXrefs, gi.symbol_from_nomenclature_authority, gi.full_name_from_nomenclature_authority
    """

    # Do I get the right things with the join?

    import time
    import obiGeneSets

    def testsets():
        # gene set collections used as the source of genes to match
        return obiGeneSets.collections([":kegg:hsa", ":go:hsa"])

    def names1():
        # attribute names of the DLBCL data set
        import orange
        data = orange.ExampleTable("DLBCL.tab")
        return [ a.name for a in  data.domain.attributes ]

    def namesd():
        # values of the "gene" attribute of the dd_ge_biorep1 data set
        import orange
        data = orange.ExampleTable("dd_ge_biorep1.tab")
        names = [ ex["gene"].value for ex in data ]
        return names

    # The inputs are cached in pickle files in the current directory.
    genesets = auto_pickle("testcol", "3", testsets)
    names = auto_pickle("testnames", "4", names1)
    names2 = auto_pickle("testnamesdicty", "4", namesd)

    info = NCBIGeneInfo('Dictyostelium discoideum')
    for a in names2:
        print a
        info.get_info(a)

    # Time the construction of the matchers.
    t = time.time()
    mat5 = matcher([[GMKEGG('human'),GMGO('human')]], direct=False, ignore_case=True)
    mat7 = GMDicty()
    mat8 = GMNCBI('Homo sapiens')
    print "initialized all", time.time()-t

    print "using targets"

    mat5.set_targets(names)
    mat7.set_targets(names)
    mat8.set_targets(names)

    print "before genes"
    # union of the genes of the first 1000 gene sets
    genes = reduce(set.union, genesets.values()[:1000], set())
    genes = list(genes)
    print genes[:20]
    print len(genes)
    print "after genes"

    for g in sorted(genes):
        print "KGO ", g, mat5.match(g), mat5.explain(g)
        print "DICT", g, mat7.match(g)
        print "NCBI", g, mat8.match(g)
829
Note: See TracBrowser for help on using the repository browser.