source: orange-bioinformatics/_bioinformatics/obiGene.py @ 1735:50499d1dc55a

Revision 1735:50499d1dc55a, 32.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Changed the Organism.gene_aliases method.

RevLine 
[1632]1from __future__ import absolute_import
[716]2
[1632]3import os, time
4
[1643]5from Orange.orng import orngServerFiles
[1632]6
7from . import obiTaxonomy
[1716]8from . import obiKEGG
[782]9
[716]10default_database_path = orngServerFiles.localpath("NCBI_geneinfo")
11
12class GeneInfo(object):
[718]13    """ An object representing the NCBI information for a gene.
14    """
[753]15
16    NCBI_GENEINFO_TAGS = ("tax_id", "gene_id", "symbol", "locus_tag", "synonyms",
17                          "dbXrefs", "chromosome", "map_location", "description", "type",
18                          "symbol_from_nomenclature_authority", "full_name_from_nomenclature_authority",
19                          "nomenclature_status", "other_designations", "modification_date")
20    NCBI_MULTIPLE_CARDINALITY_TAGS = ("synonyms", "dbXrefs", "other_designations")
21   
22    __slots__ = NCBI_GENEINFO_TAGS
[716]23    def __init__(self, line):
[718]24        """ Construct the GeneInfo object from a line in the NCBI gene_info file
25        """
[716]26        line = line.split("\t")
27        for attr, value in zip(self.__slots__, line):
28            if value == "-":
29                value = None
[753]30            if attr in GeneInfo.NCBI_MULTIPLE_CARDINALITY_TAGS:
[716]31                value = value.split("|") if value != None else []
32            setattr(self, attr, value)
33
34    def __repr__(self):
35        def format(value):
36            if not value:
37                return "-"
38            elif type(value) == list:
39                return "|".join(value)
40            else:
41                return value
42        return "\t".join(format(getattr(self, slot)) for slot in self.__slots__)
43
44    def __str__(self):
45        return repr(self)
46
[1071]47class GeneHistory(object):
48    NCBI_GENE_HISTORY_TAGS = ("tax_id", "gene_id", "discontinued_gene_id", "discontinued_symbol", "discontinue_date")
49    __slots__ = NCBI_GENE_HISTORY_TAGS
50    def __init__(self, line):
51        for attr, value in zip(self.__slots__, line.split("\t")):
52            setattr(self, attr, value)
53           
54           
[716]55class NCBIGeneInfo(dict):
[1183]56    TAX_MAP = {
57            "2104": "272634",  # Mycoplasma pneumoniae
58            "4530": "39947",  # Oryza sativa
59            "5833": "36329",  # Plasmodium falciparum
60            "4932": "559292",  # Saccharomyces cerevisiae
61            }
[1071]62       
[782]63    def __init__(self, organism, genematcher=None):
[718]64        """ An dictionary like object for accessing NCBI gene info
65        Arguments::
[781]66                - *organism*    Organism id
[718]67
68        Example::
69            >>> info = NCBIGeneInfo("Homo sapiens")
[716]70        """
[781]71       
72        self.taxid = self.organism_name_search(organism)
[784]73
74
[781]75        fname = orngServerFiles.localpath_download("NCBI_geneinfo", "gene_info.%s.db" % self.taxid)
76        file = open(fname, "rb")
[1682]77        self.update(dict([(line.split("\t", 3)[1], line) for line in file.read().splitlines() if line.strip() and not line.startswith("#")]))
[780]78
[782]79        # NOTE orig init time for gene matcher: 2.5s, new 4s: investigate the slowdown
80        # NOTE matches are not the same because aliases are build a bit
81        # differently (main name versus old aliases conflict!)
82
[781]83        self.matcher = genematcher
84        if self.matcher == None:
[784]85            if self.taxid == '352472':
[1333]86                self.matcher = matcher([GMNCBI(self.taxid), GMDicty(), [GMNCBI(self.taxid), GMDicty()]])
[783]87            else:
88                self.matcher = matcher([GMNCBI(self.taxid)])
[781]89
[780]90        #if this is done with a gene matcher, pool target names
[781]91        self.matcher.set_targets(self.keys())
[1071]92       
93    def history(self):
94        if getattr(self, "_history", None) is None:
95            fname = orngServerFiles.localpath_download("NCBI_geneinfo", "gene_history.%s.db" % self.taxid)
96            try:
[1682]97                self._history = dict([(line.split("\t")[2], GeneHistory(line)) for line in open(fname, "rb").read().splitlines()])
[1071]98               
99            except Exception, ex:
100                print >> sys.srderr, "Loading NCBI gene history failed.", ex
101                self._history = {}
102        return self._history
103       
[781]104    @classmethod
105    def organism_version(cls, name):
106        oname = cls.organism_name_search(name)
[782]107        #FIXME, dirty hack to ensure file id downloaded
108        orngServerFiles.localpath_download("NCBI_geneinfo", "gene_info.%s.db" % oname) 
[781]109        return orngServerFiles.info("NCBI_geneinfo", "gene_info.%s.db" % oname)["datetime"]
110
111    @classmethod
112    def organism_name_search(cls, org):
[1183]113        taxids = obiTaxonomy.to_taxid(org, mapTo=cls.common_taxids())
[781]114        if not taxids:
[1183]115            taxids = obiTaxonomy.search(org, onlySpecies=False)
116            taxids = set(cls.common_taxids()).intersection(taxids) #onlySpecies=False needed to find correct dicty
[781]117        if len(taxids) == 0:
118            raise obiTaxonomy.UnknownSpeciesIdentifier, org
119        elif len(taxids) > 1:
120            raise obiTaxonomy.MultipleSpeciesException, ", ".join(["%s: %s" % (id, obiTaxonomy.name(id)) for id in taxids])
[1183]121        taxid = taxids.pop()
122        return cls.TAX_MAP.get(taxid, taxid)
[780]123
[716]124    @classmethod   
125    def load(cls, file):
[718]126        """ A class method that loads gene info from file
127        """
[716]128        if type(file) in [str, unicode]:
129            file = open(file, "rb")
[1682]130        return cls((line.split("\t", 3)[1], line) for line in file.read().splitlines() if line.strip() and not line.startswith("#"))
[716]131       
[732]132    def get_info(self, gene_id, def_=None):
[718]133        """ Search and return the GeneInfo object for gene_id
134        """
[732]135        try:
136            return self(gene_id)
137        except KeyError:
138            return def_
[716]139       
140    def __call__(self, name):
[718]141        """ Search and return the GeneInfo object for gene_id
142        """
[782]143        #id = self.translate.get(name, name)
[785]144        #print self.matcher.umatch(name), self.matcher.match(name)
[782]145        id = self.matcher.umatch(name)
[732]146        return self[id]
[716]147
148    def __getitem__(self, key):
[719]149#        return self.get(gene_id, self.matcher[gene_id])
[716]150        return GeneInfo(dict.__getitem__(self, key))
151
152    def __setitem__(self, key, value):
153        if type(value) == str:
154            dict.__setitem__(self, key, value)
155        else:
156            dict.__setitem__(self, key, repr(value))
157
158    def get(self, key, def_=None):
159        try:
160            return self[key]
161        except KeyError:
162            return def_
163
164    def itervalues(self):
165        for val in dict.itervalues(self):
166            yield GeneInfo(val)
167
168    def iteritems(self):
169        for key, val in zip(self.iterkeys(), self.itervalues()):
170            yield key, val
171
172    def values(self):
173        return list(self.itervalues())
174   
175    def items(self):
176        return list(self.iteritems())
177
178    @staticmethod
[787]179    def get_geneinfo_from_ncbi(file, progressCallback=None):
180        import urllib2, gzip, shutil, tempfile
[716]181        from cStringIO import StringIO
[1071]182        if isinstance(file, basestring):
[787]183            file = open(file, "wb")
[716]184       
[787]185        stream = urllib2.urlopen("ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz")
186        tmpfile = tempfile.TemporaryFile()
187        shutil.copyfileobj(stream, tmpfile)
188        tmpfile.seek(0)
189        stream = gzip.GzipFile(None, "rb", fileobj=tmpfile)
190        shutil.copyfileobj(stream, file)
[1071]191       
192    @staticmethod
193    def get_gene_history_from_ncbi(file, progressCallback=None):
194        import urllib2, gzip, shutil, tempfile
195        from cStringIO import StringIO
196        if isinstance(file, basestring):
197            file = open(file, "wb")
198       
199        stream = urllib2.urlopen("ftp://ftp.ncbi.nih.gov/gene/DATA/gene_history.gz")
200        tmpfile = tempfile.TemporaryFile()
201        shutil.copyfileobj(stream, tmpfile)
202        tmpfile.seek(0)
203        stream = gzip.GzipFile(None, "rb", fileobj=tmpfile)
204        shutil.copyfileobj(stream, file)
[1183]205       
206    @classmethod
207    def common_taxids(cls):
208        taxids = obiTaxonomy.common_taxids()
209        return [cls.TAX_MAP.get(id, id) for id in taxids if cls.TAX_MAP.get(id, id)]
210   
211    @classmethod
212    def essential_taxids(cls):
213        taxids = obiTaxonomy.essential_taxids()
214        return [cls.TAX_MAP.get(id, id) for id in taxids if cls.TAX_MAP.get(id, id)]
[734]215
[1468]216
217class EnsembleGeneInfo(object):
218#    BIO_MART_DATASETS = {"9606": "hsapiens"}
219    DEF_ATTRS = ["ensembl_gene_id", "external_gene_id", "entrezgene"]
220    BIOMART_CONF = {"9606": ("hsapiens_gene_ensembl", DEF_ATTRS) # + ["hgnc_symbol"])
221                    }
222    def __init__(self, organism, gene_matcher=None):
223        self.organism = self.organism_name_search(organism) 
224        self.load()
225        if gene_matcher is None:
226            self.gene_matcher = matcher([GMEnsembl(self.organism), GMNCBI(self.organism)])
227        else:
228            self.gene_matcher = gene_matcher
229       
230    @classmethod
231    def organism_name_search(cls, name):
232        taxids = obiTaxonomy.to_taxid(name, mapTo=cls.common_taxids())
233        if len(taxids) == 1:
234            return taxids[0]
235        else:
236            raise ValueError(name)
237       
238    @classmethod
239    def common_taxids(cls):
240        return ["3702", "9913", "6239", "7955", "9606", "7227",
241                "10090", "10116", "4932", "4896", "8355"]
242   
243    @classmethod
244    def version(cls):
245        return "v1.0"
246       
247    def filename(self):
248        return "ensembl_" + self.organism
249   
250    def create_info(self):
[1632]251        from . import obiBioMart
[1468]252        if self.organism in self.BIOMART_CONF:
253            dset_name, attrs = self.BIOMART_CONF[self.organism]
254        else:
255            dset_name, attrs = self.default_biomart_conf(self.organism)
256       
257        dataset = obiBioMart.BioMartDataset("ensembl", dset_name)
258        table = dataset.get_example_table(attributes=attrs, unique=True)
259        print len(table)
260        table.save(dset_name + ".tab")
261        from collections import defaultdict
262        names = defaultdict(set)
263        for ex in table:
264            row = [str(v) for v in ex if not v.isSpecial() and str(v)]
265            names[row[0]].update(row)
266        return names
267       
268    def default_biomart_conf(self, taxid):
269        name = obiTaxonomy.name(self.organism).lower()
270        name1, name2 = name.split(" ")[: 2]
271        dset_name = name1[0] + name2 + "_gene_ensembl"
272        return dset_name, self.DEF_ATTRS
273   
274    def load(self):
275        import cPickle
276        dir = orngServerFiles.localpath("EnsembleGeneInfo")
277        if not os.path.exists(dir):
278            os.makedirs(dir)
279       
280        try:
281            filename = orngServerFiles.localpath_download("EnsembleGeneInfo", self.filename())
282            info = cPickle.load(open(filename, "rb"))
283        except Exception, ex:   
284            filename = orngServerFiles.localpath("EnsembleGeneInfo", self.filename())
285            info = self.create_info()
286            cPickle.dump(info, open(filename, "wb"))
287           
288        self.info = info
289       
290    def __getitem__(self, key):
291        return self.info[key]
292    def __contains__(self, key):
293        return key in self.info
294    def __len__(self):
295        return len(self.info)
296    def __iter__(self):
297        return iter(self.info)
298    def keys(self):
299        return self.info.keys()
300    def values(self):
301        return self.info.values()
302    def items(self):
303        return self.info.items()
304    def get(self, key, default=None):
305        return self.info.get(key, default)
306   
307    def genes(self):
308        return self.info.keys()
309   
310    def aliases(self):
311        return [(key,) + tuple(value) for key, value in self.items()]
312   
313    def ensembl_id(self, name):
314        return self.gene_matcher.umatch(name)
315       
[734]316"""
317Gene matcher.
318
[1438]319"Database" for each organism is a list of sets of gene aliases.
[734]320"""
321
322from collections import defaultdict
323import os
324
[751]325gene_matcher_path = None
[734]326
327def ignore_case(gs):
328    """ Transform names in sets in list to lower case """
329    return [ set([a.lower() for a in g]) for g in gs ]
330
[757]331def create_mapping(groups, lower=False):
[734]332    """
[775]333    Returns mapping of aliases to the group index. If lower
334    is True, lower case forms of gene aliases are mapped to indices.
[734]335
336    Unpickling the results of this function (binary format)
337    is slower than running it.
[757]338
339    TIMING NOTES:
340    - lower() costs are neglible (< 10%)
341    - building sets instead of lists also costs about 10 percent
[734]342    """
[757]343    togroup = defaultdict(set)
344
345    # code duplicated because running a function in relatively expensive here.
346    if lower: 
347        for i,group in enumerate(groups):
348            for alias in group:
349                togroup[alias.lower()].add(i)
350    else:
351        for i,group in enumerate(groups):
352            for alias in group:
353                togroup[alias].add(i)
354
[734]355    return togroup
356
[775]357def join_sets(set1, set2, lower=False):
[734]358    """
[775]359    Joins two sets of gene set mappings. If lower is True, lower case
360    forms of gene aliases are compared.
[734]361
362    A group g1 from set1 is joined to a group of aliases g2 from set2,
[751]363    if the groups share at least one gene.
[755]364    Returns all joined groups and groups that were not matched, which
365    remain unchanged.
[734]366
[751]367    The operation both commutative and associative.
[734]368    """
369
[997]370    set1 = [ set(a) for a in set1 ]
371    set2 = [ set(a) for a in set2 ]
372
373    currentmap = create_mapping(set1, lower=lower)
[734]374
375    new = [] #new groups
376
377    #remember used to find unused
378    set1used = set() 
379    set2used = set()
380
[775]381    fn = lambda x: x
382    if lower:
383        fn = lambda x: x.lower()
384
[734]385    for i, group in enumerate(set2):
386
387        #find groups of aliases (from set1)  intersecting with a current
388        #group from set2
[757]389        cross = reduce(set.union, 
[775]390            [ currentmap[fn(alias)] for alias in group if fn(alias) in currentmap], set())
[734]391
392        for c in cross:
393            #print c, group & set1[c], group, set1[c]
394            set1used.add(c)
395            set2used.add(i)
396            new.append(group | set1[c]) #add a union
397
398    #add groups without matches (from both sets)
399    set1add = set(range(len(set1))) - set1used
400    set2add = set(range(len(set2))) - set2used
401
402    for a in set1add:
403        new.append(set1[a])
404    for a in set2add:
405        new.append(set2[a])
406
407    return new
408 
[775]409def join_sets_l(lsets, lower=False):
[734]410    """
[751]411    Joins multiple gene set mappings using join_sets function.
[734]412    """
413    current = lsets[0]
414    for b in lsets[1:]:
[775]415        current = join_sets(current, b, lower=lower)
[734]416    return current
417
418class Matcher(object):
[751]419    """
420    Gene matcher tries to match an input gene to some target.
421    """
[734]422
[1578]423    def copy(self):
424        return notImplemented()
425
426    def __call__(self, targets):
427        return self.set_targets(targets)
428
[751]429    def set_targets(self, targets):
[734]430        """
[751]431        Set input list of gene names as targets.
[734]432        Abstract function.
433        """
434        notImplemented()
435
436    def match(self, gene):
[751]437        """Returns a list of matching target gene names."""
[734]438        notImplemented()
439
[756]440    def umatch(self, gene):
441        """Returns an unique (only one matching target) target or None"""
442        mat = self.match(gene)
443        return mat[0] if len(mat) == 1 else None
444
[815]445    def explain(self, gene):
446        """
447        Returns an gene matches with explanations as lists of tuples.
448        Each tuple consists of a list of target genes in a set
449        of aliases matched to input gene, returned as a second part
450        of the tuple.
451        """
452        notImplemented()
453
[734]454def buffer_path():
[751]455    """ Returns buffer path from Orange's setting folder if not
456    defined differently (in gene_matcher_path). """
457    if  gene_matcher_path == None:
[1632]458        from Orange.orng import orngEnviron
[734]459        pth = os.path.join(orngEnviron.directoryNames["bufferDir"], 
460            "gene_matcher")
461        try:
462            os.makedirs(pth)
463        except:
464            pass
465        return pth
466    else:
[751]467        return gene_matcher_path
[734]468
469def auto_pickle(filename, version, func, *args, **kwargs):
470    """
[751]471    Run function func with given arguments and save the results to
[734]472    a file named filename. If results for a given filename AND
473    version were already saved, just read and return them.
474    """
475
476    import cPickle as pickle
477
478    output = None
479    outputOk = False
480
481    try:
482        f = open(filename,'rb')
483
484        try:
485            versionF = pickle.load(f)
[1075]486            if version == None or versionF == version:
[734]487                outputOk = True
488                output = pickle.load(f)
489        except:
490            pass
491        finally:
492            f.close()
493
494    except:
495        pass
496
497    if not outputOk:
498        output = func(*args, **kwargs)
499
500        #save output before returning
501        f = open(filename,'wb')
502        pickle.dump(version, f, -1)
503        pickle.dump(output, f, -1)
504        f.close()
505
506    return output
507
508class MatcherAliases(Matcher):
509    """
[751]510    Genes matcher based on a list of sets of given aliases.
[746]511
[751]512    Target genes belonging to same sets of aliases as the input gene are
513    returned as matching genes.
514
[734]515    """
[756]516    def __init__(self, aliases, ignore_case=True):
[734]517        self.aliases = aliases
[756]518        self.ignore_case = ignore_case
[757]519        self.mdict = create_mapping(self.aliases, self.ignore_case)
[734]520
521    def to_ids(self, gene):
[751]522        """ Return ids of sets of aliases the gene belongs to. """
[734]523        if self.ignore_case:
524            gene = gene.lower()
525        return self.mdict[gene]
526
527    def set_targets(self, targets):
[751]528        """
[1578]529        A reverse dictionary is made according to each target's membership
[751]530        in the sets of aliases.
531        """
[734]532        d = defaultdict(list)
[1578]533        #d = id: [ targets ], where id is index of the set of aliases
[734]534        for target in targets:
535            ids = self.to_ids(target)
536            if ids != None:
537                for id in ids:
538                    d[id].append(target)
[1578]539        mo = MatchAliases(d, self)
540        self.matcho = mo #backward compatibility - default match object
541        return mo
542
543    #this two functions are solely for backward compatibility
544    def match(self, gene):
545        return self.matcho.match(gene)
546    def explain(self, gene):
547        return self.matcho.explain(gene)
548
549class Match(object):
[1583]550
551    def umatch(self, gene):
552        """Returns an unique (only one matching target) target or None"""
553        mat = self.match(gene)
554        return mat[0] if len(mat) == 1 else None
[1578]555 
556class MatchAliases(Match):
557
558    def __init__(self, to_targets, parent):
559        self.to_targets = to_targets
560        self.parent = parent
[734]561
562    def match(self, gene):
[751]563        """
564        Input gene is first mapped to ids of sets of aliases which contain
565        it. Target genes belonding to the same sets of aliases are returned
566        as input's match.
567        """
[1578]568        inputgeneids = self.parent.to_ids(gene)
[734]569        #return target genes with same ids
[751]570        return list(set( \
[1578]571            reduce(lambda x,y: x+y, 
[751]572                [ self.to_targets[igid] for igid in inputgeneids ], [])))
[734]573
[815]574    def explain(self, gene):
[1578]575        inputgeneids = self.parent.to_ids(gene)
576        return [ (self.to_targets[igid], self.parent.aliases[igid]) for igid in inputgeneids ]
[734]577
578class MatcherAliasesPickled(MatcherAliases):
579    """
[751]580    Gene matchers based on sets of aliases supporting pickling should
581    extend this class. Subclasses must define functions "filename",
582    "create_aliases_version" and "create_aliases". Those are crucial for
583    pickling of gene aliases to work.
584
585    Loading of gene aliases is done lazily: they are loaded when they are
586    needed. Loading of aliases for components of joined matchers is often
587    unnecessary and is therefore avoided.
[734]588    """
589   
590    def set_aliases(self, aliases):
591        self.saved_aliases = aliases
592
593    def get_aliases(self):
594        if not self.saved_aliases: #loads aliases if not loaded
595            self.aliases = self.load_aliases()
[779]596        #print "size of aliases ", len(self.saved_aliases)
[734]597        return self.saved_aliases
598
599    aliases = property(get_aliases, set_aliases)
600
601    def get_mdict(self):
602        """ Creates mdict. Aliases are loaded if needed. """
603        if not self.saved_mdict:
[757]604            self.saved_mdict = create_mapping(self.aliases, self.ignore_case)
[734]605        return self.saved_mdict
606
607    def set_mdict(self, mdict):
608        self.saved_mdict = mdict
609
610    mdict = property(get_mdict, set_mdict)
611
612    def set_targets(self, targets):
[1578]613        return MatcherAliases.set_targets(self, targets)
[734]614
615    def filename(self):
616        """ Returns file name for saving aliases. """
617        notImplemented()
618       
619    def create_aliases_version(self):
620        """ Returns version of the source database state. """
621        notImplemented()
622
623    def create_aliases(self):
624        """ Returns gene aliases. """
625        notImplemented()
626
627    def load_aliases(self):
[746]628        fn = self.filename()
[1075]629        ver = self.create_aliases_version() #if version == None ignore it
630        if fn != None:
631            if isinstance(fn, tuple): #if you pass tuple, look directly
632               filename = fn[0]
633            else:
634               filename = os.path.join(buffer_path(), fn)
[746]635            return auto_pickle(filename, ver, self.create_aliases)
636        else:
637            #if either file version of version is None, do not pickle
638            return self.create_aliases()
[734]639
[756]640    def __init__(self, ignore_case=True):
[734]641        self.aliases = []
642        self.mdict = {}
[775]643        self.ignore_case = ignore_case
644        self.filename() # test if valid filename can be built
[734]645
[1715]646
[734]647class MatcherAliasesKEGG(MatcherAliasesPickled):
648
649    def _organism_name(self, organism):
[1716]650        return obiKEGG.organism_name_search(organism)
[734]651
652    def create_aliases(self):
[1716]653        org = obiKEGG.KEGGOrganism(self.organism, genematcher=GMDirect())
[1735]654        osets = org.gene_aliases()
[756]655        return osets
[734]656
657    def create_aliases_version(self):
[1735]658        # use KEGG short release string (e.g. '66.0+')
659        release = obiKEGG.KEGGOrganism.organism_version(self.organism) + ".2"
660        release, _ = release.split("/")
661        return release
[734]662
663    def filename(self):
[1735]664        return "kegg_2_" + self._organism_name(self.organism)
[734]665
[756]666    def __init__(self, organism, ignore_case=True):
[734]667        self.organism = organism
[757]668        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
[734]669
[1735]670
[1075]671class MatcherAliasesFile(MatcherAliasesPickled):
672
673    def create_aliases(self):
674        canNotCreateButCanOnlyOpen()
675
676    def create_aliases_version(self):
677        return None
678
679    def filename(self):
680        return (self.filename_,)
681
682    def __init__(self, filename, ignore_case=True):
683        self.filename_ = filename
684        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
685
686
[734]687class MatcherAliasesGO(MatcherAliasesPickled):
688
[753]689    def _organism_name(self, organism):
690        """ Returns internal GO organism name. Used to define file name. """
[1632]691        from . import obiGO
[756]692        return obiGO.organism_name_search(self.organism)
[753]693
[734]694    def create_aliases(self):
[1632]695        from . import obiGO
[798]696        annotations = obiGO.Annotations(self.organism, genematcher=GMDirect())
[734]697        names = annotations.geneNamesDict
[757]698        return map(set, list(set([ \
[734]699            tuple(sorted(set([name]) | set(genes))) \
[757]700            for name,genes in names.items() ])))
[734]701
702    def filename(self):
[753]703        return "go_" + self._organism_name(self.organism)
[734]704
705    def create_aliases_version(self):
[1632]706        from . import obiGO
[804]707        return "v2." + obiGO.Annotations.organism_version(self.organism)
[734]708
[756]709    def __init__(self, organism, ignore_case=True):
[734]710        self.organism = organism
[757]711        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
[734]712
[780]713class MatcherAliasesDictyBase(MatcherAliasesPickled):
714
715    def create_aliases(self):
[1632]716        from . import obiDicty
[780]717        db = obiDicty.DictyBase()
718        #db.info, db.mappings
719        infoa = [ set([id,name]) | set(aliases) for id,(name,aliases,_) in db.info.items() ]
720        mappingsa = [ set(filter(None, a)) for a in db.mappings ]
721        joineda = join_sets(infoa, mappingsa, lower=True)
722        return joineda
723
724    def create_aliases_version(self):
[1632]725        from . import obiDicty
[780]726        return "v1." + obiDicty.DictyBase.version()
727
728    def filename(self):
729        return "dictybase" 
730
731    def __init__(self, ignore_case=True):
732        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
733
734class MatcherAliasesNCBI(MatcherAliasesPickled):
735
736    def _organism_name(self, organism):
[781]737        return NCBIGeneInfo.organism_name_search(organism)
[780]738
739    def create_aliases(self):
[782]740        ncbi = NCBIGeneInfo(self.organism, genematcher=GMDirect())
[781]741        out = []
742        for k in ncbi.keys():
743            out.append(set(filter(None, [k, ncbi[k].symbol, ncbi[k].locus_tag] + [ s for s in ncbi[k].synonyms ] )))
744        return out
[780]745
746    def filename(self):
[781]747        return "ncbi_" + self._organism_name(self.organism)
[780]748
749    def create_aliases_version(self):
[782]750        return "v2." + NCBIGeneInfo.organism_version(self.organism)
[780]751
752    def __init__(self, organism, ignore_case=True):
753        self.organism = organism
754        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
[986]755       
756class MatcherAliasesAffy(MatcherAliasesPickled):
757    def create_aliases(self):
[989]758        filename = orngServerFiles.localpath_download("Affy", self.organism + ".pickle")
[986]759        import cPickle
760        return cPickle.load(open(filename, "rb"))
761   
762    def filename(self):
[989]763        return "affy_" + self.organism
[986]764   
765    def create_aliases_version(self):
[997]766        orngServerFiles.localpath_download("Affy", self.organism + ".pickle")
[989]767        return orngServerFiles.info("Affy", self.organism + ".pickle")["datetime"]
[986]768       
769    def __init__(self, organism, **kwargs):
[989]770        self.organism = organism
[986]771        MatcherAliasesPickled.__init__(self, **kwargs)
[1438]772   
773       
774class MatcherAliasesEnsembl(MatcherAliasesPickled):
775    """ A matcher for Ensemble ids
776    """
777    DEF_ATTRS = ["ensembl_gene_id", "external_gene_id", "entrezgene"]
778    # taxid: (dataset_name, [name_attr1, name_attr2 ...])
779    BIOMART_CONF = {}
780    def __init__(self, organism, **kwargs):
781        self.organism = organism
782        MatcherAliasesPickled.__init__(self, **kwargs)
783       
784    def filename(self):
785        return "ensembl_" + self.organism
786   
787    def create_aliases_version(self):
788        return "v1"
789   
790    def create_aliases(self):
[1632]791        from . import obiBioMart
[1438]792        if self.organism in self.BIOMART_CONF:
793            dset_name, attrs = self.BIOMART_CONF[self.organism]
794        else:
795            dset_name, attrs = self.default_biomart_conf(self.organism)
796       
797        dataset = obiBioMart.BioMartDataset("ensembl", dset_name)
798        table = dataset.get_example_table(attributes=attrs)
799        from collections import defaultdict
800        names = defaultdict(set)
801        for ex in table:
802            row = [str(v) for v in ex if not v.isSpecial() and str(v)]
[1468]803            names[row[0]].update(row)
[1438]804        return names.values()
805       
806    def default_biomart_conf(self, taxid):
807        name = obiTaxonomy.name(self.organism).lower()
808        name1, name2 = name.split(" ")[: 2]
809        dset_name = name1[0] + name2 + "_gene_ensembl"
810        return dset_name, self.DEF_ATTRS
811       
[780]812
[734]813class MatcherAliasesPickledJoined(MatcherAliasesPickled):
814    """
[751]815    Creates a new matcher by joining gene aliases from different data sets.
816    Sets of aliases are joined if they contain common genes.
817
818    The joined gene matcher can only be pickled if the source gene
819    matchers are picklable.
[734]820    """
821
822    def filename(self):
[746]823        # do not pickle if any is unpicklable
824        try:
825            filenames = [ mat.filename() for mat in self.matchers ]
[775]826            if self.ignore_case:
827                filenames += [ "icj" ]
[746]828            return "__".join(filenames)
829        except:
830            return None
[734]831
832    def create_aliases(self):
[775]833        return join_sets_l([ mat.aliases for mat in self.matchers ], lower=self.ignore_case)
[734]834
835    def create_aliases_version(self):
[746]836        try:
[775]837            return "v4_" + "__".join([ mat.create_aliases_version() for mat in self.matchers ])
[746]838        except:
839            return None
[734]840
[815]841    def __init__(self, matchers):
[734]842        """
843        Join matchers together. Groups of aliases are joined if
844        they share a common name.
[775]845
846        If ignore_case is True, ignores case when joining gene aliases.
[734]847        """
848        #FIXME: sorting of matchers to avoid multipying pickled files for
849        #different orderings.
850        self.matchers = matchers
[815]851        allic = set([ m.ignore_case for m in self.matchers ])
852        if len(allic) > 1:
853            notAllMatchersHaveEqualIgnoreCase()
854        ignore_case = list(allic)[0]
855
[775]856        MatcherAliasesPickled.__init__(self, ignore_case=ignore_case)
[734]857       
[746]858class MatcherSequence(Matcher):
859    """
[1578]860    Each gene goes through sequence of gene matchers (in the same order
861    as in the matchers arguments) until a match is found.
[746]862    """
863   
864    def __init__(self, matchers):
865        self.matchers = matchers
866
[1578]867    def set_targets(self, targets):
868        ms = []
[1703]869        targets = list(targets) #copy targets as multiple use would
870                                #be problematic if a generator was passed
[1578]871        for matcher in self.matchers:
872            ms.append(matcher.set_targets(targets))
873        om = MatchSequence(ms)
874        self.matcho = om
875        return om
876
877    #this two functions are solely for backward compatibility
[746]878    def match(self, gene):
[1578]879        return self.matcho.match(gene)
[1703]880
[1578]881    def explain(self, gene):
882        return self.matcho.explain(gene)
883
884class MatchSequence(Match):
885
886    def __init__(self, ms):
887        self.ms = ms
888
889    def match(self, gene):
890        for match in self.ms:
891            m = match.match(gene)
[756]892            if m: 
[746]893                return m
[756]894        return []
[746]895
[815]896    def explain(self, gene):
[1578]897        for match in self.ms:
898            m = match.match(gene)
[815]899            if m: 
[1578]900                return match.explain(gene)
[815]901        return []
[777]902
[746]903class MatcherDirect(Matcher):
904    """
905    Direct matching to targets.
906    """
907
[757]908    def __init__(self, ignore_case=True):
909        self.ignore_case = ignore_case
910
[746]911    def set_targets(self, targets):
[1703]912        targets = list(targets) #would be problematic if generator was passed
913                                #as it is used twice
[757]914        aliases = [ set([a]) for a in targets]
915        self.am = MatcherAliases(aliases, ignore_case=self.ignore_case)
[1578]916        self.matcho = self.am.set_targets(targets)
917        return self.matcho
[746]918
[1578]919    #this two functions are solely for backward compatibility
[746]920    def match(self, gene):
[1578]921        return self.matcho.match(gene)
922    def explain(self, gene):
923        return self.matcho.explain(gene)
924
925               
[747]926GMDirect = MatcherDirect
927GMKEGG = MatcherAliasesKEGG
928GMGO = MatcherAliasesGO
[780]929GMNCBI = MatcherAliasesNCBI
930GMDicty = MatcherAliasesDictyBase
[986]931GMAffy = MatcherAliasesAffy
[1438]932GMEnsembl = MatcherAliasesEnsembl
[747]933
[756]934def issequencens(x):
935    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
936
[775]937def matcher(matchers, direct=True, ignore_case=True):
[747]938    """
939    Build a matcher from a sequence of matchers. If a sequence element is a
[756]940    sequence, join matchers in the subsequence.
[775]941
942    direct - if True, add a direct matcher to targets
[815]943    ignore_case - if True, ignores case with optionally added direct matcher
[747]944    """
945    seqmat = []
[775]946    if direct:
947        seqmat.append(MatcherDirect(ignore_case=ignore_case))
[747]948    for mat in matchers:
[756]949        if issequencens(mat):
[815]950            mat = MatcherAliasesPickledJoined(list(mat))
[747]951            seqmat.append(mat)
952        else:
953            seqmat.append(mat)
954    return MatcherSequence(seqmat)
[746]955
[734]956if __name__ == '__main__':
[783]957
[784]958    #m1 = matcher([[GMNCBI('44689'), GMDicty()]])
959    #print m1.matchers[1].aliases[:100]
[783]960
[784]961    #m2 = GMNCBI('Dictyostelium discoideum')
962    #print m2.aliases
963
964
[782]965
[734]966    """
[718]967    gi = info(list(info)[0])
968    print gi.tax_id, gi.synonyms, gi.dbXrefs, gi.symbol_from_nomenclature_authority, gi.full_name_from_nomenclature_authority
[734]969    """
970
[775]971    #dobim z joinom prave stvari?
972
[746]973    import time
[1632]974    from . import obiGeneSets
[734]975
976    def testsets():
977        return obiGeneSets.collections([":kegg:hsa", ":go:hsa"])
978
979    def names1():
980        import orange
981        data = orange.ExampleTable("DLBCL.tab")
982        return [ a.name for a in  data.domain.attributes ]
[746]983
[780]984    def namesd():
985        import orange
986        data = orange.ExampleTable("dd_ge_biorep1.tab")
[781]987        names = [ ex["gene"].value for ex in data ]
988        return names
[780]989
[734]990    genesets = auto_pickle("testcol", "3", testsets)
991    names = auto_pickle("testnames", "4", names1)
[783]992    names2 = auto_pickle("testnamesdicty", "4", namesd)
[746]993
[784]994    info = NCBIGeneInfo('Dictyostelium discoideum')
[783]995    for a in names2:
996        print a
997        info.get_info(a)
998
999    t = time.time()
[775]1000    mat5 = matcher([[GMKEGG('human'),GMGO('human')]], direct=False, ignore_case=True)
[782]1001    mat7 = GMDicty()
1002    mat8 = GMNCBI('Homo sapiens')
1003    print "initialized all", time.time()-t
[779]1004
[734]1005    print "using targets"
1006
[747]1007    mat5.set_targets(names)
[777]1008    mat7.set_targets(names)
[779]1009    mat8.set_targets(names)
1010
[734]1011    print "before genes"
[756]1012    genes = reduce(set.union, genesets.values()[:1000], set())
[775]1013    genes = list(genes)
1014    print genes[:20]
[734]1015    print len(genes)
1016    print "after genes"
1017
[781]1018    for g in sorted(genes):
[815]1019        print "KGO ", g, mat5.match(g), mat5.explain(g)
[781]1020        print "DICT", g, mat7.match(g)
[782]1021        print "NCBI", g, mat8.match(g)
[756]1022
[782]1023
Note: See TracBrowser for help on using the repository browser.