source: orange-bioinformatics/obiArrayExpress.py @ 1345:738b2b2377a2

Revision 1345:738b2b2377a2, 15.1 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Added a obiArrayExpress-test.py testing script, added format parameter to query_experiments/files.

Line 
1"""
2Array Express
3-------------
4
5A python module for accessing the ArrayExpress and GeneExpressionAtlas web services.
6
7Example::
8
9    >>> import obiArrayExpress
10    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
11    <addinfourl at ...
12   
13   
14   
15"""
16
17import os, sys
18import urllib2
19
20import orngEnviron
21import warnings
22import posixpath
23import shelve
24from collections import defaultdict
25
# Names of ArrayExpress query fields.
# NOTE(review): presumably the keyword names accepted by the query_*
# functions -- the list is not referenced elsewhere in this file; confirm.
ARRAYEXPRESS_FIELDS = [
    "accession",
    "array",
    "ef",
    "efv",
    "expdesign",
    "exptype",
    "gxa",
    "pmid",
    "sa",
    "species",
]
38
class ArrayExpressConnection(object):
    """ A connection to the ArrayExpress database.

    :param address: URL template of the service. It is expanded with
        ``str.format`` and should contain a ``{format}`` placeholder.
        Defaults to :attr:`DEFAULT_ADDRESS`.
    :param cache: Mapping stored as ``self.cache``. Defaults to
        :attr:`DEFAULT_CACHE`. Note that none of the methods of this
        class read from or write to the cache yet.
    :param timeout: Timeout (in seconds) passed to ``urllib2.urlopen``.

    """

    try:
        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "ArrayExpressCache.shelve"))
    except Exception:
        # shelve.open reports failures with backend specific exceptions
        # (dbm errors, IOError for an unwritable directory, ...), not only
        # ImportError. The cache is best effort, so any failure falls back
        # to an in-memory dict instead of breaking the import of the module.
        warnings.warn("Could not load persistent cache!")
        DEFAULT_CACHE = {}

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"

    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self, address=None, cache=None, timeout=30):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.timeout = timeout

    def format_query(self, **kwargs):
        """ Return the query string (``key=value`` pairs joined by ``&``)
        for the keyword arguments.

        Arguments listed in ``_ARGS_ORDER`` come first (in that order), all
        other arguments follow sorted by name, so equal arguments always
        produce the same string. The ``format`` argument is skipped because
        it selects the response format in the url path (see
        :meth:`query_url`) and is not a query field.

        Example::

            >>> conn.format_query(array="A-AFFY-33", species="Homo sapiens")
            'species="homo sapiens"&array=A-AFFY-33'

        """
        def format_species(val):
            return '"%s"' % val.lower()

        def format_default(val):
            return val

        # TODO: range values (e.g. [1 TO 2]), handle AND, OR, +, check for valid keys
        formaters = {"species": format_species,
                     }

        def rank(item):
            key = item[0]
            if key in self._ARGS_ORDER:
                return (self._ARGS_ORDER.index(key), key)
            return (len(self._ARGS_ORDER), key)

        parts = []
        for key, value in sorted(kwargs.items(), key=rank):
            if key == "format":
                continue
            fmt = formaters.get(key, format_default)
            parts.append("{0}={1}".format(key, fmt(value)))
        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """ Return a formated query url for the calls arguments

        :param what: Last path component of the url ("experiments" or
            "files" in this class).
        :param kwargs: Query fields. An optional ``format`` argument
            replaces ``DEFAULT_FORMAT`` in the url path.

        Example::

            >>> conn.query_url(accession="E-MEXP-31")
            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + "?" + query
        # Only spaces are escaped; values are otherwise passed through as is.
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """ Return a formated experiments query url for the calls arguments
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """ Return a formated files query url for the calls arguments
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """ Open the experiments query url and return the response stream.
        """
        url = self.query_url_experiments(**kwargs)
        stream = urllib2.urlopen(url, timeout=self.timeout)
        #  TODO: check stream for errors
        return stream

    def query_files(self, **kwargs):
        """ Open the files query url and return the response stream.
        """
        url = self.query_url_files(**kwargs)
        stream = urllib2.urlopen(url, timeout=self.timeout)
        #  TODO: check stream for errors
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """ Return a file handle to experiment data.
        Possible values for kind:
            - raw: return the raw data if available
            - fgem: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        If `ext` is not None only a file with that extension is returned.
        Returns None if the file listing contains no matching file.

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")

        """
        from Orange.misc.xml import parse
        # The listing is parsed as xml, so it must be requested as xml
        # (the default response format is json).
        listing = parse(self.query_files(accession=accession, format="xml"))
        for entry in list(listing.elements("file")):
            filekind = next(entry.elements("kind")).data.strip()
            fileext = next(entry.elements("extension")).data.strip()
            if filekind == kind and (ext is None or fileext == ext):
                url = next(entry.elements("url")).data.strip()
                return urllib2.urlopen(url, timeout=self.timeout)
        return None
139   
140   
def query_experiments(**kwargs):
    """ Query Array Express experiments using a new default
    ArrayExpressConnection and return the response stream.

    Example ::

        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
        <addinfourl at ...

    """
    connection = ArrayExpressConnection()
    return connection.query_experiment(**kwargs)
151
def query_files(**kwargs):
    """ Query Array Express files using a new default
    ArrayExpressConnection and return the response stream.

    Example ::

        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo")
        <addinfourl at ...

    """
    connection = ArrayExpressConnection()
    return connection.query_files(**kwargs)
162   
163# TODO: List all accepted keyword values for query_* functions.
164
165"""\
166Gene Expression Atlas
167---------------------
168"""
169
class GeneExpressionAtlasConenction(object):
    """ A connection to Gene Expression Atlas database

    :param address: Base url of the service (must end with a "/").
        Defaults to :attr:`DEFAULT_ADDRESS`.
    :param cache: Mapping stored as ``self.cache``. Defaults to
        :attr:`DEFAULT_CACHE`. Note that :meth:`query` does not use the
        cache yet.
    :param timeout: Timeout (in seconds) passed to ``urllib2.urlopen``.

    """
    try:
        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "GeneExpressionAtlasCache.shelve"))
    except Exception:
        # shelve.open reports failures with backend specific exceptions
        # (dbm errors, IOError for an unwritable directory, ...), not only
        # ImportError. The cache is best effort, so any failure falls back
        # to an in-memory dict instead of breaking the import of the module.
        warnings.warn("Could not load persistent cache!")
        DEFAULT_CACHE = {}

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"

    def __init__(self, address=None, cache=None, timeout=30):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.timeout = timeout

    def format_query(self,):
        # Placeholder, not implemented.
        pass

    def query_url(self, condition, format="json", start=None, rows=None, indent=False):
        """ Return the api url for `condition`.

        :param condition: Object whose ``rest()`` method returns the REST
            query part (e.g. an AtlasCondition instance).
        :param format: Value of the ``format`` url parameter.
        :param start: Value of the ``start`` url parameter.
        :param rows: Value of the ``rows`` url parameter. ``start`` and
            ``rows`` are only added to the url if both are given.
        :param indent: If true append the ``indent`` flag to the url.

        """
        url = self.address + "api?" + condition.rest()
        # Compare with None so that a legitimate start=0 is not dropped.
        if start is not None and rows is not None:
            url += "&start={0}&rows={1}".format(start, rows)
        url += "&format={0}".format(format)
        if indent:
            url += "&indent"
        return url

    def query(self, condition, format="json", start=None, rows=None, indent=False):
        """ Query the atlas for `condition` and return the response stream.

        The arguments are the same as for :meth:`query_url`.

        """
        url = self.query_url(condition, format=format, start=start,
                             rows=rows, indent=indent)
        response = urllib2.urlopen(url, timeout=self.timeout)
        return response
199   
# Gene properties accepted by AtlasConditionGeneProperty.validate.
GENE_FILTERS = [
    "Name",           # Gene name
    "Goterm",         # Gene Ontology Term
    "Interproterm",   # InterPro Term
    "Disease",        # Gene-Disease Association
    "Keyword",        # Gene Keyword
    "Protein",        # Protein

    "Dbxref",         # Other Database Cross-Refs
    "Embl",           # EMBL-Bank ID
    "Ensfamily",      # Ensembl Family
    "Ensgene",        # Ensembl Gene ID

    "Ensprotein",     # Ensembl Protein ID
    "Enstranscript",  # Ensembl Transcript ID
    "Goid",           # Gene Ontology ID
    "Image",          # IMAGE ID
    "Interproid",     # InterPro ID
    "Locuslink",      # Entrez Gene ID

    "Omimid",         # OMIM ID
    "Orf",            # ORF
    "Refseq",         # RefSeq ID
    "Unigene",        # UniGene ID
    "Uniprot",        # UniProt Accession

    "Hmdb",           # HMDB ID
    "Chebi",          # ChEBI ID
    "Cas",            # CAS
    "Uniprotmetenz",  # Uniprotmetenz
    "Gene",           # Gene Name or Identifier
    "Synonym",        # Gene Synonym
]
233   
# Qualifiers accepted by AtlasConditionGeneProperty.validate.
GENE_FILTER_QUALIFIERS = ["Is", "IsNot"]
238
# Organism names accepted by AtlasConditionOrganism.validate.
ATLAS_ORGANISMS = [
    "Anopheles gambiae",
    "Arabidopsis thaliana",
    "Bos taurus",
    "Caenorhabditis elegans",
    "Danio rerio",
    "Drosophila melanogaster",
    "Epstein barr virus",
    "Gallus gallus",
    "Homo sapiens",
    "Human cytomegalovirus",
    "Kaposi sarcoma-associated herpesvirus",
    "Mus musculus",
    "Rattus norvegicus",
    "Saccharomyces cerevisiae",
    "Schizosaccharomyces pombe",
    "Unknown",
    "Xenopus laevis",
]
258   
def ef_ontology():
    """ Return the EF (Experimental Factor) ontology

    The "efo.obo" file is obtained with
    ``orngServerFiles.localpath_download`` from the "ArrayExpress" domain
    and loaded as an ``obiOntology.OBOOntology``.

    """
    import obiOntology
    import orngServerFiles
    # The ontology could alternatively be read directly from
    # http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo
    # Should this be in the OBOFoundry (Ontology) domain
    file_name = orngServerFiles.localpath_download("ArrayExpress", "efo.obo")
    # NOTE(review): the file is left open because it is not visible here
    # whether OBOOntology reads it eagerly -- confirm before closing it.
    return obiOntology.OBOOntology(open(file_name, "rb"))
268
269
class AtlasCondition(object):
    """ Abstract base for all Gene Expression Atlas query conditions.

    Subclasses are expected to override both methods.

    """
    def validate(self):
        """ Check the condition; must be provided by a subclass.
        """
        raise NotImplementedError()

    def rest(self):
        """ Build the REST query part; must be provided by a subclass.
        """
        raise NotImplementedError()
282   
283   
class AtlasConditionList(list, AtlasCondition):
    """ A list of AtlasCondition instances which is itself a condition.
    """
    def validate(self):
        """ Validate every contained condition in turn.
        """
        for condition in self:
            condition.validate()

    def rest(self):
        """ Return the REST parts of the contained conditions joined
        with "&".
        """
        rest_parts = [condition.rest() for condition in self]
        return "&".join(rest_parts)
293
class AtlasConditionGeneProperty(AtlasCondition):
    """ An atlas gene filter condition.

    :param property: Property of the gene (one of GENE_FILTERS). If None
        or "" all properties will be searched.
    :param qualifier: Qualifier can be 'Is' or 'IsNot'
    :param value: The value to search for; a string (spaces are replaced
        with "+") or a list of strings (joined with "+"). Any other type
        raises ValueError.

    Example ::

        >>> # Condition on a gene name
        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
        >>> # Condition on genes from a GO Term
        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
        >>> # Condition on disease association
        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")

    """
    def __init__(self, property, qualifier, value):
        self.property = property if property else ""
        self.qualifier = qualifier

        if isinstance(value, basestring):
            encoded = value.replace(" ", "+")
        elif isinstance(value, list):
            encoded = "+".join(value)
        else:
            raise ValueError(value)
        self.value = encoded

        self.validate()

    def validate(self):
        """ Assert that the property and the qualifier are known (or "").
        """
        known_properties = GENE_FILTERS + [""]
        known_qualifiers = GENE_FILTER_QUALIFIERS + [""]
        assert self.property in known_properties
        assert self.qualifier in known_qualifiers

    def rest(self):
        """ Return the REST query part for this condition.
        """
        template = "gene{property}{qualifier}={value}"
        return template.format(property=self.property,
                               qualifier=self.qualifier,
                               value=self.value)
330       
331       
class AtlasConditionExperimentalFactor(AtlasCondition):
    """ An atlas experimental factor filter condition.

    :param factor: EFO experimental factor
    :param regulation: "up", "down" or "updown" (the only values accepted
        by :meth:`validate`)
    :param n: Minimum number of experiments with this condition
    :param value: Experimental factor value

    Example ::

        >>> # Any genes up regulated in at least 3 experiments involving cancer.
        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")

    """
    def __init__(self, factor, regulation, n, value):
        self.factor, self.regulation = factor, regulation
        self.n, self.value = n, value
        self.validate()

    def validate(self):
        """ Assert that the regulation is one of the accepted values.
        """
        # TODO: validate the factor and value
        accepted = ["up", "down", "updown"]
        assert self.regulation in accepted

    def rest(self):
        """ Return the REST query part for this condition.
        """
        template = "{regulation}{n}In{factor}={value}"
        return template.format(regulation=self.regulation, n=self.n,
                               factor=self.factor, value=self.value)
362       
class AtlasConditionOrganism(AtlasCondition):
    """ Condition on organism.

    :param organism: Organism name; must be one of ATLAS_ORGANISMS.

    """
    def __init__(self, organism):
        self.organism = organism
        self.validate()

    def validate(self):
        """ Assert that the organism is listed in ATLAS_ORGANISMS.
        """
        assert self.organism in ATLAS_ORGANISMS

    def rest(self):
        """ Return the REST query part: the lower cased organism name with
        spaces replaced by "+".
        """
        encoded = self.organism.replace(" ", "+").lower()
        return "species={0}".format(encoded)
375       
376   
def query_atlas_simple(genes=None, regulated=None, organism=None, format="json"):
    """ A simple Atlas query.

    :param genes: A gene name or a list of gene names. If None no gene
        condition is added to the query.
    :param regulated: Regulation ("up", "down" or "updown"); if given the
        genes must be regulated this way in at least one experiment.
    :param organism: Organism name (one of ATLAS_ORGANISMS).
    :param format: Response format passed to the connection's query.

    Returns the response stream of the query.

    Example::

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
        <addinfourl at ...

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulated="up", organism="Mus musculus")
        <addinfourl at ...

    """
    conditions = AtlasConditionList()
    # `genes` is optional; building the gene condition from None would
    # raise ValueError in AtlasConditionGeneProperty.
    if genes is not None:
        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
    if regulated:
        conditions.append(AtlasConditionExperimentalFactor("", regulated, 1, ""))
    if organism:
        conditions.append(AtlasConditionOrganism(organism))
    connection = GeneExpressionAtlasConenction()
    results = connection.query(conditions, format=format)
    return results
398
399"""\
400TODO: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
401      Need a full list of accepted factors
402"""
403
def query_atlas(condition, format="json", start=None, rows=None, indent=False):
    """ Run an Atlas query for `condition` (instance of AtlasCondition)
    on a new GeneExpressionAtlasConenction and return its response.

    All other arguments are forwarded unchanged to the connection's
    ``query`` method.

    Example::

        >>> #condition = AtlasConditionGeneProperty()

    """
    options = dict(format=format, start=start, rows=rows, indent=indent)
    return GeneExpressionAtlasConenction().query(condition, **options)
416
417
def get_atlas_summary(genes, organism):
    """ Return 3 dictionaries containing a summary of atlas information
    about three experimental factors:

        - Organism Part (OP)
        - Disease State (DS)
        - Cell type (CT)

    Each dictionary contains query genes as keys. Values are dictionaries
    mapping factor values to a 2-tuple containing the count of up regulated
    and down regulated experiments.

    :param genes: A gene name or a list of gene names.
    :param organism: Organism name (one of ATLAS_ORGANISMS).

    Example::

        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
        ({u'RUNX1': ...

    """
    import json

    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
    org_condition = AtlasConditionOrganism(organism)
    condition = AtlasConditionList([genes_condition, org_condition])

    response = query_atlas(condition, format="json")
    try:
        result = json.load(response)
    finally:
        # Release the connection even if the response is not valid json.
        response.close()

    org_part = collect_ef_summary(result, "organism_part")
    disease_state = collect_ef_summary(result, "disease_state")
    cell_type = collect_ef_summary(result, "cell_type")

    return org_part, disease_state, cell_type
448   
def collect_ef_summary(info, ef):
    """ Summarize the parsed query_atlas result `info` for the
    experimental factor `ef`.

    Returns a dict mapping a gene name to a dict which maps factor values
    to an (upExperiments, downExperiments) tuple. Factor values for which
    both counts are false (e.g. 0) are left out, as are genes without any
    remaining factor value.

    """
    collected = defaultdict(dict)
    for entry in info["results"]:
        gene_info = entry["gene"]
        matching = (expr for expr in entry["expressions"] if expr["ef"] == ef)
        for expr in matching:
            factor_value = expr["efv"]
            counts = (expr["upExperiments"], expr["downExperiments"])
            if not any(counts):
                continue
            collected[gene_info["name"]][factor_value] = counts
    return dict(collected)
469   
470   
if __name__ == "__main__":
    import doctest
    from pprint import pprint

    # Print the atlas summary for a mouse and a human gene pair
    # (needs network access).
    for gene_names, organism_name in [(['Pou5f1', 'Dppa3'], 'Mus musculus'),
                                      (['PDLIM5', 'FGFR2'], 'Homo sapiens')]:
        pprint(get_atlas_summary(gene_names, organism_name))

    # Run the doctests; `conn` is the connection used by the examples.
    conn = ArrayExpressConnection()
    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs={"conn": conn})
481   
Note: See TracBrowser for help on using the repository browser.