source: orange-bioinformatics/obiArrayExpress.py @ 1342:3ac2a7774fbb

Revision 1342:3ac2a7774fbb, 14.2 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Added obiArrayExpress module. This module is used to query both ArrayExpress as well as GeneExpressionAtlas from python.

Line 
1"""
Array Express
-------------

A python module for accessing the ArrayExpress and Gene Expression Atlas
web services.

Example::

    >>> import obiArrayExpress
    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
    <addinfourl at ...

15"""
16
17import os, sys
18import urllib2
19
20import orngEnviron
21import warnings
22import posixpath
23import shelve
24from collections import defaultdict
25
# ArrayExpress query field names.
# NOTE(review): presumably the keyword names accepted by the query_*
# functions; nothing in this module validates against the list yet.
ARRAYEXPRESS_FIELDS = [
    "accession",
    "array",
    "ef",
    "efv",
    "expdesign",
    "exptype",
    "gxa",
    "pmid",
    "sa",
    "species",
]
38
class ArrayExpressConnection(object):
    """ A connection to the ArrayExpress database with query caching.

    :param address: Base url of the ArrayExpress REST interface
        (:attr:`DEFAULT_ADDRESS` if None).
    :param cache: A dict like object stored on the instance as `cache`
        (:attr:`DEFAULT_CACHE` if None). Note that the query methods of
        this class do not read from or write to it yet.
    :param timeout: Timeout (in seconds) passed to ``urllib2.urlopen``.

    """

    # The shared cache is opened once, when the class body is executed.
    # Any failure to open it (no dbm backend, unwritable or locked cache
    # file, unavailable buffer directory, ...) must not break the import
    # of the module, so fall back to a plain in-memory dict.
    try:
        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "ArrayExpressCache.shelve"))
    except Exception:
        warnings.warn("Could not load persistent cache!")
        DEFAULT_CACHE = {}

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/xml/v2/"

    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self, address=None, cache=None, timeout=30):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.timeout = timeout

    def format_query(self, **kwargs):
        """ Return the query part of the url (without the leading '?')
        for the keyword arguments.

        Arguments listed in `_ARGS_ORDER` come first (in that order), all
        other arguments follow sorted by name, so the same arguments
        always produce the same string. The `species` value is lower
        cased and enclosed in double quotes.

        Example::

            >>> conn.format_query(species="Homo sapiens", keywords="liver")
            'keywords=liver&species="homo sapiens"'

        """
        def format_species(val):
            return '"%s"' % val.lower()

        # TODO: range values (e.g. [1 TO 2]), handle AND, OR, +, check for valid keys
        formaters = {"species": format_species,
                     }

        def rank(item):
            # Sort key: position in _ARGS_ORDER first, argument name second.
            key = item[0]
            if key in self._ARGS_ORDER:
                return (self._ARGS_ORDER.index(key), key)
            return (len(self._ARGS_ORDER), key)

        parts = []
        for key, value in sorted(kwargs.items(), key=rank):
            # Look the formatter up by the argument name (not by a literal).
            fmt = formaters.get(key)
            if fmt is not None:
                value = fmt(value)
            parts.append("{0}={1}".format(key, value))
        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """ Return a formated query url for the calls arguments

        :param what: Name of the queried resource, appended to the
            connection address (e.g. "experiments" or "files").

        Example::
            >>> conn.query_url(accession="E-MEXP-31")
            'http://www.ebi.ac.uk/arrayexpress/xml/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url + "?" + query
        # Only spaces are escaped; no other url quoting is applied.
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """ Return a formated experiments query url for the calls arguments
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """ Return a formated files query url for the calls arguments
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """ Open the experiments query url and return the response stream
        (as returned by ``urllib2.urlopen``).
        """
        url = self.query_url_experiments(**kwargs)
        stream = urllib2.urlopen(url, timeout=self.timeout)
        #  TODO: check stream for errors
        return stream

    def query_files(self, **kwargs):
        """ Open the files query url and return the response stream
        (as returned by ``urllib2.urlopen``).
        """
        url = self.query_url_files(**kwargs)
        stream = urllib2.urlopen(url, timeout=self.timeout)
        #  TODO: check stream for errors
        return stream

    def open_file(self, accession, kind="raw", ext=None):
        """ Return a file handle to experiment data.
        Possible values for kind:
            - raw: return the raw data if available
            - fgem: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        :param accession: Experiment accession passed to the files query.
        :param kind: Required contents of the <kind> element of the file.
        :param ext: Required contents of the <extension> element of the
            file; any extension matches if None.

        The first matching file listed by the files query is opened.
        None is returned if no listed file matches.

        Example::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")

        """
        from Orange.misc.xml import parse
        listing = parse(self.query_files(accession=accession))
        for entry in list(listing.elements("file")):
            entry_kind = next(entry.elements("kind")).data.strip()
            entry_ext = next(entry.elements("extension")).data.strip()
            if entry_kind == kind and (ext is None or entry_ext == ext):
                url = next(entry.elements("url")).data.strip()
                return urllib2.urlopen(url, timeout=self.timeout)
        return None
137   
138   
def query_experiments(**kwargs):
    """ Query Array Express experiments.

    The keyword arguments are forwarded to
    `ArrayExpressConnection.query_experiment` of a connection created
    with the default settings; its response stream is returned.

    Example ::

        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
        <addinfourl at ...

    """
    connection = ArrayExpressConnection()
    return connection.query_experiment(**kwargs)
149
def query_files(**kwargs):
    """ Query Array Express files.

    The keyword arguments are forwarded to
    `ArrayExpressConnection.query_files` of a connection created with the
    default settings; its response stream is returned.

    Example ::

        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo")
        <addinfourl at ...

    """
    connection = ArrayExpressConnection()
    return connection.query_files(**kwargs)
160   
161# TODO: List all accepted keyword values for query_* functions.
162
163"""\
164Gene Expression Atlas
165---------------------
166"""
167
class GeneExpressionAtlasConenction(object):
    """ A connection to Gene Expression Atlas database

    :param address: Base url of the Atlas service
        (:attr:`DEFAULT_ADDRESS` if None).
    :param cache: A dict like object stored on the instance as `cache`
        (:attr:`DEFAULT_CACHE` if None). The query method of this class
        does not use it yet.
    :param timeout: Timeout (in seconds) passed to ``urllib2.urlopen``.

    """
    # The shared cache is opened once, when the class body is executed.
    # Any failure to open it (no dbm backend, unwritable or locked cache
    # file, unavailable buffer directory, ...) must not break the import
    # of the module, so fall back to a plain in-memory dict.
    try:
        DEFAULT_CACHE = shelve.open(os.path.join(orngEnviron.bufferDir, "GeneExpressionAtlasCache.shelve"))
    except Exception:
        warnings.warn("Could not load persistent cache!")
        DEFAULT_CACHE = {}

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"

    def __init__(self, address=None, cache=None, timeout=30):
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.timeout = timeout

    def format_query(self):
        """ Placeholder, not implemented (returns None).
        """
        pass

    def query_url(self, condition, format="json", start=None, rows=None, indent=False):
        """ Return the url for the `condition` query.

        :param condition: An object with a `rest()` method returning the
            REST query part (e.g. an AtlasCondition instance).
        :param format: Value of the 'format' url parameter.
        :param start: Value of the 'start' url parameter.
        :param rows: Value of the 'rows' url parameter.
        :param indent: If true the 'indent' flag is appended to the url.

        'start' and 'rows' are only added when both are given (not None),
        so a `start` of 0 is honoured.

        """
        url = self.address + "api?" + condition.rest()
        if start is not None and rows is not None:
            url += "&start={0}&rows={1}".format(start, rows)
        url += "&format={0}".format(format)
        if indent:
            url += "&indent"
        return url

    def query(self, condition, format="json", start=None, rows=None, indent=False):
        """ Open the url for the `condition` query (see :meth:`query_url`
        for the arguments) and return the response stream as returned by
        ``urllib2.urlopen``. The connection's `timeout` is applied.
        """
        url = self.query_url(condition, format=format, start=start,
                             rows=rows, indent=indent)
        response = urllib2.urlopen(url, timeout=self.timeout)
        return response


# Correctly spelled alias; the original (misspelled) class name is kept for
# backward compatibility.
GeneExpressionAtlasConnection = GeneExpressionAtlasConenction
197   
# Gene property names checked by AtlasConditionGeneProperty.validate.
GENE_FILTERS = [
    "Name",  # Gene name
    "Goterm",  # Gene Ontology Term
    "Interproterm",  # InterPro Term
    "Disease",  # Gene-Disease Association
    "Keyword",  # Gene Keyword
    "Protein",  # Protein

    "Dbxref",  # Other Database Cross-Refs
    "Embl",  # EMBL-Bank ID
    "Ensfamily",  # Ensembl Family
    "Ensgene",  # Ensembl Gene ID

    "Ensprotein",  # Ensembl Protein ID
    "Enstranscript",  # Ensembl Transcript ID
    "Goid",  # Gene Ontology ID
    "Image",  # IMAGE ID
    "Interproid",  # InterPro ID
    "Locuslink",  # Entrez Gene ID

    "Omimid",  # OMIM ID
    "Orf",  # ORF
    "Refseq",  # RefSeq ID
    "Unigene",  # UniGene ID
    "Uniprot",  # UniProt Accession

    "Hmdb",  # HMDB ID
    "Chebi",  # ChEBI ID
    "Cas",  # CAS
    "Uniprotmetenz",  # Uniprotmetenz
    "Gene",  # Gene Name or Identifier
    "Synonym",  # Gene Synonym
]
231   
# Qualifiers checked by AtlasConditionGeneProperty.validate.
GENE_FILTER_QUALIFIERS = ["Is", "IsNot"]
236
# Organism names checked by AtlasConditionOrganism.validate.
ATLAS_ORGANISMS = [
    "Anopheles gambiae",
    "Arabidopsis thaliana",
    "Bos taurus",
    "Caenorhabditis elegans",
    "Danio rerio",
    "Drosophila melanogaster",
    "Epstein barr virus",
    "Gallus gallus",
    "Homo sapiens",
    "Human cytomegalovirus",
    "Kaposi sarcoma-associated herpesvirus",
    "Mus musculus",
    "Rattus norvegicus",
    "Saccharomyces cerevisiae",
    "Schizosaccharomyces pombe",
    "Unknown",
    "Xenopus laevis",
]
256   
def ef_ontology():
    """ Return the EF (Experimental Factor) ontology

    The path of the "efo.obo" file is obtained with
    ``orngServerFiles.localpath_download("ArrayExpress", "efo.obo")``; the
    file is opened in binary mode and passed to
    ``obiOntology.OBOOntology``, whose result is returned.

    """
    import obiOntology
    import orngServerFiles
    # Should this be in the OBOFoundry domain
    file_name = orngServerFiles.localpath_download("ArrayExpress", "efo.obo")
    # NOTE(review): the file handle is not closed here because it is not
    # visible from this module whether OBOOntology reads the stream
    # eagerly -- confirm before adding a `with` block.
    return obiOntology.OBOOntology(open(file_name, "rb"))
265
266
class AtlasCondition(object):
    """ Abstract base of the Gene Expression Atlas query conditions.

    Both methods raise NotImplementedError here and are meant to be
    overridden in subclasses.

    """
    def validate(self):
        """ Check the condition (to be implemented in a subclass).
        """
        raise NotImplementedError()

    def rest(self):
        """ Return the REST query part for the condition (to be
        implemented in a subclass).
        """
        raise NotImplementedError()
279   
280   
class AtlasConditionList(list, AtlasCondition):
    """ A list of AtlasCondition instances, itself usable as a condition.
    """
    def validate(self):
        """ Call `validate` on every condition in the list.
        """
        for condition in self:
            condition.validate()

    def rest(self):
        """ Return the REST parts of all the conditions joined with '&'.
        """
        parts = [condition.rest() for condition in self]
        return "&".join(parts)
290
class AtlasConditionGeneProperty(AtlasCondition):
    """ An atlas gene filter condition.

    :param property: Property of the gene. If None or "" all properties
        will be searched.
    :param qualifier: Qualifier can be 'Is' or 'IsNot'
    :param value: The value to search for; a string or a list/tuple of
        strings. Spaces are replaced with '+' and multiple values are
        joined with '+'.

    A ValueError is raised if `value` is neither a string nor a
    list/tuple. An AssertionError is raised if `property` or `qualifier`
    is not one of the known values.

    Example ::

        >>> # Condition on a gene name
        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
        >>> # Condition on genes from a GO Term
        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
        >>> # Condition on disease association
        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")

    """
    def __init__(self, property, qualifier, value):
        self.property = property or ""
        self.qualifier = qualifier
        if isinstance(value, basestring):
            values = [value]
        elif isinstance(value, (list, tuple)):
            values = value
        else:
            raise ValueError(value)
        # Escape the spaces in every item, not only in a plain string value.
        self.value = "+".join(item.replace(" ", "+") for item in values)

        self.validate()

    def validate(self):
        """ Check that the property and the qualifier are known.

        AssertionError is raised explicitly (instead of using the
        `assert` statement) so the check is not removed when running with
        ``python -O``, while the exception type stays the same.

        """
        if self.property not in GENE_FILTERS + [""]:
            raise AssertionError("Unknown gene property %r" % (self.property,))
        if self.qualifier not in GENE_FILTER_QUALIFIERS + [""]:
            raise AssertionError("Unknown qualifier %r" % (self.qualifier,))

    def rest(self):
        """ Return the condition as 'gene{property}{qualifier}={value}'.
        """
        return "gene{property}{qualifier}={value}".format(
            property=self.property,
            qualifier=self.qualifier,
            value=self.value)
327       
328       
class AtlasConditionExperimentalFactor(AtlasCondition):
    """ An atlas experimental factor filter condition.

    :param factor: EFO experimental factor
    :param regulation: "up", "down" or "updown"
    :param n: Minimum number of experiments with this condition
    :param value: Experimental factor value

    An AssertionError is raised if `regulation` is not one of the values
    listed above.

    NOTE(review): this docstring used to list "any" and "none" as
    regulation values as well, but `validate` has never accepted them --
    confirm against the Atlas API before extending the check.

    Example ::

        >>> # Any genes up regulated in at least 3 experiments involving cancer.
        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")

    """
    def __init__(self, factor, regulation, n, value):
        self.factor = factor
        self.regulation = regulation
        self.n = n
        self.value = value
        self.validate()

    def validate(self):
        """ Check that the regulation is known.

        AssertionError is raised explicitly (instead of using the
        `assert` statement) so the check is not removed when running with
        ``python -O``, while the exception type stays the same.

        """
        # TODO: validate the factor and value
        # assert(self.factor in efv_ontology())
        if self.regulation not in ["up", "down", "updown"]:
            raise AssertionError("Unknown regulation %r" % (self.regulation,))

    def rest(self):
        """ Return the condition as '{regulation}{n}In{factor}={value}'.

        Spaces in a string value are replaced with '+' (as the gene
        property and organism conditions do) so the result can be put in
        a url as is; the `value` attribute itself is left unchanged.

        """
        value = self.value
        if isinstance(value, basestring):
            value = value.replace(" ", "+")
        return "{regulation}{n}In{factor}={value}".format(
            regulation=self.regulation,
            n=self.n,
            factor=self.factor,
            value=value)
359       
class AtlasConditionOrganism(AtlasCondition):
    """ Condition on organism.

    :param organism: Name of the organism; must be one of the names in
        ATLAS_ORGANISMS, otherwise an AssertionError is raised.

    """
    def __init__(self, organism):
        self.organism = organism
        self.validate()

    def validate(self):
        """ Check that the organism is known.

        AssertionError is raised explicitly (instead of using the
        `assert` statement) so the check is not removed when running with
        ``python -O``, while the exception type stays the same.

        """
        if self.organism not in ATLAS_ORGANISMS:
            raise AssertionError("Unknown organism %r" % (self.organism,))

    def rest(self):
        """ Return the condition as 'species={name}', with the name lower
        cased and its spaces replaced with '+'.
        """
        return "species={0}".format(self.organism.replace(" ", "+").lower())
372       
373   
def query_atlas_simple(genes=None, regulated=None, organism=None, format="json"):
    """ A simple Atlas query.

    :param genes: A gene name or a list of gene names (matched with the
        "Gene" property); no gene condition is added if None.
    :param regulated: Regulation passed to
        AtlasConditionExperimentalFactor (with an empty factor and value
        and n=1); no such condition is added if empty or None.
    :param organism: Organism name passed to AtlasConditionOrganism; no
        organism condition is added if empty or None.
    :param format: Format of the results, passed to the connection query.

    Returns the response stream of the query.

    Example::

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
        <addinfourl at ...

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulated="up", organism="Mus musculus")
        <addinfourl at ...

    """
    conditions = AtlasConditionList()
    # `genes` is optional: building the gene condition from the default
    # None would raise a ValueError.
    if genes is not None:
        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
    if regulated:
        conditions.append(AtlasConditionExperimentalFactor("", regulated, 1, ""))
    if organism:
        conditions.append(AtlasConditionOrganism(organism))
    connection = GeneExpressionAtlasConenction()
    results = connection.query(conditions, format=format)
    return results
395
396"""\
TODO: can this be implemented as query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="...", downInEFO="...")?
      This needs a full list of the accepted factors.
399"""
400
def query_atlas(condition, format="json"):
    """ Query Atlas based on a `condition` (instance of AtlasCondition)

    The query is run on a GeneExpressionAtlasConenction created with the
    default settings and its response stream is returned.

    Example::

        >>> #condition = AtlasConditionGeneProperty()

    """
    return GeneExpressionAtlasConenction().query(condition, format=format)
412
413
def get_atlas_info(genes, organism):
    """ Return the Atlas expression summaries for `genes` in `organism`.

    :param genes: A gene name or a list of gene names (searched in all
        gene properties).
    :param organism: Organism name (see ATLAS_ORGANISMS).

    Returns a tuple of three summaries, as returned by
    `collect_ef_summary`, for the "organism_part", "disease_state" and
    "cell_type" experimental factors (in this order).

    """
    import json

    genes_condition = AtlasConditionGeneProperty("", "Is", genes)
    org_condition = AtlasConditionOrganism(organism)
    condition = AtlasConditionList([genes_condition, org_condition])
    response = query_atlas(condition, format="json")
    # Release the connection as soon as the response is read.
    try:
        result = json.load(response)
    finally:
        response.close()

    org_part = collect_ef_summary(result, "organism_part")
    disease_state = collect_ef_summary(result, "disease_state")
    cell_type = collect_ef_summary(result, "cell_type")

    return org_part, disease_state, cell_type
427   
def collect_ef_summary(info, ef):
    """ Summarize the query_atlas results in `info` for the experimental
    factor `ef`.

    Returns a dict mapping a gene name to a dict that maps a factor value
    ("efv") to the (upExperiments, downExperiments) tuple. Expressions
    where both counts are false (e.g. 0) are left out, and so are genes
    without any remaining expression.

    """
    summary = {}
    for record in info["results"]:
        gene = record["gene"]
        for expression in record["expressions"]:
            if expression["ef"] != ef:
                continue
            efv = expression["efv"]
            counts = (expression["upExperiments"],
                      expression["downExperiments"])
            if any(counts):
                # The gene entry is created only on the first kept value.
                summary.setdefault(gene["name"], {})[efv] = counts
    return summary
448   
449   
if __name__ == "__main__":
    # Manual checks (need network access):
    #    print get_atlas_info(['Pou5f1', 'Dppa3'], 'Mus musculus')
    #
    #    print get_atlas_info(['PDLIM5', 'FGFR2' ], 'Homo sapiens')

    # Run the doctests of the module; `conn` is made available to the
    # examples through the extra globals.
    import doctest

    doctest_globals = {"conn": ArrayExpressConnection()}
    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=doctest_globals)
461   
462   
463       
Note: See TracBrowser for help on using the repository browser.