source: orange-bioinformatics/orangecontrib/bio/obiArrayExpress.py @ 1885:8d4623044732

Revision 1885:8d4623044732, 59.9 KB checked in by Ales Erjavec <ales.erjavec@…>, 6 months ago (diff)

Fixes for Array Express caching.

Line 
1"""
2===============
3obiArrayExpress
4===============
5
6A python module for accessing the ArrayExpress web services and database.
7
8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a
9database of gene expression experiments that you can query and download.
10
11Example ::
12
13    >>> # Retrieve the object representing experiment with accession E-TABM-25
14    >>> experiment = ArrayExpressExperiment("E-TABM-25")
15    >>> print experiment.accession
16    E-TABM-25
17
18    >>> print experiment.name
19    Transcription profiling of aging in the primate brain
20
21    >>> print experiment.species
22    ['Pan troglodytes']
23
24    >>> print experiment.files
25    [{'kind': ...
26
27    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917'
28    >>> experiment = ArrayExpressExperiment("E-MEXP-2917")
29    >>> table = experiment.fgem_to_table()
30
31
32Low level Array Express query using REST services::
33
34    >>> from Orange.bio import obiArrayExpress
35    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
36    {u'experiments': ...
37
38    >>> obiArrayExpress.query_experiments(keywords='gliobastoma')
39    {u'experiments': ...
40
41    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
42    <xml.etree.ElementTree.ElementTree object ...
43
44.. note:: Currently querying ArrayExpress files only works with the xml format.
45
46.. note:: See the documentation of `query_experiments` for a full set of
47          parameters that these functions accept.
48
49"""
50
51from __future__ import absolute_import
52
53import os
54import urllib2
55import re
56import warnings
57import shelve
58import shutil
59import posixpath
60import json
61from xml.etree.ElementTree import ElementTree
62from StringIO import StringIO
63
64from collections import defaultdict
65from functools import partial
66from contextlib import closing
67
68from Orange.orng import orngServerFiles
69
70parse_json = json.load
71
72
73def parse_xml(stream):
74    """ Parse an xml stream into an instance of
75    `xml.etree.ElementTree.ElementTree`.
76    """
77    return ElementTree(file=stream)
78
79# All searchable fields of ArrayExpress (see query_experiments docstring
80# for a description of the fields)
81ARRAYEXPRESS_FIELDS = \
82    ["keywords",
83     "accession",
84     "array",
85     "ef",
86     "efv",
87     "expdesign",
88     "exptype",
89     "gxa",
90     "pmid",
91     "sa",
92     "species",
93     "expandefo",
94     "directsub",
95     "assaycount",
96     "efcount",
97     "samplecount",
98     "sacount",
99     "rawcount",
100     "fgemcount",
101     "miamescore",
102     "date",
103     "wholewords",
104    ]
105
106
107def _open_shelve(filename, flag="r"):
108    dirname = os.path.dirname(filename)
109    if not os.path.isdir(dirname):
110        os.makedirs(dirname)
111    exists = os.path.exists(filename)
112    if flag in ["r", "w"] and not exists:
113        # needs to be created first
114        # XXX: Race condition
115        s = shelve.open(filename, "c")
116        s.close()
117
118    return shelve.open(filename, flag)
119
120
121class ArrayExpressConnection(object):
122    """
123    A connection to the Array Express.
124
125    Used to construct a REST query and run it.
126
127    :param address: Address of the ArrayExpress API.
128    :param timeout: Timeout for the socket connection.
129
130    .. todo::
131        Implement user login (see Accessing Private Data in API docs)
132
133    """
134
135    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
136    DEFAULT_FORMAT = "json"
137    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress",
138                                              "ArrayExpressCache.shelve")
139    # Order of arguments in the query
140    _ARGS_ORDER = ["keywords", "species", "array"]
141
142    def __init__(self, address=None, timeout=30, cache=None,
143                 username=None, password=None):
144        self.address = address if address is not None else self.DEFAULT_ADDRESS
145        self.timeout = timeout
146        self.cache = cache if cache is not None else self.DEFAULT_CACHE
147        self.username = username
148        self.password = password
149       
150    def format_query(self, **kwargs):
151        """ Format the query arguments.
152       
153        Example ::
154       
155            >>> conn.format_query(gxa=True, efcount=(1, 5))
156            'efcount=[1 TO 5]&gxa=true'
157           
158        """
159        # Formaters:
160        def format_default(val):
161            if isinstance(val, basestring):
162                return val
163            else:
164                return "+".join(val)
165        def format_species(val):
166            return '"%s"' % val.lower()
167        def format_gxa(val):
168            if val:
169                return "true"
170            else:
171                raise ValueError("gxa={0}".format(val))
172        def format_expandefo(val):
173            if val:
174                return "on"
175            else:
176                raise ValueError("expandefo={0}".format(val))
177        def format_true_false(val):
178            return "true" if val else "false"
179        def format_interval(val):
180            if isinstance(val, tuple):
181                return "[{0} TO {1}]".format(*val)
182            else:
183                raise ValueError("Must be an interval argument (min, max)!")
184        def format_date(val):
185            # TODO check if val contains a datetime.date object, assert proper format
186            return format_interval(val)
187        def format_wholewords(val):
188            if val:
189                return "on"
190            else:
191                raise ValueError("wholewords={0}".format(val))
192       
193        formaters = {"species": format_species,
194                     "gxa": format_gxa,
195                     "expandefo": format_expandefo,
196                     "directsub": format_true_false,
197                     "assaycount": format_interval,
198                     "efcount": format_interval,
199                     "samplecount": format_interval,
200                     "sacount": format_interval,
201                     "rawcount": format_interval,
202                     "fgemcount": format_interval,
203                     "miamescore": format_interval,
204                     "date": format_date,
205                     "wholewords": format_wholewords,
206                     }
207        parts = []
208        arg_items = kwargs.items()
209        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
210                         if arg[0] in self._ARGS_ORDER else 100)
211       
212        for key, value in kwargs.iteritems():
213            if key == "format":
214                continue # format is handled in query_url
215            if key not in ARRAYEXPRESS_FIELDS:
216                raise ValueError("Invalid argument name: '{0}'".format(key))
217            if value is not None and value != []:
218                fmt = formaters.get(key, format_default)
219                value = fmt(value)
220                parts.append("{0}={1}".format(key, value))
221                 
222        return "&".join(parts)
223       
224    def query_url(self, what="experiments", **kwargs):
225        """ Return a formated query URL for the query arguments
226       
227        Example ::
228            >>> conn.query_url(accession="E-MEXP-31")
229            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
230           
231        """
232        query = self.format_query(**kwargs)
233        url = posixpath.join(self.address, what)
234        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
235        url = url + ("?" + query if query else "")
236        url = url.replace(" ", "%20")
237        return url
238   
239    def query_url_experiments(self, **kwargs):
240        """ Return a formated experiments query url for the calls arguments
241        """
242        return self.query_url("experiments", **kwargs)
243   
244    def query_url_files(self, **kwargs):
245        """ Return a formated experiments query url for the calls arguments
246        """
247        return self.query_url("files", **kwargs)
248   
249    def query_experiment(self, **kwargs):
250        """ Return an open stream to the experiments query results.
251       
252        .. note:: This member function takes the same arguments as the module
253                  level `query_experiemnts` function.
254         
255        """
256        url = self.query_url_experiments(**kwargs)
257        stream = self._cache_urlopen(url, timeout=self.timeout)
258        return stream
259   
260    def query_files(self, **kwargs):
261        """ Return an open stream to the files query results.
262       
263        .. note:: This member function takes the same arguments as the module
264                  level `query_files` function.
265        """
266        url = self.query_url_files(**kwargs)
267        stream = self._cache_urlopen(url, timeout=self.timeout)
268        return stream
269   
270    def open_file(self, accession, kind="raw", ext=None):
271        """ Return a file handle to experiment data.
272        Possible values for kind:
273            - raw: return the raw data if available
274            - fgem: return the processed data if available
275            - biosamples: a png or svg design image
276            - idf: investigation description
277            - adf: array design description
278            - mageml: MAGE-ML file
279           
280        Example ::
281       
282            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
283            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
284             
285        """
286        stream = self.query_files(accession=accession, format="xml")
287        tree = ElementTree(file=stream)
288        files = tree.findall("experiment/file")
289        for file in files:
290            filekind = file.find("kind").text
291            fileext = file.find("extension").text
292            if filekind.strip() == kind and (fileext.strip() == ext or ext is None): 
293                url = file.find("url").text
294                return self._cache_urlopen(url.strip(), timeout=self.timeout)
295           
296    def _cache_urlopen(self, url, timeout=30):
297        if self.cache is not None:
298            with self.open_cache("r") as cache:
299                if url in cache:
300                    return StringIO(cache[url])
301
302            stream = urllib2.urlopen(url, timeout=timeout)
303            data = stream.read()
304            with self.open_cache("w") as cache:
305                cache[url] = data
306
307            return StringIO(data)
308        else:
309            return urllib2.urlopen(url, timeout=timeout)
310
311    def open_cache(self, flag="r"):
312        if isinstance(self.cache, basestring):
313            try:
314                return closing(_open_shelve(self.cache, flag))
315            except Exception:
316                return fake_closing({})
317        elif hasattr(self.cache, "close"):
318            return closing(self.cache)
319        elif self.cache is None:
320            return fake_closing({})
321        else:
322            return fake_closing(self.cache)
323
324from contextlib import contextmanager
325
326@contextmanager
327def fake_closing(obj):
328    yield obj
329   
330def query_experiments(keywords=None, accession=None, array=None, ef=None,
331                      efv=None, expdesign=None, exptype=None,
332                      gxa=None, pmid=None, sa=None, species=None,
333                      expandefo=None, directsub=None, assaycount=None,
334                      efcount=None, samplecount=None, rawcount=None,
335                      fgemcount=None, miamescore=None, date=None, 
336                      format="json", wholewords=None, connection=None):
337    """ Query Array Express experiments.
338   
339    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
340    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
341    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
342    :param ef: Experimental factor (names of main variables of experiments)
343    :param efv: Experimental factor value (Has EFO expansion)
344    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
345    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
346    :param gxa: If True limit the results to experiments from the Gene
347        Expreission Atlas only.
348    :param pmid: Search by PubMed identifier
349    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
350    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
351   
352    :param expandefo: If True expand the search terms with all its child terms
353        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
354        will be expanded to include for synonyms and sub types of cancer).
355    :param directsub: If True return only experiments submited directly to
356        Array Express else if False return only experiments imported from GEO
357        database (default None, return both)
358    :param assaycount: A two tuple (min, max) for filter on the number of
359        assays (e.g. (1, 5) will return only experiments with at least one
360        and no more then 5 assays).
361    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
362    :param sacount: Filter on the number of sample attribute categories
363    :param rawcount: Filter on the number or raw files
364    :param fgemcount: Filter on the number of final gene expression matrix
365        (processed data) files
366    :param miamescore: Filter on the MIAME complience score (max 5)
367    :param date: Filter by release date
368   
369    Example ::
370   
371        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
372        {u'experiments': ...
373       
374    .. _EFO: http://www.ebi.ac.uk/efo/
375   
376    """
377    if connection is None:
378        connection = ArrayExpressConnection()
379       
380    stream = connection.query_experiment(keywords=keywords, accession=accession,
381                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
382                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
383                directsub=directsub, assaycount=assaycount, efcount=efcount,
384                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
385                miamescore=miamescore, date=date,  format=format,
386                wholewords=wholewords)
387   
388    if format == "json":
389        return parse_json(stream)
390    else:
391        return parse_xml(stream)
392   
393   
394def query_files(keywords=None, accession=None, array=None, ef=None,
395               efv=None, expdesign=None, exptype=None,
396               gxa=None, pmid=None, sa=None, species=None,
397               expandefo=None, directsub=None, assaycount=None,
398               efcount=None, samplecount=None, rawcount=None,
399               fgemcount=None, miamescore=None, date=None, 
400               format="json", wholewords=None, connection=None):
401    """ Query Array Express files.
402   
403    This function accepts the same arguments as `query_experiments`.
404   
405    Example ::
406   
407        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
408        <xml.etree.ElementTree.ElementTree object ...
409       
410    .. todo:: why does the json interface not work.
411                       
412    """
413    if connection is None:
414        connection = ArrayExpressConnection()
415   
416    stream = connection.query_files(keywords=keywords, accession=accession,
417                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
418                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
419                directsub=directsub, assaycount=assaycount, efcount=efcount,
420                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
421                miamescore=miamescore, date=date,  format=format,
422                wholewords=wholewords)
423   
424    if format == "json":
425        return parse_json(stream)
426    else:
427        return parse_xml(stream)
428   
429   
430def open_file(accession, kind="raw", ext=None, repo_dir=None):
431    """ Open a file for the experiment.
432     
433    Example ::
434   
435        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
436       
437    """
438    raise NotImplementedError
439
440"""\
441MAGE-TAB convinence functions, classes
442======================================
443"""
444
445IDF_SINGLE_TAGS = \
446    ["Investigation Title",
447     "Date of Experiment",
448     "Public Release Date",
449     "Experiment Description",
450    ]
451
452def parse_idf(file):
453    """ Parse an idf.txt (Investigation Description  Format) formated file.
454    Return a list of tuples where the first element is the tag (first column)
455    and the second element is a list of all tag values.
456   
457    """
458    if isinstance(file, basestring):
459        file = open(file, "rb")
460    data = file.read()
461    lines = data.splitlines()
462    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
463    parsed = [(line[0], line[1:]) for line in lines]
464    return parsed
465   
466def parse_sdrf(file):
467    """ Parse an sdfr formated file. Return a tuple with the first element
468    a list of header values and the second element is a list of all rows
469    (a row is a list of string values).
470   
471    """
472    if isinstance(file, basestring):
473        file = open(file, "rb")
474    data = file.read()
475    lines = data.splitlines()
476    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
477    header = [h for h in lines[0] if h]
478    rows = [line[:len(header)] for line in lines[1:]]
479    assert(all([len(r) == len(header) for r in rows]))
480    return header, rows
481   
482def parse_adf(file):
483    pass
484
485def parse_data_matrix(file):
486    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
487    elements are:
488        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
489        - a list of quantitation type for header values (e.g. ["log2 ratio", "log2 ratio", ...])
490        - a (row REF, row names list) tuple ("e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
491        - a list of list matrix with values (as strings)
492       
493    """
494    if isinstance(file, basestring):
495        file = open(file, "rb")
496    data = file.read()
497    lines = data.splitlines()
498    lines = [line.split("\t") for line in lines if line.strip()]
499    header = lines[0]
500    header_ref, header = header[0], header[1:]
501    line2 = lines[1]
502    row_ref, quant_type = line2[0], line2[1:]
503    row_names, rows = [], []
504    for line in lines[2:]:
505        r_name, row = line[0], line[1:]
506        row_names.append(r_name)
507        rows.append(row)
508       
509    return ((header_ref, header),
510            quant_type,
511            (row_ref, row_names),
512            rows) 
513   
514class InvestigationDesign(dict):
515    """ Investigation design (contains the contents of the .idf).
516   
517    Example ::
518   
519        >>> idf = InvestigationDesign("foobar.idf")
520        >>> print idf.investigation_title
521        foo investigation
522        >>> print idf.experimental_design
523        ['fubar', 'snafu']
524        >>> print idf.sdrf_file
525        ['foobar.sdrf']
526       
527    """
528    def __init__(self, idf_file=None):
529        idf = parse_idf(idf_file)
530        self._idf = idf
531        self.update(dict(idf))
532        for tag, values in idf:
533            if tag in IDF_SINGLE_TAGS:
534                values = values[0] if values else None
535            ttag = self.transform_tag(tag)
536            setattr(self, ttag, values)
537       
538    def transform_tag(self, tag):
539        """ Transform the tag into a proper python attribute name by
540        replacing all spaces and special characters (e.g '[', ']' into
541        underscores).
542       
543        """
544        toreplace = [" ", "-", "[", "]"]
545        for s in toreplace:
546            tag = tag.replace(s, "_")
547        return tag.lower()
548           
549    def __getitem__(self, tag):
550        """ Return the tag values
551       
552        Example ::
553       
554            >>> idf["Investigation Title"]
555            'foo investigation'
556           
557        """
558        try:
559            return self._idf_dict[tag]
560        except KeyError:
561            pass
562       
563        ttag = self.transform_tag(tag)
564        if hasattr(self, ttag):
565            return getattr(self, ttag)
566        else:
567            raise KeyError(tag)
568       
569class SampleDataRelationship(object):
570    """ Sample-Data Relationship (contains the contents of the .sdrf file).
571   
572    Example ::
573   
574        >>> sdr = SampleDataRelationship("foobar.sdrf")
575        >>> sdr.source_name()
576        ['foo', ...
577       
578        >>> sdr.sample_name()
579        ['sampled foo', ...
580       
581        >>> sdr.extract_protocol_ref()
582        ['bar the foo', ...
583       
584        >>> sdr.derived_array_data_matrix_file()
585        ['foobar.data.txt', ...
586       
587        >>>
588         
589    """
590   
591    # Nodes an edges
592    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
593                   "Labeled Extract Name", "Hybridization Name",
594                   "Assay Name", "Scan Name", "Normalization Name",
595                   "Array Data File", "Derived Array Data File",
596                   "Array Data Matrix File", "Derived Array Data Matrix File",
597                   "Image File", "Protocol REF"]
598   
599    # Attributes for nodes and edges
600    NODE_EDGE_ATTRIBUTES = \
601    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
602     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
603     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
604     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
605     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
606     "Assay Name": ["Technology Type", "Comment"],
607     "Scan Name": ["Comment"],
608     "Normalization Name": ["Comment"],
609     "Array Data File": ["Comment"],
610     "Derived Array Data File": ["Comment"],
611     "Array Data Matrix File": ["Comment"],
612     "Derived Array Data Matrix File": ["Comment"],
613     "Image File": ["Comment"],
614     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
615     }
616   
617    # Attributes
618    ATTRIBUTE_COLUMNS = \
619    {"Characteristics []": ["Unit", "Term Source REF"],
620     "Provider": ["Comment"],
621     "Material Type": ["Term Source REF"],
622     "Label": ["Term Source REF"],
623     "Array Design File": ["Comment"],
624     "Array Design REF": ["Term Source REF", "Comment"],   
625     "Technology Type": ["Term Source REF"],
626     "Factor Value [] ()": ["Unit", "Term Source REF"],
627     "Performer": ["Comment"],
628     "Date": [],
629     "Parameter Value []": ["Unit", "Comment"],
630     "Unit []": ["Term Source REF"],
631     "Description": [],
632     "Term Source REF": ["Term Accession Number"],
633     "Term Accession Number": [],
634     "Comment []": []
635     }
636    def __init__(self, sdrf_file=None):
637        header, rows = parse_sdrf(sdrf_file)
638        self.header = header
639        self.rows = rows
640       
641    def transform_tag(self, tag):
642        """ Transform the tag into a proper python attribute name by
643        replacing all spaces and special characters (e.g '[', ']' into
644        underscores).
645       
646        """
647        toreplace = [" ", "-", "[", "]"]
648        for s in toreplace:
649            tag = tag.replace(s, "_")
650        return tag.lower()
651   
652    def _subsection(self, name):
653        """ Return the named subsection (name must be from the
654        NODES_EDGES list).
655       
656        """
657        idx = self.NODES_EDGES.index(name)
658        start = self.header.index(name)
659        end = -1
660        for name in self.NODES_EDGES[idx + 1:]:
661            if name in self.header[start + 1:]:
662                end = self.header.index(name, start + 1)
663                break
664        return self.header[start:end], [r[start:end] for r in self.rows]
665   
666    def _column(self, name):
667        """ Return the named column.
668        """
669        if isinstance(name, basestring):
670            index = self.header.index(name)
671        else:
672            index = name
673        return [r[index] for r in self.rows]
674       
675    def source(self):
676        """ Return the Source subsection
677        """
678        return self._subsection("Source Name")
679       
680    def source_name(self):
681        """ Return the Source Name subsection
682        """
683        return self._column("Source Name")
684       
685    def sample(self):
686        """ Return the Sample subsection
687        """
688        return self._subsection("Sample Name")
689       
690    def sample_name(self):
691        """ Return the Sample Name subsection
692        """
693        return self._column("Sample Name")
694       
695    def extract(self):
696        """ Return the Extract subsection
697        """
698        return self._subsection("Extract Name")
699       
700    def extract_name(self):
701        """ Return the Extract Name subsection
702        """
703        return self._column("Extract Name")
704       
705    def labeled_extract(self):
706        """ Return the Labeled Extract subsection
707        """
708        return self._subsection("Labeled Extract Name")
709       
710    def labeled_extract_name(self):
711        """ Return the Labeled Extract Name subsection
712        """
713        return self._column("Labeled Extract Name")
714       
715    def hybridization(self):
716        """ Return the Hybridization subsection.
717        """
718        return self._subsection("Hybridization Name")
719       
720    def hybridization_name(self):
721        """ Return the Hibridization Name subsection.
722        """
723        return self._column("Hybridization Name")
724       
725    def assay(self):
726        """ Return the Assay subsection
727        """
728        return self._subsection("Assay Name")
729       
730    def assay_name(self):
731        """ Return the Assay Name subsection
732        """
733        return self._column("Assay Name")
734       
735    def scan(self):
736        """ Return the Scan subsection
737        """
738        return self._subsection("Scan Name")
739       
740    def scan_name(self):
741        """ Return the Scan name subsection
742        """
743        return self._column("Scan Name")
744       
745    def normalization(self):
746        """ Return the Normalization subsection.
747        """
748        return self._subsection("Normalization Name")
749       
750    def normalization_name(self):
751        """ Return the Normalization Name subsection.
752        """
753        return self._column("Normalization Name")
754         
755    def array_data(self):
756        """ Return the Array Data subsection
757        """
758        return self._subsection("Array Data File")
759   
760    def array_data_file(self):
761        """ Return the Array Data File subsection
762        """
763        return self._column("Array Data File")
764       
765    def derived_array_data(self):
766        """ Return the Derived Array Data subsection
767        """
768        return self._subsection("Derived Array Data File")
769   
770    def derived_array_data_file(self):
771        """ Return the Derived Array Data File subsection
772        """
773        return self._column("Derived Array Data File")
774       
775    def array_data_matrix(self):
776        """ Return the Array Data Matrix subsection.
777        """
778        return self._subsection("Array Data Matrix File")
779   
780    def array_data_matrix_file(self):
781        """ Return the Array Data Matrix File subsection.
782        """
783        return self._column("Array Data Matrix File")
784       
785    def derived_array_data_matrix(self):
786        """ Return the Derived Array Data Matrix subsection.
787        """
788        return self._subsection("Derived Array Data Matrix File")
789   
790    def derived_array_data_matrix_file(self):
791        """ Return the Derived Array Data Matrix File subsection.
792        """
793        return self._column("Derived Array Data Matrix File")
794       
795    def image(self):
796        """ Return the Image subsection
797        """
798        return self._subsection("Image File")
799   
800    def image_file(self):
801        """ Return the Image File subsection.
802        """
803        return self._column("Image File")
804       
805class ArrayDesign(object):
806    """ Arary design (contains the contents of the .adf file).
807    """
808    def __init__(self, adf_file=None):
809        adf = parse_adf(adf_file)
810        self._adf = adf
811   
812def _is_float(str):
813    try:
814        float(str)
815        return True
816    except ValueError:
817        return False
818   
819def _is_continuous(items, check_count=100):
820    """ Are the strings in items continous numbers.
821    """
822    count = 0
823    i = 0
824    for i, item in enumerate(items):
825        if _is_float(item):
826            count += 1
827        if i >= check_count:
828            break
829    return count >= i * 0.5
830   
831def processed_matrix_to_orange(matrix_file, sdrf=None):
832    """ Load a single processed matrix file in to an Orange.data.Table
833    instance.
834    """
835    import numpy
836    import Orange
837   
838    if isinstance(matrix_file, basestring):
839        matrix_file = open(matrix_file, "rb")
840
841    header, quant_type, rows, matrix = parse_data_matrix(matrix_file)
842    header_ref, header = header
843    row_ref, rows = rows
844   
845    matrix = numpy.array(matrix, dtype=object)
846   
847   
848    features = []
849    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc
850         
851    for header_name, quant, column in zip(header, quant_type, matrix.T):
852        if _is_continuous(column):
853            feature = Orange.feature.Continuous(header_name)
854            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?'
855        else:
856            values = set(column)
857            feature = Orange.feature.Discrete(header_name, values=sorted(values))
858        feature.attributes["quantitation type"] = quant
859        features.append(feature)
860       
861    row_ref_feature = Orange.feature.String(row_ref)
862    domain = Orange.data.Domain(features, None)
863    domain.addmeta(Orange.feature.Descriptor.new_meta_id(), row_ref_feature)
864   
865    table = Orange.data.Table(domain, [list(row) for row in matrix])
866    table.setattr("header_ref", header_ref)
867    # Add row identifiers
868    for instance, row in zip(table, rows):
869        instance[row_ref_feature] = row
870
871    if sdrf is not None:
872        pattern = re.compile(r"((Characteristics)|(Factor Value)|(Parameter Value)) \[(?P<name>.*?)\].*")
873        # Row name in sdrf
874        row_name = header_ref[:header_ref.find(" REF")] + " Name"
875        # feature names as listed in sdrf
876        feature_names = sdrf._column(row_name)
877        annotations = defaultdict(partial(defaultdict, set))
878        for i, header in enumerate(sdrf.header):
879            match = pattern.match(header)
880            if match:
881                name = match.group("name")
882                for val, feature_name in zip(sdrf._column(i), feature_names):
883                    annotations[feature_name][name].add(val)
884
885        def to_str(values):
886            if len(values) > 1:
887                return str(list(values))
888            else:
889                return str(list(values)[0])
890
891        for feature in table.domain.features:
892            feature.attributes.update([(key, to_str(value)) for key, value in \
893                                       annotations[feature.name].items()])
894    return table
895
896
897def mage_tab_to_orange(idf_filename):
898    """Convert an MAGE-TAB annotated experiment into an Orange.data.Table
899    instance (assumes all the associated MAGE-TAB files are in the same
900    directory.
901
902    """
903    dirname = os.path.dirname(idf_filename)
904    idf = InvestigationDesign(idf_filename)
905   
906    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
907    sdrf = SampleDataRelationship(sdrf_filename)
908   
909    data_matrices = set(sdrf.derived_array_data_matrix_file())
910    data_matrices = [name for name in data_matrices if name.strip()]
911   
912    tables = []
913    for filename in data_matrices:
914        matrix_file = os.path.join(dirname, filename)
915        table = processed_matrix_to_orange(matrix_file, sdrf)
916        tables.append(table)
917    table = hstack_tables(tables)
918
919    return table
920
921
922def hstack_tables(tables):
923    """ Stack the tables horizontaly.
924    """
925    import Orange
926    max_len = max([len(table) for table in tables])
927    stacked_features = []
928    stacked_values = [[] for i in range(max_len)]
929    stacked_meta_features = []
930    stacked_meta_values = [{} for i in range(max_len)]
931   
932    for table in tables:
933        stacked_features.extend(table.domain.variables)
934        stacked_meta_features.extend(table.domain.getmetas().items())
935       
936        for i, instance in enumerate(table):
937            stacked_values[i].extend(list(instance))
938            stacked_meta_values[i].update(instance.getmetas())
939           
940        # Fill extra lines with unknowns
941        for i in range(len(table), max_len):
942            stacked_values[i].extend(["?"] * len(table.domain.variables))
943       
944    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
945    domain.addmetas(dict(set(stacked_meta_features)))
946    table = Orange.data.Table(domain, stacked_values)
947   
948    # Add meta attributes
949    for instance, metas in zip(table, stacked_meta_values):
950        for m, val in metas.iteritems():
951            instance[m] = val
952           
953    return table
954   
955def _dictify(element):
956    """ 'Dictify' and xml.etree.Element.Element instance by taking all
957    subnode tags as keys and the corresponding text values as items i.e. turn
958    `<node><bla>foo</bla><a>b</b></node>` into a {"bla":"foo", "a":b"}
959       
960    """
961    if element is None:
962        element = []
963    dict = {}
964    strip = lambda s: s.strip() if s else s
965    for node in element:
966        dict[node.tag] = strip(getattr(node, "text", None))
967    return dict
968   
969class SearchableList(list):
970    """ A list with an advanced `search` method
971    """
972    def search(self, **kwargs):
973        """ Search the list for elements with members that correspond the
974        supplied keyword arguments.
975       
976            >>> foo.bar = "foo"
977            >>> list = SearchableList([foo, bar])
978            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo"
979            [<__main__.foo object ...
980
981        """
982        ret = []
983        for obj in self:
984            for member, value in kwargs.items():
985                if hasattr(obj, member) and getattr(obj, member) == value:
986                    ret.append(obj)
987        return ret
988   
989class ArrayExpressExperiment(object):
990    """ An convinience class representing an Array Express Experiment.
991   
992    Example ::
993   
994        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
995        >>> print ae.name
996        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
997       
998        >>> for file in ae.files:
999        ...     print file["name"], file["url"]
1000        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
1001        ...
1002       
1003        >>> table = ae.fgem_to_table() # Retieve the experiment data table
1004           
1005    """
1006   
1007    def __init__(self, accession, connection=None):
1008        self.accession = accession
1009        self.connection = connection
1010        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
1011        experiments = tree.findall("experiment")
1012        # find the exact match (more then one experiment can be listed in the query result)
1013        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1014        self._experiment = experiment = experiments[0]
1015       
1016        self.species = [e.text for e in experiment.findall("species")]
1017        bool_values = {"true": True, "false": False}
1018        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
1019        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]
1020       
1021        self.sampleattributes = []
1022        for sa in experiment.findall("sampleattribute"):
1023            category = sa.find("category").text.strip()
1024            values = [val.text for val in sa.findall("value")]
1025            self.sampleattributes.append((category, values))
1026           
1027        self.experimentalfactors = []
1028        for ef in experiment.findall("experimentalfactor"):
1029            name = ef.find("name").text.strip()
1030            values = [val.text.strip() for val in ef.findall("values")]
1031            self.experimentalfactors.append((name, values))
1032           
1033        self.miamescores = _dictify(experiment.find("miamescores"))
1034           
1035        self.id = experiment.find("id").text
1036        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
1037        self.name = experiment.find("name").text
1038        self.experimenttype = experiment.find("experimenttype").text.strip()
1039        self.releasedate = experiment.find("releasedate").text
1040        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
1041        self.samples = int(experiment.find("samples").text)
1042        self.assays = int(experiment.find("assays").text)
1043       
1044        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
1045           
1046        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
1047        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
1048        self.provider = [_dictify(e) for e in experiment.findall("provider")]
1049       
1050        self.experimentdesign = []
1051        for expd in experiment.findall("experimentdesign"):
1052            self.experimentdesign.append(expd.text)
1053           
1054        self.description = [_dictify(e) for e in experiment.findall("description")]
1055       
1056        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
1057        experiments = tree.findall("experiment")
1058        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1059        experiment = experiments[0]
1060        files = experiment.findall("file")
1061        self.files = [_dictify(file) for file in files]
1062       
1063    def _download_processed(self):
1064        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
1065       
1066        .. todo:: What about the raw data files (we need converters for other file types)
1067        """
1068        assert(self.fgemdatafiles)
1069        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
1070        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
1071        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
1072        assert(len(files) == 3)
1073       
1074        for type, file in files.iteritems():
1075            url = file["url"].strip()
1076            rest, basename = os.path.split(url)
1077            _, dirname = os.path.split(rest)
1078           
1079            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1080            try:
1081                os.makedirs(repo_dir)
1082            except OSError:
1083                pass
1084            local_filename = os.path.join(repo_dir, basename)
1085            stream = urllib2.urlopen(url)
1086            shutil.copyfileobj(stream, open(local_filename, "wb"))
1087           
1088            if file["extension"] == "zip":
1089                import zipfile
1090                zfile = zlib.ZipFile(local_filename)
1091                zfile.extractall(repo_dir)
1092            elif file["extension"] == "gz":
1093                import gzip
1094                gzfile = gzip.open(local_filename)
1095                gzfile.extractall(repo_dir)
1096            elif file["extension"] in ["tgz", "tar"]:
1097                import tarfile
1098                tfile = tarfile.TarFile(local_filename)
1099                tfile.extractall(repo_dir)
1100            elif file["extension"] == "txt":
1101                pass
1102            else:
1103                raise ValueError("Unknown extension ('{0}').".format(basename))
1104           
1105    def _download_file(self, url, extract=True):
1106        """ Download the `file` from the ArrayExpress saving it to a local
1107        repository directory.
1108         
1109        """
1110        rest, basename = posixpath.split(url)
1111        dirname = posixpath.basename(rest)
1112        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1113        try:
1114            os.makedirs(repo_dir)
1115        except OSError:
1116            pass
1117        stream = urllib2.urlopen(url)
1118        local_filename = os.path.join(repo_dir, basename)
1119        shutil.copyfileobj(stream, open(local_filename, "wb"))
1120       
1121        if extract:
1122            _, extension = os.path.splitext(local_filename)
1123            if extension == ".zip":
1124                import zipfile
1125                zfile = zipfile.ZipFile(local_filename)
1126                zfile.extractall(repo_dir)
1127            elif extension == ".gz":
1128                import gzip
1129                gzfile = gzip.open(local_filename)
1130                gzfile.extractall(repo_dir)
1131            elif extension in [".tgz"]:
1132                import tarfile
1133                tfile = tarfile.TarFile(local_filename)
1134                tfile.extractall(repo_dir)
1135            elif extension == ".txt":
1136                pass
1137            else:
1138                raise ValueError("Unknown extension ('{0}').".format(basename))
1139           
1140    def _is_local(self, url):
1141        """ Is the `url` stored in the local repository.
1142        """
1143        return os.path.exists(self._local_filepath(url))
1144   
1145    def _local_filepath(self, url):
1146        """ Return the local file path for url.
1147        """
1148        rest, basename = posixpath.split(url)
1149        dirname = posixpath.basename(rest)
1150        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1151   
1152    def _open(self, url):
1153        """ Return an open file like handle to url (ArrayExpress file).
1154        The file is cached in the local repository for future access.
1155       
1156        """
1157        if not self._is_local(url):
1158            self._download_file(url, extract=True)
1159        file = self._local_filepath(url)
1160        return open(file, "rb")
1161   
1162    def _search_files(self, kind=None, extension=None):
1163        """ Search files by `kind` and `extension`.
1164        """
1165        res = []
1166        for file in self.files:
1167            kind_match = kind == file.get("kind") or kind is None
1168            extension_match = extension == file.get("extension") or extension is None
1169           
1170            if kind_match and extension_match:
1171                res.append(file)
1172        return res
1173       
1174    def array_design(self):
1175        """ Return a list of `ArrayDesign` instances used in this experiment.
1176        """
1177        files = [f for f in self.files if f.get("kind") == "adf" and \
1178                 f.get("extension") == "txt"]
1179       
1180        array_design = []
1181        for file in files:
1182            url = file.get("url")
1183            if not self._is_local(url):
1184                self._download_file(url)
1185            array_design.append(ArrayDesign(self._open(url)))
1186        return array_design
1187       
1188    def investigation_design(self):
1189        """ Return an `InvestigationDesgin` instance for this experiment
1190        """
1191        files = [f for f in self.files if f.get("kind") == "idf" and \
1192                 f.get("extension") == "txt"]
1193        if not files:
1194            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1195        file = files[0]
1196        return InvestigationDesign(self._open(file.get("url")))
1197       
1198       
1199    def sample_data_relationship(self):
1200        """ Return an `SampleDataRelationship` instance describing this experiment.
1201        """
1202        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1203                 f.get("extension") == "txt"]
1204        if not files:
1205            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
1206        file = files[0]
1207        return SampleDataRelationship(self._open(file.get("url")))
1208       
1209    def fgem_to_table(self):
1210        """ Retrieve the processed matrix from the Array Express ftp
1211        server and convert it to an :class:`Orange.data.Table` instance.
1212         
1213        """
1214        assert(self.fgemdatafiles)
1215        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1216        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1217        sdrf = self._search_files("sdrf", "txt")
1218        if sdrf:
1219            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1220            if "Derived Array Data Matrix File" not in sdrf.header:
1221                twocol = self._search_files("twocolumn", "txt")
1222                if twocol:
1223                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1224        matrix_file = self._search_files("fgem")[0]
1225        self._open(matrix_file.get("url")) 
1226        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1227       
1228        idf_file = self._search_files("idf", "txt")[0]
1229        self._open(idf_file.get("url")) # To download if not cached
1230        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
1231       
1232   
1233"""\
1234Gene Expression Atlas
1235---------------------
1236
1237.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1238
1239`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1240gene expression experiments in Array Express Archive.
1241
1242Use `query_atlas_simple` for simple querys.
1243
1244Example (query human genes for experiments in which they are up regulated) ::
1245
1246    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1247    {u'...
1248   
1249Or use the `AtlasCondition` subclasses in this module to construct a more
1250advanced query and use the `query_atlas` function.
1251
1252Example (query human genes annotated to the GO term 'transporter activity'
1253that are up regulated in the liver in at least three experiments) ::
1254
1255    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1256    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1257    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1258    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1259    >>> query_atlas(cond_list)
1260    {u'...
1261   
1262"""
1263
1264
1265class GeneExpressionAtlasConenction(object):
1266    """
1267    A connection to Gene Expression Atlas database.
1268
1269    :param address:
1270        Address of the GXA server (default: http://www.ebi.ac.uk/gxa/api/deprecated).
1271    :param timeout:
1272        Socket timeout (default 30).
1273    :param cache:
1274        A dict like object to use as a cache.
1275
1276    """
1277    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/gxa/api/deprecated"
1278
1279    DEFAULT_CACHE = orngServerFiles.localpath(
1280        "ArrayExpress", "GeneAtlasCache.shelve")
1281
1282    def __init__(self, address=None, timeout=30, cache=None):
1283        """
1284        Initialize the connection.
1285
1286        """
1287        self.address = address if address is not None else self.DEFAULT_ADDRESS
1288        self.timeout = timeout
1289        if cache is None:
1290            cache = self.DEFAULT_CACHE
1291
1292        self.cache = cache
1293
1294    def query(self, condition, format="json", start=None, rows=None, indent=False):
1295        url = self.address + "?" + condition.rest()
1296        if start is not None and rows is not None:
1297            url += "&start={0}&rows={1}".format(start, rows)
1298        url += "&format={0}".format(format)
1299        if indent:
1300            url += "&indent"
1301#        print url
1302        if self.cache is not None:
1303            return self._query_cached(url)
1304        else:
1305            return urllib2.urlopen(url)
1306
1307    def _query_cached(self, url):
1308        if self.cache is not None:
1309            with self.open_cache("r") as cache:
1310                if url in cache:
1311                    return StringIO(cache[url])
1312
1313            response = urllib2.urlopen(url)
1314            contents = response.read()
1315            with self.open_cache("w") as cache:
1316                cache[url] = contents
1317
1318            return StringIO(contents)
1319        else:
1320            return urllib2.urlopen(url)
1321
1322    def open_cache(self, flag="r"):
1323        """
1324        Return a context manager for a dict like object.
1325        """
1326        if isinstance(self.cache, basestring):
1327            try:
1328                return closing(_open_shelve(self.cache, flag))
1329            except Exception:
1330                return fake_closing({})
1331        else:
1332            return fake_closing(self.cache)
1333
1334
1335# Names of all Gene Property filter names
1336GENE_FILTERS = \
1337    ["Name", # Gene name
1338     "Goterm", #Gene Ontology Term
1339     "Interproterm", #InterPro Term
1340     "Disease", #Gene-Disease Assocation
1341     "Keyword", #Gene Keyword
1342     "Protein", #Protein
1343
1344     "Dbxref", #Other Database Cross-Refs
1345     "Embl", #EMBL-Bank ID
1346     "Ensfamily", #Ensembl Family
1347     "Ensgene", #Ensembl Gene ID
1348
1349     "Ensprotein", #Ensembl Protein ID
1350     "Enstranscript", #Ensembl Transcript ID
1351     "Goid", #Gene Ontology ID
1352     "Image", #IMAGE ID
1353     "Interproid", #InterPro ID
1354     "Locuslink", #Entrez Gene ID
1355
1356     "Omimid", #OMIM ID
1357     "Orf", #ORF
1358     "Refseq", #RefSeq ID
1359     "Unigene", #UniGene ID
1360     "Uniprot", #UniProt Accession
1361
1362     "Hmdb", #HMDB ID
1363     "Chebi", #ChEBI ID
1364     "Cas", #CAS
1365     "Uniprotmetenz", #Uniprotmetenz
1366     "Gene", #Gene Name or Identifier
1367     "Synonym", #Gene Synonym
1368     ]
1369   
1370# Valid Gene Property filter qualifiers
1371GENE_FILTER_QUALIFIERS =\
1372    ["Is",
1373     "IsNot"
1374     ]
1375
1376# Organisms in the Atlas
1377ATLAS_ORGANISMS = \
1378    ["Anopheles gambiae",
1379     "Arabidopsis thaliana",
1380     "Bos taurus",
1381     "Caenorhabditis elegans",
1382     "Danio rerio",
1383     "Drosophila melanogaster",
1384     "Epstein barr virus",
1385     "Gallus gallus",
1386     "Homo sapiens",
1387     "Human cytomegalovirus",
1388     "Kaposi sarcoma-associated herpesvirus",
1389     "Mus musculus",
1390     "Rattus norvegicus",
1391     "Saccharomyces cerevisiae",
1392     "Schizosaccharomyces pombe",
1393#     "Unknown",
1394     "Xenopus laevis"
1395     ]
1396   
1397#_COMMON_TAXIDS = \
1398#    {"Anopheles gambiae",
1399#     "Arabidopsis thaliana",
1400#     "Bos taurus",
1401#     "Caenorhabditis elegans",
1402#     "Danio rerio",
1403#     "Drosophila melanogaster",
1404#     "Epstein barr virus",
1405#     "Gallus gallus",
1406#     "Homo sapiens",
1407#     "Human cytomegalovirus",
1408#     "Kaposi sarcoma-associated herpesvirus",
1409#     "Mus musculus",
1410#     "Rattus norvegicus",
1411#     "Saccharomyces cerevisiae",
1412#     "Schizosaccharomyces pombe",
1413##     "Unknown",
1414#     "Xenopus laevis"
1415#     }
1416   
1417def ef_ontology():
1418    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
1419    """
1420    from . import obiOntology
1421#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
1422    from Orange.orng import orngServerFiles
1423    # Should this be in the OBOFoundry (Ontology) domain
1424    try:
1425        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1426    except urllib2.HTTPError:
1427        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1428    return obiOntology.OBOOntology(file)
1429
1430
1431class AtlasCondition(object):
1432    """ Base class for Gene Expression Atlas query condition
1433    """
1434    def validate(self):
1435        """ Validate condition in a subclass.
1436        """
1437        raise NotImplementedError
1438   
1439    def rest(self):
1440        """ Return a REST query part in a subclass.
1441        """
1442        raise NotImplementedError
1443   
1444   
1445class AtlasConditionList(list, AtlasCondition):
1446    """ A list of AtlasCondition instances.
1447    """ 
1448    def validate(self):
1449        for item in self:
1450            item.validate()
1451       
1452    def rest(self):
1453        return "&".join(cond.rest() for cond in self)
1454
1455
1456class AtlasConditionGeneProperty(AtlasCondition):
1457    """ An atlas gene filter condition.
1458   
1459    :param property: Property of the gene. If None or "" all properties
1460        will be searched.
1461    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1462    :param value: The value to search for.
1463   
1464    Example ::
1465   
1466        >>> # Condition on a gene name
1467        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1468        >>> # Condition on genes from a GO Term
1469        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1470        >>> # Condition on disease association
1471        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
1472       
1473    """
1474    def __init__(self, property, qualifier, value):
1475        self.property = property or ""
1476        self.qualifier = qualifier
1477        if isinstance(value, basestring):
1478            self.value = value.replace(" ", "+")
1479        elif isinstance(value, list):
1480            self.value = "+".join(value)
1481        else:
1482            raise ValueError(value)
1483       
1484        self.validate()
1485       
1486    def validate(self):
1487        assert(self.property in GENE_FILTERS + [""])
1488        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1489       
1490    def rest(self):
1491        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1492       
1493       
1494class AtlasConditionExperimentalFactor(AtlasCondition):
1495    """ An atlas experimental factor filter condition.
1496   
1497    :param factor: EFO experiamntal factor
1498    :param regulation: "up", "down", "updown", "any" or "none"
1499    :param n: Minimum number of of experimants with this condition
1500    :param value: Experimantal factor value
1501   
1502    Example ::
1503   
1504        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1505        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1506        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1507        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
1508       
1509    """
1510    def __init__(self, factor, regulation, n, value):
1511        self.factor = factor
1512        self.regulation = regulation
1513        self.n = n
1514        self.value = value
1515        self.validate()
1516       
1517    def validate(self):
1518        # TODO: validate the factor and value
1519#        assert(self.factor in ef_ontology())
1520        assert(self.regulation in ["up", "down", "updown"])
1521       
1522    def rest(self):
1523        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1524       
1525class AtlasConditionOrganism(AtlasCondition):
1526    """ Condition on organism.
1527    """
1528    def __init__(self, organism):
1529        self.organism = organism
1530        self.validate()
1531       
1532    def validate(self):
1533        assert(self.organism in ATLAS_ORGANISMS)
1534       
1535    def rest(self):
1536        return "species={0}".format(self.organism.replace(" ", "+").lower())
1537       
1538       
1539class AtlasConditionExperiment(AtlasCondition):
1540    """ Condition on experiement
1541   
1542    :param property: Property of the experiment. If None or "" all properties
1543        will be searched.
1544    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1545    :param value: The value to search for.
1546   
1547    Example ::
1548   
1549        >>> # Condition on a experiemnt acession
1550        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1551        >>> # Condition on experiments involving lung
1552        >>> condition = AtlasConditionExperiment("Organism_part", "Has", "lung")
1553       
1554    """
1555#    EXPERIMENT_FILTERS = [
1556#                "Organism"
1557#                "Factor"]
1558   
1559    EXPERIMENT_FILTER_QUALIFIERS = [
1560                "Has",
1561                "HasNot"]
1562   
1563    def __init__(self, property, qualifier, value):
1564        self.property = property
1565        self.qualifier = qualifier
1566        if isinstance(value, basestring):
1567            self.value = value.replace(" ", "+")
1568        elif isinstance(value, list):
1569            self.value = "+".join(value)
1570        else:
1571            raise ValueError(value)
1572       
1573        self.validate()
1574       
1575    def validate(self):
1576        # TODO: check to EFO factors
1577#        assert(self.property in EXPERIMENT_FILTERS + [""])
1578        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1579       
1580    def rest(self):
1581        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1582       
1583       
1584class GeneAtlasError(ValueError):
1585    """ An error response from the Atlas server.
1586    """
1587    pass
1588   
1589   
1590def __check_atlas_error_json(response):
1591    if "error" in response:
1592        raise GeneAtlasError(response["error"])
1593    return response
1594 
1595     
1596def __check_atlas_error_xml(response):
1597    error = response.find("error")
1598    if error is not None:
1599        raise GeneAtlasError(error.text)
1600    return response
1601   
1602       
1603def query_atlas_simple(genes=None, regulation=None, organism=None,
1604                       condition=None, format="json", start=None,
1605                       rows=None):
1606    """ A simple Atlas query.
1607   
1608    :param genes: A list of gene names to search for.
1609    :param regulation: Search for experiments in which `genes` are "up",
1610        "down", "updown" or "none" regulated. If None all experiments
1611        are searched.
1612    :param organism: Search experiments for organism. If None all experiments
1613        are searched.
1614    :param condition: An EFO factor value (e.g. "brain")
1615   
1616    Example ::
1617       
1618        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
1619        {u'...
1620       
1621        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1622        {u'...
1623       
1624        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1625        {u'...
1626       
1627    """
1628    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
1629    conditions = AtlasConditionList()
1630    if genes:
1631        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
1632    if regulation or condition:
1633        regulation = "any" if regulation is None else regulation
1634        condition = "" if condition is None else condition
1635        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
1636    if organism:
1637        conditions.append(AtlasConditionOrganism(organism))
1638       
1639    connection = GeneExpressionAtlasConenction()
1640    results = connection.query(conditions, format=format, start=start,
1641                               rows=rows)
1642    if format == "json":
1643        return parse_json(results)
1644    else:
1645        return parse_xml(results)
1646
1647"""\
1648.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1649      Need a full list of accepted factors
1650"""
1651
1652def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
1653    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1654   
1655    Example ::
1656       
1657        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1658        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1659        >>> condition = AtlasConditionList([condition1, condition2])
1660        >>> query_atlas(condition)
1661        {u'...
1662       
1663    """
1664    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
1665    if connection is None:
1666        connection = GeneExpressionAtlasConenction()
1667    results = connection.query(condition, format=format, start=start,
1668                               rows=rows, indent=indent)
1669    if format == "json":
1670        response = parse_json(results)
1671        return __check_atlas_error_json(response)
1672    else:
1673        response = parse_xml(results)
1674        return __check_atlas_error_xml(response)
1675
1676
1677def get_atlas_summary(genes, organism, connection=None):
1678    """ Return 3 dictionaries containing a summary of atlas information
1679    about three experimental factors:
1680   
1681        - Organism Part (OP)
1682        - Disease State (DS)
1683        - Cell type (CT)
1684   
1685    Each dictionary contains query genes as keys. Values are dictionaries
1686    mapping factor values to a 2-tuple containig the count of up regulated
1687    and down regulated experiments.
1688   
1689    Example ::
1690   
1691        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1692        ({u'RUNX1': ...
1693       
1694    """
1695    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
1696    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
1697    org_condition = AtlasConditionOrganism(organism)
1698    condition = AtlasConditionList([genes_condition, org_condition])
1699    result = query_atlas(condition, format="json", connection=connection)
1700   
1701    org_part = collect_ef_summary(result, "organism_part")
1702    disease_state = collect_ef_summary(result, "disease_state")
1703    cell_type = collect_ef_summary(result, "cell_type")
1704   
1705    return dict(org_part), dict(disease_state), dict(cell_type)
1706   
1707   
1708def collect_ef_summary(info, ef, summary=None):
1709    """ Collect the results summary from query_atlas, result for experimental
1710    factor `ef`.
1711    """
1712    if summary is None:
1713        summary = defaultdict(dict)
1714       
1715    results = info["results"]
1716    for res in results:
1717        gene = res["gene"]
1718        expressions = res["expressions"] 
1719        for expression in expressions:
1720            if expression["ef"] == ef:
1721                efv = expression["efv"]
1722                updown = (expression["upExperiments"],
1723                          expression["downExperiments"]
1724                          )
1725               
1726                if any(updown):
1727                    summary[gene["name"]][efv] = updown
1728   
1729    return summary
1730
1731
1732def test():   
1733    conn = ArrayExpressConnection()
1734    import doctest
1735    foo = type("foo", (object,), {})()
1736    bar = type("bar", (object,), {})()
1737   
1738    doctest.testmod(optionflags=doctest.ELLIPSIS,
1739                    extraglobs={"conn": conn, "foo": foo,"bar": bar})
1740   
1741   
1742if __name__ == "__main__":
1743    test()
1744   
Note: See TracBrowser for help on using the repository browser.