source: orange-bioinformatics/_bioinformatics/obiArrayExpress.py @ 1687:460a66cfdc01

Revision 1687:460a66cfdc01, 59.3 KB checked in by Ales Erjavec <ales.erjavec@…>, 22 months ago (diff)

Basic annotations for data table features (Charecteristics, Factor/Parameter Values).

Line 
1"""
2===============
3obiArrayExpress
4===============
5
6A python module for accessing the ArrayExpress web services and database.
7
8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a
9database of gene expression experiments that you can query and download.
10
11Example ::
12
13    >>> # Retrieve the object representing experiment with accession E-TABM-25
14    >>> experiment = ArrayExpressExperiment("E-TABM-25")
15    >>> print experiment.accession
16    E-TABM-25
17
18    >>> print experiment.name
19    Transcription profiling of aging in the primate brain
20
21    >>> print experiment.species
22    ['Pan troglodytes']
23
24    >>> print experiment.files
25    [{'kind': ...
26
27    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917'
28    >>> experiment = ArrayExpressExperiment("E-MEXP-2917")
29    >>> table = experiment.fgem_to_table()
30
31
32Low level Array Express query using REST services::
33
34    >>> from Orange.bioinformatics import obiArrayExpress
35    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
36    {u'experiments': ...
37
38    >>> obiArrayExpress.query_experiments(keywords='gliobastoma')
39    {u'experiments': ...
40
41    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
42    <xml.etree.ElementTree.ElementTree object ...
43
44.. note:: Currently querying ArrayExpress files only works with the xml format.
45
46.. note:: See the documentation of `query_experiments` for a full set of
47          parameters that these functions accept.
48
49"""
50
51from __future__ import absolute_import
52
53import os
54import sys
55import urllib2
56import re
57
58from Orange.orng import orngServerFiles
59
60import warnings
61import shelve
62import shutil
63import posixpath
64import json
65from xml.etree.ElementTree import ElementTree
66from StringIO import StringIO
67
68from collections import defaultdict
69from functools import partial
70from contextlib import closing
71
72parse_json = json.load
73
74
75def parse_xml(stream):
76    """ Parse an xml stream into an instance of
77    `xml.etree.ElementTree.ElementTree`.
78    """
79    return ElementTree(file=stream)
80
81# All searchable fields of ArrayExpress (see query_experiments docstring
82# for a description of the fields)
83ARRAYEXPRESS_FIELDS = \
84    ["keywords",
85     "accession",
86     "array",
87     "ef",
88     "efv",
89     "expdesign",
90     "exptype",
91     "gxa",
92     "pmid",
93     "sa",
94     "species",
95     "expandefo",
96     "directsub",
97     "assaycount",
98     "efcount",
99     "samplecount",
100     "sacount",
101     "rawcount",
102     "fgemcount",
103     "miamescore",
104     "date",
105     "wholewords",
106    ]
107
108
109class ArrayExpressConnection(object):
110    """ A connection to the ArrayExpress. Used to construct a REST query
111    and run it.
112
113    .. todo:: Implement user login.
114
115    """
116
117    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
118    DEFAULT_FORMAT = "json"
119    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress",
120                                              "ArrayExpressCache.shelve")
121    # Order of arguments in the query
122    _ARGS_ORDER = ["keywords", "species", "array"]
123
124    def __init__(self, address=None, timeout=30, cache=None,
125                 username=None, password=None):
126        """ Initialize the connection object.
127
128        :param address: Address of the ArrayExpress API
129        :param timeout: Timeout for the socket connection
130
131        .. todo:: Implement user login (see Accessing Private Data in API docs)
132
133        """
134        self.address = address if address is not None else self.DEFAULT_ADDRESS
135        self.timeout = timeout
136        self.cache = cache if cache is not None else self.DEFAULT_CACHE
137        self.username = username
138        self.password = password
139       
140    def format_query(self, **kwargs):
141        """ Format the query arguments.
142       
143        Example ::
144       
145            >>> conn.format_query(gxa=True, efcount=(1, 5))
146            'efcount=[1 TO 5]&gxa=true'
147           
148        """
149        # Formaters:
150        def format_default(val):
151            if isinstance(val, basestring):
152                return val
153            else:
154                return "+".join(val)
155        def format_species(val):
156            return '"%s"' % val.lower()
157        def format_gxa(val):
158            if val:
159                return "true"
160            else:
161                raise ValueError("gxa={0}".format(val))
162        def format_expandefo(val):
163            if val:
164                return "on"
165            else:
166                raise ValueError("expandefo={0}".format(val))
167        def format_true_false(val):
168            return "true" if val else "false"
169        def format_interval(val):
170            if isinstance(val, tuple):
171                return "[{0} TO {1}]".format(*val)
172            else:
173                raise ValueError("Must be an interval argument (min, max)!")
174        def format_date(val):
175            # TODO check if val contains a datetime.date object, assert proper format
176            return format_interval(val)
177        def format_wholewords(val):
178            if val:
179                return "on"
180            else:
181                raise ValueError("wholewords={0}".format(val))
182       
183        formaters = {"species": format_species,
184                     "gxa": format_gxa,
185                     "expandefo": format_expandefo,
186                     "directsub": format_true_false,
187                     "assaycount": format_interval,
188                     "efcount": format_interval,
189                     "samplecount": format_interval,
190                     "sacount": format_interval,
191                     "rawcount": format_interval,
192                     "fgemcount": format_interval,
193                     "miamescore": format_interval,
194                     "date": format_date,
195                     "wholewords": format_wholewords,
196                     }
197        parts = []
198        arg_items = kwargs.items()
199        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
200                         if arg[0] in self._ARGS_ORDER else 100)
201       
202        for key, value in kwargs.iteritems():
203            if key == "format":
204                continue # format is handled in query_url
205            if key not in ARRAYEXPRESS_FIELDS:
206                raise ValueError("Invalid argument name: '{0}'".format(key))
207            if value is not None and value != []:
208                fmt = formaters.get(key, format_default)
209                value = fmt(value)
210                parts.append("{0}={1}".format(key, value))
211                 
212        return "&".join(parts)
213       
214    def query_url(self, what="experiments", **kwargs):
215        """ Return a formated query URL for the query arguments
216       
217        Example ::
218            >>> conn.query_url(accession="E-MEXP-31")
219            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
220           
221        """
222        query = self.format_query(**kwargs)
223        url = posixpath.join(self.address, what)
224        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
225        url = url + ("?" + query if query else "")
226        url = url.replace(" ", "%20")
227        return url
228   
229    def query_url_experiments(self, **kwargs):
230        """ Return a formated experiments query url for the calls arguments
231        """
232        return self.query_url("experiments", **kwargs)
233   
234    def query_url_files(self, **kwargs):
235        """ Return a formated experiments query url for the calls arguments
236        """
237        return self.query_url("files", **kwargs)
238   
239    def query_experiment(self, **kwargs):
240        """ Return an open stream to the experiments query results.
241       
242        .. note:: This member function takes the same arguments as the module
243                  level `query_experiemnts` function.
244         
245        """
246        url = self.query_url_experiments(**kwargs)
247        stream = self._cache_urlopen(url, timeout=self.timeout)
248        return stream
249   
250    def query_files(self, **kwargs):
251        """ Return an open stream to the files query results.
252       
253        .. note:: This member function takes the same arguments as the module
254                  level `query_files` function.
255        """
256        url = self.query_url_files(**kwargs)
257        stream = self._cache_urlopen(url, timeout=self.timeout)
258        return stream
259   
260    def open_file(self, accession, kind="raw", ext=None):
261        """ Return a file handle to experiment data.
262        Possible values for kind:
263            - raw: return the raw data if available
264            - fgem: return the processed data if available
265            - biosamples: a png or svg design image
266            - idf: investigation description
267            - adf: array design description
268            - mageml: MAGE-ML file
269           
270        Example ::
271       
272            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
273            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
274             
275        """
276        stream = self.query_files(accession=accession, format="xml")
277        tree = ElementTree(file=stream)
278        files = tree.findall("experiment/file")
279        for file in files:
280            filekind = file.find("kind").text
281            fileext = file.find("extension").text
282            if filekind.strip() == kind and (fileext.strip() == ext or ext is None): 
283                url = file.find("url").text
284                return self._cache_urlopen(url.strip(), timeout=self.timeout)
285           
286    def _cache_urlopen(self, url, timeout=30):
287        if self.cache is not None:
288            with self.open_cache() as cache:
289                cached = url in cache
290            if not cached:
291                stream = urllib2.urlopen(url, timeout=timeout)
292                data = stream.read()
293                with self.open_cache() as cache:
294                    cache[url] = data
295            else:
296                with self.open_cache() as cache:
297                    data = cache[url]
298            return StringIO(data)
299        else:
300            return urllib2.urlopen(url, timeout=timeout)
301       
302    def open_cache(self):
303        if isinstance(self.cache, basestring):
304            return closing(shelve.open(self.cache))
305        elif hasattr(self.cache, "close"):
306            return closing(self.cache)
307        elif self.cache is None:
308            return fake_closing({})
309        else:
310            return fake_closing(self.cache)
311       
312from contextlib import contextmanager
313
314@contextmanager
315def fake_closing(obj):
316    yield obj
317   
318def query_experiments(keywords=None, accession=None, array=None, ef=None,
319                      efv=None, expdesign=None, exptype=None,
320                      gxa=None, pmid=None, sa=None, species=None,
321                      expandefo=None, directsub=None, assaycount=None,
322                      efcount=None, samplecount=None, rawcount=None,
323                      fgemcount=None, miamescore=None, date=None, 
324                      format="json", wholewords=None, connection=None):
325    """ Query Array Express experiments.
326   
327    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
328    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
329    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
330    :param ef: Experimental factor (names of main variables of experiments)
331    :param efv: Experimental factor value (Has EFO expansion)
332    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
333    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
334    :param gxa: If True limit the results to experiments from the Gene
335        Expreission Atlas only.
336    :param pmid: Search by PubMed identifier
337    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
338    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
339   
340    :param expandefo: If True expand the search terms with all its child terms
341        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
342        will be expanded to include for synonyms and sub types of cancer).
343    :param directsub: If True return only experiments submited directly to
344        Array Express else if False return only experiments imported from GEO
345        database (default None, return both)
346    :param assaycount: A two tuple (min, max) for filter on the number of
347        assays (e.g. (1, 5) will return only experiments with at least one
348        and no more then 5 assays).
349    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
350    :param sacount: Filter on the number of sample attribute categories
351    :param rawcount: Filter on the number or raw files
352    :param fgemcount: Filter on the number of final gene expression matrix
353        (processed data) files
354    :param miamescore: Filter on the MIAME complience score (max 5)
355    :param date: Filter by release date
356   
357    Example ::
358   
359        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
360        {u'experiments': ...
361       
362    .. _EFO: http://www.ebi.ac.uk/efo/
363   
364    """
365    if connection is None:
366        connection = ArrayExpressConnection()
367       
368    stream = connection.query_experiment(keywords=keywords, accession=accession,
369                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
370                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
371                directsub=directsub, assaycount=assaycount, efcount=efcount,
372                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
373                miamescore=miamescore, date=date,  format=format,
374                wholewords=wholewords)
375   
376    if format == "json":
377        return parse_json(stream)
378    else:
379        return parse_xml(stream)
380   
381   
382def query_files(keywords=None, accession=None, array=None, ef=None,
383               efv=None, expdesign=None, exptype=None,
384               gxa=None, pmid=None, sa=None, species=None,
385               expandefo=None, directsub=None, assaycount=None,
386               efcount=None, samplecount=None, rawcount=None,
387               fgemcount=None, miamescore=None, date=None, 
388               format="json", wholewords=None, connection=None):
389    """ Query Array Express files.
390   
391    This function accepts the same arguments as `query_experiments`.
392   
393    Example ::
394   
395        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
396        <xml.etree.ElementTree.ElementTree object ...
397       
398    .. todo:: why does the json interface not work.
399                       
400    """
401    if connection is None:
402        connection = ArrayExpressConnection()
403   
404    stream = connection.query_files(keywords=keywords, accession=accession,
405                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
406                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
407                directsub=directsub, assaycount=assaycount, efcount=efcount,
408                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
409                miamescore=miamescore, date=date,  format=format,
410                wholewords=wholewords)
411   
412    if format == "json":
413        return parse_json(stream)
414    else:
415        return parse_xml(stream)
416   
417   
418def open_file(accession, kind="raw", ext=None, repo_dir=None):
419    """ Open a file for the experiment.
420     
421    Example ::
422   
423        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
424       
425    """
426    raise NotImplementedError
427
428"""\
429MAGE-TAB convinence functions, classes
430======================================
431"""
432
433IDF_SINGLE_TAGS = \
434    ["Investigation Title",
435     "Date of Experiment",
436     "Public Release Date",
437     "Experiment Description",
438    ]
439
440def parse_idf(file):
441    """ Parse an idf.txt (Investigation Description  Format) formated file.
442    Return a list of tuples where the first element is the tag (first column)
443    and the second element is a list of all tag values.
444   
445    """
446    if isinstance(file, basestring):
447        file = open(file, "rb")
448    data = file.read()
449    lines = data.splitlines()
450    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
451    parsed = [(line[0], line[1:]) for line in lines]
452    return parsed
453   
454def parse_sdrf(file):
455    """ Parse an sdfr formated file. Return a tuple with the first element
456    a list of header values and the second element is a list of all rows
457    (a row is a list of string values).
458   
459    """
460    if isinstance(file, basestring):
461        file = open(file, "rb")
462    data = file.read()
463    lines = data.splitlines()
464    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
465    header = [h for h in lines[0] if h]
466    rows = [line[:len(header)] for line in lines[1:]]
467    assert(all([len(r) == len(header) for r in rows]))
468    return header, rows
469   
470def parse_adf(file):
471    pass
472
473def parse_data_matrix(file):
474    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
475    elements are:
476        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
477        - a list of quantitation type for header values (e.g. ["log2 ratio", "log2 ratio", ...])
478        - a (row REF, row names list) tuple ("e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
479        - a list of list matrix with values (as strings)
480       
481    """
482    if isinstance(file, basestring):
483        file = open(file, "rb")
484    data = file.read()
485    lines = data.splitlines()
486    lines = [line.split("\t") for line in lines if line.strip()]
487    header = lines[0]
488    header_ref, header = header[0], header[1:]
489    line2 = lines[1]
490    row_ref, quant_type = line2[0], line2[1:]
491    row_names, rows = [], []
492    for line in lines[2:]:
493        r_name, row = line[0], line[1:]
494        row_names.append(r_name)
495        rows.append(row)
496       
497    return ((header_ref, header),
498            quant_type,
499            (row_ref, row_names),
500            rows) 
501   
502class InvestigationDesign(dict):
503    """ Investigation design (contains the contents of the .idf).
504   
505    Example ::
506   
507        >>> idf = InvestigationDesign("foobar.idf")
508        >>> print idf.investigation_title
509        foo investigation
510        >>> print idf.experimental_design
511        ['fubar', 'snafu']
512        >>> print idf.sdrf_file
513        ['foobar.sdrf']
514       
515    """
516    def __init__(self, idf_file=None):
517        idf = parse_idf(idf_file)
518        self._idf = idf
519        self.update(dict(idf))
520        for tag, values in idf:
521            if tag in IDF_SINGLE_TAGS:
522                values = values[0] if values else None
523            ttag = self.transform_tag(tag)
524            setattr(self, ttag, values)
525       
526    def transform_tag(self, tag):
527        """ Transform the tag into a proper python attribute name by
528        replacing all spaces and special characters (e.g '[', ']' into
529        underscores).
530       
531        """
532        toreplace = [" ", "-", "[", "]"]
533        for s in toreplace:
534            tag = tag.replace(s, "_")
535        return tag.lower()
536           
537    def __getitem__(self, tag):
538        """ Return the tag values
539       
540        Example ::
541       
542            >>> idf["Investigation Title"]
543            'foo investigation'
544           
545        """
546        try:
547            return self._idf_dict[tag]
548        except KeyError:
549            pass
550       
551        ttag = self.transform_tag(tag)
552        if hasattr(self, ttag):
553            return getattr(self, ttag)
554        else:
555            raise KeyError(tag)
556       
557class SampleDataRelationship(object):
558    """ Sample-Data Relationship (contains the contents of the .sdrf file).
559   
560    Example ::
561   
562        >>> sdr = SampleDataRelationship("foobar.sdrf")
563        >>> sdr.source_name()
564        ['foo', ...
565       
566        >>> sdr.sample_name()
567        ['sampled foo', ...
568       
569        >>> sdr.extract_protocol_ref()
570        ['bar the foo', ...
571       
572        >>> sdr.derived_array_data_matrix_file()
573        ['foobar.data.txt', ...
574       
575        >>>
576         
577    """
578   
579    # Nodes an edges
580    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
581                   "Labeled Extract Name", "Hybridization Name",
582                   "Assay Name", "Scan Name", "Normalization Name",
583                   "Array Data File", "Derived Array Data File",
584                   "Array Data Matrix File", "Derived Array Data Matrix File",
585                   "Image File", "Protocol REF"]
586   
587    # Attributes for nodes and edges
588    NODE_EDGE_ATTRIBUTES = \
589    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
590     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
591     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
592     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
593     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
594     "Assay Name": ["Technology Type", "Comment"],
595     "Scan Name": ["Comment"],
596     "Normalization Name": ["Comment"],
597     "Array Data File": ["Comment"],
598     "Derived Array Data File": ["Comment"],
599     "Array Data Matrix File": ["Comment"],
600     "Derived Array Data Matrix File": ["Comment"],
601     "Image File": ["Comment"],
602     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
603     }
604   
605    # Attributes
606    ATTRIBUTE_COLUMNS = \
607    {"Characteristics []": ["Unit", "Term Source REF"],
608     "Provider": ["Comment"],
609     "Material Type": ["Term Source REF"],
610     "Label": ["Term Source REF"],
611     "Array Design File": ["Comment"],
612     "Array Design REF": ["Term Source REF", "Comment"],   
613     "Technology Type": ["Term Source REF"],
614     "Factor Value [] ()": ["Unit", "Term Source REF"],
615     "Performer": ["Comment"],
616     "Date": [],
617     "Parameter Value []": ["Unit", "Comment"],
618     "Unit []": ["Term Source REF"],
619     "Description": [],
620     "Term Source REF": ["Term Accession Number"],
621     "Term Accession Number": [],
622     "Comment []": []
623     }
624    def __init__(self, sdrf_file=None):
625        header, rows = parse_sdrf(sdrf_file)
626        self.header = header
627        self.rows = rows
628       
629    def transform_tag(self, tag):
630        """ Transform the tag into a proper python attribute name by
631        replacing all spaces and special characters (e.g '[', ']' into
632        underscores).
633       
634        """
635        toreplace = [" ", "-", "[", "]"]
636        for s in toreplace:
637            tag = tag.replace(s, "_")
638        return tag.lower()
639   
640    def _subsection(self, name):
641        """ Return the named subsection (name must be from the
642        NODES_EDGES list).
643       
644        """
645        idx = self.NODES_EDGES.index(name)
646        start = self.header.index(name)
647        end = -1
648        for name in self.NODES_EDGES[idx + 1:]:
649            if name in self.header[start + 1:]:
650                end = self.header.index(name, start + 1)
651                break
652        return self.header[start:end], [r[start:end] for r in self.rows]
653   
654    def _column(self, name):
655        """ Return the named column.
656        """
657        if isinstance(name, basestring):
658            index = self.header.index(name)
659        else:
660            index = name
661        return [r[index] for r in self.rows]
662       
663    def source(self):
664        """ Return the Source subsection
665        """
666        return self._subsection("Source Name")
667       
668    def source_name(self):
669        """ Return the Source Name subsection
670        """
671        return self._column("Source Name")
672       
673    def sample(self):
674        """ Return the Sample subsection
675        """
676        return self._subsection("Sample Name")
677       
678    def sample_name(self):
679        """ Return the Sample Name subsection
680        """
681        return self._column("Sample Name")
682       
683    def extract(self):
684        """ Return the Extract subsection
685        """
686        return self._subsection("Extract Name")
687       
688    def extract_name(self):
689        """ Return the Extract Name subsection
690        """
691        return self._column("Extract Name")
692       
693    def labeled_extract(self):
694        """ Return the Labeled Extract subsection
695        """
696        return self._subsection("Labeled Extract Name")
697       
698    def labeled_extract_name(self):
699        """ Return the Labeled Extract Name subsection
700        """
701        return self._column("Labeled Extract Name")
702       
703    def hybridization(self):
704        """ Return the Hybridization subsection.
705        """
706        return self._subsection("Hybridization Name")
707       
708    def hybridization_name(self):
709        """ Return the Hibridization Name subsection.
710        """
711        return self._column("Hybridization Name")
712       
713    def assay(self):
714        """ Return the Assay subsection
715        """
716        return self._subsection("Assay Name")
717       
718    def assay_name(self):
719        """ Return the Assay Name subsection
720        """
721        return self._column("Assay Name")
722       
723    def scan(self):
724        """ Return the Scan subsection
725        """
726        return self._subsection("Scan Name")
727       
728    def scan_name(self):
729        """ Return the Scan name subsection
730        """
731        return self._column("Scan Name")
732       
733    def normalization(self):
734        """ Return the Normalization subsection.
735        """
736        return self._subsection("Normalization Name")
737       
738    def normalization_name(self):
739        """ Return the Normalization Name subsection.
740        """
741        return self._column("Normalization Name")
742         
743    def array_data(self):
744        """ Return the Array Data subsection
745        """
746        return self._subsection("Array Data File")
747   
748    def array_data_file(self):
749        """ Return the Array Data File subsection
750        """
751        return self._column("Array Data File")
752       
753    def derived_array_data(self):
754        """ Return the Derived Array Data subsection
755        """
756        return self._subsection("Derived Array Data File")
757   
758    def derived_array_data_file(self):
759        """ Return the Derived Array Data File subsection
760        """
761        return self._column("Derived Array Data File")
762       
763    def array_data_matrix(self):
764        """ Return the Array Data Matrix subsection.
765        """
766        return self._subsection("Array Data Matrix File")
767   
768    def array_data_matrix_file(self):
769        """ Return the Array Data Matrix File subsection.
770        """
771        return self._column("Array Data Matrix File")
772       
773    def derived_array_data_matrix(self):
774        """ Return the Derived Array Data Matrix subsection.
775        """
776        return self._subsection("Derived Array Data Matrix File")
777   
778    def derived_array_data_matrix_file(self):
779        """ Return the Derived Array Data Matrix File subsection.
780        """
781        return self._column("Derived Array Data Matrix File")
782       
783    def image(self):
784        """ Return the Image subsection
785        """
786        return self._subsection("Image File")
787   
788    def image_file(self):
789        """ Return the Image File subsection.
790        """
791        return self._column("Image File")
792       
793class ArrayDesign(object):
794    """ Arary design (contains the contents of the .adf file).
795    """
796    def __init__(self, adf_file=None):
797        adf = parse_adf(adf_file)
798        self._adf = adf
799   
800def _is_float(str):
801    try:
802        float(str)
803        return True
804    except ValueError:
805        return False
806   
807def _is_continuous(items, check_count=100):
808    """ Are the strings in items continous numbers.
809    """
810    count = 0
811    i = 0
812    for i, item in enumerate(items):
813        if _is_float(item):
814            count += 1
815        if i >= check_count:
816            break
817    return count >= i * 0.5
818   
819def processed_matrix_to_orange(matrix_file, sdrf=None):
820    """ Load a single processed matrix file in to an Orange.data.Table
821    instance.
822    """
823    import numpy
824    import Orange
825   
826    if isinstance(matrix_file, basestring):
827        matrix_file = open(matrix_file, "rb")
828
829    header, quant_type, rows, matrix = parse_data_matrix(matrix_file)
830    header_ref, header = header
831    row_ref, rows = rows
832   
833    matrix = numpy.array(matrix, dtype=object)
834   
835   
836    features = []
837    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc
838         
839    for header_name, quant, column in zip(header, quant_type, matrix.T):
840        if _is_continuous(column):
841            feature = Orange.feature.Continuous(header_name)
842            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?'
843        else:
844            values = set(column)
845            feature = Orange.feature.Discrete(header_name, values=sorted(values))
846        feature.attributes["quantitation type"] = quant
847        features.append(feature)
848       
849    row_ref_feature = Orange.feature.String(row_ref)
850    domain = Orange.data.Domain(features, None)
851    domain.addmeta(Orange.feature.Descriptor.new_meta_id(), row_ref_feature)
852   
853    table = Orange.data.Table(domain, [list(row) for row in matrix])
854    table.setattr("header_ref", header_ref)
855    # Add row identifiers
856    for instance, row in zip(table, rows):
857        instance[row_ref_feature] = row
858
859    if sdrf is not None:
860        pattern = re.compile(r"((Characteristics)|(Factor Value)|(Parameter Value)) \[(?P<name>.*?)\].*")
861        # Row name in sdrf
862        row_name = header_ref[:header_ref.find(" REF")] + " Name"
863        # feature names as listed in sdrf
864        feature_names = sdrf._column(row_name)
865        annotations = defaultdict(partial(defaultdict, set))
866        for i, header in enumerate(sdrf.header):
867            match = pattern.match(header)
868            if match:
869                name = match.group("name")
870                for val, feature_name in zip(sdrf._column(i), feature_names):
871                    annotations[feature_name][name].add(val)
872
873        def to_str(values):
874            if len(values) > 1:
875                return str(list(values))
876            else:
877                return str(list(values)[0])
878
879        for feature in table.domain.features:
880            feature.attributes.update([(key, to_str(value)) for key, value in \
881                                       annotations[feature.name].items()])
882    return table
883
884
885def mage_tab_to_orange(idf_filename):
886    """Convert an MAGE-TAB annotated experiment into an Orange.data.Table
887    instance (assumes all the associated MAGE-TAB files are in the same
888    directory.
889
890    """
891    dirname = os.path.dirname(idf_filename)
892    idf = InvestigationDesign(idf_filename)
893   
894    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
895    sdrf = SampleDataRelationship(sdrf_filename)
896   
897    data_matrices = set(sdrf.derived_array_data_matrix_file())
898    data_matrices = [name for name in data_matrices if name.strip()]
899   
900    tables = []
901    for filename in data_matrices:
902        matrix_file = os.path.join(dirname, filename)
903        table = processed_matrix_to_orange(matrix_file, sdrf)
904        tables.append(table)
905    table = hstack_tables(tables)
906
907    return table
908
909
910def hstack_tables(tables):
911    """ Stack the tables horizontaly.
912    """
913    import Orange
914    max_len = max([len(table) for table in tables])
915    stacked_features = []
916    stacked_values = [[] for i in range(max_len)]
917    stacked_meta_features = []
918    stacked_meta_values = [{} for i in range(max_len)]
919   
920    for table in tables:
921        stacked_features.extend(table.domain.variables)
922        stacked_meta_features.extend(table.domain.getmetas().items())
923       
924        for i, instance in enumerate(table):
925            stacked_values[i].extend(list(instance))
926            stacked_meta_values[i].update(instance.getmetas())
927           
928        # Fill extra lines with unknowns
929        for i in range(len(table), max_len):
930            stacked_values[i].extend(["?"] * len(table.domain.variables))
931       
932    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
933    domain.addmetas(dict(set(stacked_meta_features)))
934    table = Orange.data.Table(domain, stacked_values)
935   
936    # Add meta attributes
937    for instance, metas in zip(table, stacked_meta_values):
938        for m, val in metas.iteritems():
939            instance[m] = val
940           
941    return table
942   
943def _dictify(element):
944    """ 'Dictify' and xml.etree.Element.Element instance by taking all
945    subnode tags as keys and the corresponding text values as items i.e. turn
946    `<node><bla>foo</bla><a>b</b></node>` into a {"bla":"foo", "a":b"}
947       
948    """
949    if element is None:
950        element = []
951    dict = {}
952    strip = lambda s: s.strip() if s else s
953    for node in element:
954        dict[node.tag] = strip(getattr(node, "text", None))
955    return dict
956   
957class SearchableList(list):
958    """ A list with an advanced `search` method
959    """
960    def search(self, **kwargs):
961        """ Search the list for elements with members that correspond the
962        supplied keyword arguments.
963       
964            >>> foo.bar = "foo"
965            >>> list = SearchableList([foo, bar])
966            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo"
967            [<__main__.foo object ...
968
969        """
970        ret = []
971        for obj in self:
972            for member, value in kwargs.items():
973                if hasattr(obj, member) and getattr(obj, member) == value:
974                    ret.append(obj)
975        return ret
976   
977class ArrayExpressExperiment(object):
978    """ An convinience class representing an Array Express Experiment.
979   
980    Example ::
981   
982        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
983        >>> print ae.name
984        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
985       
986        >>> for file in ae.files:
987        ...     print file["name"], file["url"]
988        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
989        ...
990       
991        >>> table = ae.fgem_to_table() # Retieve the experiment data table
992           
993    """
994   
995    def __init__(self, accession, connection=None):
996        self.accession = accession
997        self.connection = connection
998        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
999        experiments = tree.findall("experiment")
1000        # find the exact match (more then one experiment can be listed in the query result)
1001        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1002        self._experiment = experiment = experiments[0]
1003       
1004        self.species = [e.text for e in experiment.findall("species")]
1005        bool_values = {"true": True, "false": False}
1006        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
1007        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]
1008       
1009        self.sampleattributes = []
1010        for sa in experiment.findall("sampleattribute"):
1011            category = sa.find("category").text.strip()
1012            values = [val.text for val in sa.findall("value")]
1013            self.sampleattributes.append((category, values))
1014           
1015        self.experimentalfactors = []
1016        for ef in experiment.findall("experimentalfactor"):
1017            name = ef.find("name").text.strip()
1018            values = [val.text.strip() for val in ef.findall("values")]
1019            self.experimentalfactors.append((name, values))
1020           
1021        self.miamescores = _dictify(experiment.find("miamescores"))
1022           
1023        self.id = experiment.find("id").text
1024        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
1025        self.name = experiment.find("name").text
1026        self.experimenttype = experiment.find("experimenttype").text.strip()
1027        self.releasedate = experiment.find("releasedate").text
1028        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
1029        self.samples = int(experiment.find("samples").text)
1030        self.assays = int(experiment.find("assays").text)
1031       
1032        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
1033           
1034        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
1035        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
1036        self.provider = [_dictify(e) for e in experiment.findall("provider")]
1037       
1038        self.experimentdesign = []
1039        for expd in experiment.findall("experimentdesign"):
1040            self.experimentdesign.append(expd.text)
1041           
1042        self.description = [_dictify(e) for e in experiment.findall("description")]
1043       
1044        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
1045        experiments = tree.findall("experiment")
1046        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1047        experiment = experiments[0]
1048        files = experiment.findall("file")
1049        self.files = [_dictify(file) for file in files]
1050       
1051    def _download_processed(self):
1052        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
1053       
1054        .. todo:: What about the raw data files (we need converters for other file types)
1055        """
1056        assert(self.fgemdatafiles)
1057        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
1058        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
1059        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
1060        assert(len(files) == 3)
1061       
1062        for type, file in files.iteritems():
1063            url = file["url"].strip()
1064            rest, basename = os.path.split(url)
1065            _, dirname = os.path.split(rest)
1066           
1067            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1068            try:
1069                os.makedirs(repo_dir)
1070            except OSError:
1071                pass
1072            local_filename = os.path.join(repo_dir, basename)
1073            stream = urllib2.urlopen(url)
1074            shutil.copyfileobj(stream, open(local_filename, "wb"))
1075           
1076            if file["extension"] == "zip":
1077                import zipfile
1078                zfile = zlib.ZipFile(local_filename)
1079                zfile.extractall(repo_dir)
1080            elif file["extension"] == "gz":
1081                import gzip
1082                gzfile = gzip.open(local_filename)
1083                gzfile.extractall(repo_dir)
1084            elif file["extension"] in ["tgz", "tar"]:
1085                import tarfile
1086                tfile = tarfile.TarFile(local_filename)
1087                tfile.extractall(repo_dir)
1088            elif file["extension"] == "txt":
1089                pass
1090            else:
1091                raise ValueError("Unknown extension ('{0}').".format(basename))
1092           
1093    def _download_file(self, url, extract=True):
1094        """ Download the `file` from the ArrayExpress saving it to a local
1095        repository directory.
1096         
1097        """
1098        rest, basename = posixpath.split(url)
1099        dirname = posixpath.basename(rest)
1100        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1101        try:
1102            os.makedirs(repo_dir)
1103        except OSError:
1104            pass
1105        stream = urllib2.urlopen(url)
1106        local_filename = os.path.join(repo_dir, basename)
1107        shutil.copyfileobj(stream, open(local_filename, "wb"))
1108       
1109        if extract:
1110            _, extension = os.path.splitext(local_filename)
1111            if extension == ".zip":
1112                import zipfile
1113                zfile = zipfile.ZipFile(local_filename)
1114                zfile.extractall(repo_dir)
1115            elif extension == ".gz":
1116                import gzip
1117                gzfile = gzip.open(local_filename)
1118                gzfile.extractall(repo_dir)
1119            elif extension in [".tgz"]:
1120                import tarfile
1121                tfile = tarfile.TarFile(local_filename)
1122                tfile.extractall(repo_dir)
1123            elif extension == ".txt":
1124                pass
1125            else:
1126                raise ValueError("Unknown extension ('{0}').".format(basename))
1127           
1128    def _is_local(self, url):
1129        """ Is the `url` stored in the local repository.
1130        """
1131        return os.path.exists(self._local_filepath(url))
1132   
1133    def _local_filepath(self, url):
1134        """ Return the local file path for url.
1135        """
1136        rest, basename = posixpath.split(url)
1137        dirname = posixpath.basename(rest)
1138        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1139   
1140    def _open(self, url):
1141        """ Return an open file like handle to url (ArrayExpress file).
1142        The file is cached in the local repository for future access.
1143       
1144        """
1145        if not self._is_local(url):
1146            self._download_file(url, extract=True)
1147        file = self._local_filepath(url)
1148        return open(file, "rb")
1149   
1150    def _search_files(self, kind=None, extension=None):
1151        """ Search files by `kind` and `extension`.
1152        """
1153        res = []
1154        for file in self.files:
1155            kind_match = kind == file.get("kind") or kind is None
1156            extension_match = extension == file.get("extension") or extension is None
1157           
1158            if kind_match and extension_match:
1159                res.append(file)
1160        return res
1161       
1162    def array_design(self):
1163        """ Return a list of `ArrayDesign` instances used in this experiment.
1164        """
1165        files = [f for f in self.files if f.get("kind") == "adf" and \
1166                 f.get("extension") == "txt"]
1167       
1168        array_design = []
1169        for file in files:
1170            url = file.get("url")
1171            if not self._is_local(url):
1172                self._download_file(url)
1173            array_design.append(ArrayDesign(self._open(url)))
1174        return array_design
1175       
1176    def investigation_design(self):
1177        """ Return an `InvestigationDesgin` instance for this experiment
1178        """
1179        files = [f for f in self.files if f.get("kind") == "idf" and \
1180                 f.get("extension") == "txt"]
1181        if not files:
1182            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1183        file = files[0]
1184        return InvestigationDesign(self._open(file.get("url")))
1185       
1186       
1187    def sample_data_relationship(self):
1188        """ Return an `SampleDataRelationship` instance describing this experiment.
1189        """
1190        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1191                 f.get("extension") == "txt"]
1192        if not files:
1193            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
1194        file = files[0]
1195        return SampleDataRelationship(self._open(file.get("url")))
1196       
1197    def fgem_to_table(self):
1198        """ Retrieve the processed matrix from the Array Express ftp
1199        server and convert it to an :class:`Orange.data.Table` instance.
1200         
1201        """
1202        assert(self.fgemdatafiles)
1203        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1204        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1205        sdrf = self._search_files("sdrf", "txt")
1206        if sdrf:
1207            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1208            if "Derived Array Data Matrix File" not in sdrf.header:
1209                twocol = self._search_files("twocolumn", "txt")
1210                if twocol:
1211                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1212        matrix_file = self._search_files("fgem")[0]
1213        self._open(matrix_file.get("url")) 
1214        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1215       
1216        idf_file = self._search_files("idf", "txt")[0]
1217        self._open(idf_file.get("url")) # To download if not cached
1218        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
1219       
1220   
1221"""\
1222Gene Expression Atlas
1223---------------------
1224
1225.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1226
1227`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1228gene expression experiments in Array Express Archive.
1229
1230Use `query_atlas_simple` for simple querys.
1231
1232Example (query human genes for experiments in which they are up regulated) ::
1233
1234    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1235    {u'...
1236   
1237Or use the `AtlasCondition` subclasses in this module to construct a more
1238advanced query and use the `query_atlas` function.
1239
1240Example (query human genes annotated to the GO term 'transporter activity'
1241that are up regulated in the liver in at least three experiments) ::
1242
1243    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1244    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1245    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1246    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1247    >>> query_atlas(cond_list)
1248    {u'...
1249   
1250"""
1251
1252class GeneExpressionAtlasConenction(object):
1253    """ A connection to Gene Expression Atlas database.
1254    """
1255    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
1256    try:
1257        DEFAULT_CACHE = shelve.open(orngServerFiles.localpath("ArrayExpress", "GeneAtlasCache.shelve"))
1258    except Exception, ex:
1259        print ex
1260        DEFAULT_CACHE = {}
1261    def __init__(self, address=None, timeout=30, cache=None):
1262        """ Initialize the conenction.
1263       
1264        :param address: Address of the server.
1265        :param timeout: Socket timeout.
1266        :param cache : A dict like object to use as a cache.
1267       
1268        """
1269        self.address = address if address is not None else self.DEFAULT_ADDRESS
1270        self.timeout = timeout
1271        self.cache = cache if cache is not None else self.DEFAULT_CACHE
1272   
1273    def query(self, condition, format="json", start=None, rows=None, indent=False):
1274        url = self.address + "api/vx?" + condition.rest()
1275        if start is not None and rows is not None:
1276            url += "&start={0}&rows={1}".format(start, rows)
1277        url += "&format={0}".format(format)
1278        if indent:
1279            url += "&indent"
1280#        print url
1281        if self.cache is not None:
1282            return self._query_cached(url)
1283        else:
1284            return urllib2.urlopen(url)
1285        return response
1286   
1287    def _query_cached(self, url):
1288        if self.cache is not None:
1289            if url not in self.cache:
1290                response = urllib2.urlopen(url)
1291                contents = response.read()
1292                self.cache[url] = contents
1293                if hasattr(self.cache, "sync"):
1294                    self.cache.sync()
1295            return StringIO(self.cache[url])
1296        else:
1297            return urllib2.urlopen(url)
1298               
1299   
1300# Names of all Gene Property filter names
1301GENE_FILTERS = \
1302    ["Name", # Gene name
1303     "Goterm", #Gene Ontology Term
1304     "Interproterm", #InterPro Term
1305     "Disease", #Gene-Disease Assocation
1306     "Keyword", #Gene Keyword
1307     "Protein", #Protein
1308
1309     "Dbxref", #Other Database Cross-Refs
1310     "Embl", #EMBL-Bank ID
1311     "Ensfamily", #Ensembl Family
1312     "Ensgene", #Ensembl Gene ID
1313
1314     "Ensprotein", #Ensembl Protein ID
1315     "Enstranscript", #Ensembl Transcript ID
1316     "Goid", #Gene Ontology ID
1317     "Image", #IMAGE ID
1318     "Interproid", #InterPro ID
1319     "Locuslink", #Entrez Gene ID
1320
1321     "Omimid", #OMIM ID
1322     "Orf", #ORF
1323     "Refseq", #RefSeq ID
1324     "Unigene", #UniGene ID
1325     "Uniprot", #UniProt Accession
1326
1327     "Hmdb", #HMDB ID
1328     "Chebi", #ChEBI ID
1329     "Cas", #CAS
1330     "Uniprotmetenz", #Uniprotmetenz
1331     "Gene", #Gene Name or Identifier
1332     "Synonym", #Gene Synonym
1333     ]
1334   
1335# Valid Gene Property filter qualifiers
1336GENE_FILTER_QUALIFIERS =\
1337    ["Is",
1338     "IsNot"
1339     ]
1340
1341# Organisms in the Atlas
1342ATLAS_ORGANISMS = \
1343    ["Anopheles gambiae",
1344     "Arabidopsis thaliana",
1345     "Bos taurus",
1346     "Caenorhabditis elegans",
1347     "Danio rerio",
1348     "Drosophila melanogaster",
1349     "Epstein barr virus",
1350     "Gallus gallus",
1351     "Homo sapiens",
1352     "Human cytomegalovirus",
1353     "Kaposi sarcoma-associated herpesvirus",
1354     "Mus musculus",
1355     "Rattus norvegicus",
1356     "Saccharomyces cerevisiae",
1357     "Schizosaccharomyces pombe",
1358#     "Unknown",
1359     "Xenopus laevis"
1360     ]
1361   
1362#_COMMON_TAXIDS = \
1363#    {"Anopheles gambiae",
1364#     "Arabidopsis thaliana",
1365#     "Bos taurus",
1366#     "Caenorhabditis elegans",
1367#     "Danio rerio",
1368#     "Drosophila melanogaster",
1369#     "Epstein barr virus",
1370#     "Gallus gallus",
1371#     "Homo sapiens",
1372#     "Human cytomegalovirus",
1373#     "Kaposi sarcoma-associated herpesvirus",
1374#     "Mus musculus",
1375#     "Rattus norvegicus",
1376#     "Saccharomyces cerevisiae",
1377#     "Schizosaccharomyces pombe",
1378##     "Unknown",
1379#     "Xenopus laevis"
1380#     }
1381   
1382def ef_ontology():
1383    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
1384    """
1385    from . import obiOntology
1386#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
1387    from Orange.orng import orngServerFiles
1388    # Should this be in the OBOFoundry (Ontology) domain
1389    try:
1390        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1391    except urllib2.HTTPError:
1392        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1393    return obiOntology.OBOOntology(file)
1394
1395
1396class AtlasCondition(object):
1397    """ Base class for Gene Expression Atlas query condition
1398    """
1399    def validate(self):
1400        """ Validate condition in a subclass.
1401        """
1402        raise NotImplementedError
1403   
1404    def rest(self):
1405        """ Return a REST query part in a subclass.
1406        """
1407        raise NotImplementedError
1408   
1409   
1410class AtlasConditionList(list, AtlasCondition):
1411    """ A list of AtlasCondition instances.
1412    """ 
1413    def validate(self):
1414        for item in self:
1415            item.validate()
1416       
1417    def rest(self):
1418        return "&".join(cond.rest() for cond in self)
1419
1420
1421class AtlasConditionGeneProperty(AtlasCondition):
1422    """ An atlas gene filter condition.
1423   
1424    :param property: Property of the gene. If None or "" all properties
1425        will be searched.
1426    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1427    :param value: The value to search for.
1428   
1429    Example ::
1430   
1431        >>> # Condition on a gene name
1432        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1433        >>> # Condition on genes from a GO Term
1434        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1435        >>> # Condition on disease association
1436        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
1437       
1438    """
1439    def __init__(self, property, qualifier, value):
1440        self.property = property or ""
1441        self.qualifier = qualifier
1442        if isinstance(value, basestring):
1443            self.value = value.replace(" ", "+")
1444        elif isinstance(value, list):
1445            self.value = "+".join(value)
1446        else:
1447            raise ValueError(value)
1448       
1449        self.validate()
1450       
1451    def validate(self):
1452        assert(self.property in GENE_FILTERS + [""])
1453        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1454       
1455    def rest(self):
1456        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1457       
1458       
1459class AtlasConditionExperimentalFactor(AtlasCondition):
1460    """ An atlas experimental factor filter condition.
1461   
1462    :param factor: EFO experiamntal factor
1463    :param regulation: "up", "down", "updown", "any" or "none"
1464    :param n: Minimum number of of experimants with this condition
1465    :param value: Experimantal factor value
1466   
1467    Example ::
1468   
1469        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1470        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1471        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1472        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
1473       
1474    """
1475    def __init__(self, factor, regulation, n, value):
1476        self.factor = factor
1477        self.regulation = regulation
1478        self.n = n
1479        self.value = value
1480        self.validate()
1481       
1482    def validate(self):
1483        # TODO: validate the factor and value
1484#        assert(self.factor in ef_ontology())
1485        assert(self.regulation in ["up", "down", "updown"])
1486       
1487    def rest(self):
1488        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1489       
1490class AtlasConditionOrganism(AtlasCondition):
1491    """ Condition on organism.
1492    """
1493    def __init__(self, organism):
1494        self.organism = organism
1495        self.validate()
1496       
1497    def validate(self):
1498        assert(self.organism in ATLAS_ORGANISMS)
1499       
1500    def rest(self):
1501        return "species={0}".format(self.organism.replace(" ", "+").lower())
1502       
1503       
1504class AtlasConditionExperiment(AtlasCondition):
1505    """ Condition on experiement
1506   
1507    :param property: Property of the experiment. If None or "" all properties
1508        will be searched.
1509    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1510    :param value: The value to search for.
1511   
1512    Example ::
1513   
1514        >>> # Condition on a experiemnt acession
1515        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1516        >>> # Condition on experiments involving lung
1517        >>> condition = AtlasConditionExperiment("Organism_part", "Has", "lung")
1518       
1519    """
1520#    EXPERIMENT_FILTERS = [
1521#                "Organism"
1522#                "Factor"]
1523   
1524    EXPERIMENT_FILTER_QUALIFIERS = [
1525                "Has",
1526                "HasNot"]
1527   
1528    def __init__(self, property, qualifier, value):
1529        self.property = property
1530        self.qualifier = qualifier
1531        if isinstance(value, basestring):
1532            self.value = value.replace(" ", "+")
1533        elif isinstance(value, list):
1534            self.value = "+".join(value)
1535        else:
1536            raise ValueError(value)
1537       
1538        self.validate()
1539       
1540    def validate(self):
1541        # TODO: check to EFO factors
1542#        assert(self.property in EXPERIMENT_FILTERS + [""])
1543        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1544       
1545    def rest(self):
1546        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1547       
1548       
1549class GeneAtlasError(ValueError):
1550    """ An error response from the Atlas server.
1551    """
1552    pass
1553   
1554   
1555def __check_atlas_error_json(response):
1556    if "error" in response:
1557        raise GeneAtlasError(response["error"])
1558    return response
1559 
1560     
1561def __check_atlas_error_xml(response):
1562    error = response.find("error")
1563    if error is not None:
1564        raise GeneAtlasError(error.text)
1565    return response
1566   
1567       
1568def query_atlas_simple(genes=None, regulation=None, organism=None,
1569                       condition=None, format="json", start=None,
1570                       rows=None):
1571    """ A simple Atlas query.
1572   
1573    :param genes: A list of gene names to search for.
1574    :param regulation: Search for experiments in which `genes` are "up",
1575        "down", "updown" or "none" regulated. If None all experiments
1576        are searched.
1577    :param organism: Search experiments for organism. If None all experiments
1578        are searched.
1579    :param condition: An EFO factor value (e.g. "brain")
1580   
1581    Example ::
1582       
1583        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
1584        {u'...
1585       
1586        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1587        {u'...
1588       
1589        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1590        {u'...
1591       
1592    """
1593    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
1594    conditions = AtlasConditionList()
1595    if genes:
1596        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
1597    if regulation or condition:
1598        regulation = "any" if regulation is None else regulation
1599        condition = "" if condition is None else condition
1600        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
1601    if organism:
1602        conditions.append(AtlasConditionOrganism(organism))
1603       
1604    connection = GeneExpressionAtlasConenction()
1605    results = connection.query(conditions, format=format, start=start,
1606                               rows=rows)
1607    if format == "json":
1608        return parse_json(results)
1609    else:
1610        return parse_xml(results)
1611
1612"""\
1613.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1614      Need a full list of accepted factors
1615"""
1616
1617def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
1618    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1619   
1620    Example ::
1621       
1622        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1623        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1624        >>> condition = AtlasConditionList([condition1, condition2])
1625        >>> query_atlas(condition)
1626        {u'...
1627       
1628    """
1629    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
1630    if connection is None:
1631        connection = GeneExpressionAtlasConenction()
1632    results = connection.query(condition, format=format, start=start,
1633                               rows=rows, indent=indent)
1634    if format == "json":
1635        response = parse_json(results)
1636        return __check_atlas_error_json(response)
1637    else:
1638        response = parse_xml(results)
1639        return __check_atlas_error_xml(response)
1640
1641
1642def get_atlas_summary(genes, organism, connection=None):
1643    """ Return 3 dictionaries containing a summary of atlas information
1644    about three experimental factors:
1645   
1646        - Organism Part (OP)
1647        - Disease State (DS)
1648        - Cell type (CT)
1649   
1650    Each dictionary contains query genes as keys. Values are dictionaries
1651    mapping factor values to a 2-tuple containig the count of up regulated
1652    and down regulated experiments.
1653   
1654    Example ::
1655   
1656        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1657        ({u'RUNX1': ...
1658       
1659    """
1660    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
1661    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
1662    org_condition = AtlasConditionOrganism(organism)
1663    condition = AtlasConditionList([genes_condition, org_condition])
1664    result = query_atlas(condition, format="json", connection=connection)
1665   
1666    org_part = collect_ef_summary(result, "organism_part")
1667    disease_state = collect_ef_summary(result, "disease_state")
1668    cell_type = collect_ef_summary(result, "cell_type")
1669   
1670    return dict(org_part), dict(disease_state), dict(cell_type)
1671   
1672   
1673def collect_ef_summary(info, ef, summary=None):
1674    """ Collect the results summary from query_atlas, result for experimental
1675    factor `ef`.
1676    """
1677    if summary is None:
1678        summary = defaultdict(dict)
1679       
1680    results = info["results"]
1681    for res in results:
1682        gene = res["gene"]
1683        expressions = res["expressions"] 
1684        for expression in expressions:
1685            if expression["ef"] == ef:
1686                efv = expression["efv"]
1687                updown = (expression["upExperiments"],
1688                          expression["downExperiments"]
1689                          )
1690               
1691                if any(updown):
1692                    summary[gene["name"]][efv] = updown
1693   
1694    return summary
1695
1696
1697def test():   
1698    conn = ArrayExpressConnection()
1699    import doctest
1700    foo = type("foo", (object,), {})()
1701    bar = type("bar", (object,), {})()
1702   
1703    doctest.testmod(optionflags=doctest.ELLIPSIS,
1704                    extraglobs={"conn": conn, "foo": foo,"bar": bar})
1705   
1706   
1707if __name__ == "__main__":
1708    test()
1709   
Note: See TracBrowser for help on using the repository browser.