source: orange-bioinformatics/Orange/bioinformatics/obiArrayExpress.py @ 1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 58.3 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line 
1"""
2===============
3obiArrayExpress
4===============
5
6A python module for accessing the ArrayExpress web services and database.
7
8`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you
9can query and download.
10
11Example ::
12
13    >>> # Retrieve the object representing experiment with accession E-TABM-25
14    >>> experiment = ArrayExpressExperiment("E-TABM-25")
15    >>> print experiment.accession
16    E-TABM-25
17   
18    >>> print experiment.name
19    Transcription profiling of aging in the primate brain
20   
21    >>> print experiment.species
22    ['Pan troglodytes']
23   
24    >>> print experiment.files
25    [{'kind': ...
26   
27    >>> # Retrieve the data matrix for experiment 'E-MEXP-2917'
28    >>> experiment = ArrayExpressExperiment("E-MEXP-2917")
29    >>> table = experiment.fgem_to_table()
30
31
32Low level Array Express query using REST services::
33
34    >>> import obiArrayExpress
35    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
36    {u'experiments': ...
37   
38    >>> obiArrayExpress.query_experiments(keywords='gliobastoma')
39    {u'experiments': ...
40   
41    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
42    <xml.etree.ElementTree.ElementTree object ...
43   
44.. note:: Currently querying ArrayExpress files only works with the xml format.
45
46.. note:: See the documentation of `query_experiments` for a full set of
47          parameters that these functions accept.
48
49"""
50
51import os, sys
52import urllib2
53
54import orngServerFiles
55import warnings
56import posixpath
57import shelve
58import shutil
59import posixpath
60import json
61from xml.etree.ElementTree import ElementTree
62from StringIO import StringIO
63
64from collections import defaultdict
65from functools import partial
66from contextlib import closing
67
68parse_json = json.load
69
70def parse_xml(stream):
71    """ Parse an xml stream into an instance of xml.etree.ElementTree.ElementTree.
72    """
73    return ElementTree(file=stream) 
74
75# All searchable fields of ArrayExpress (see query_experiments docstring
76# for a description of the fields)
77ARRAYEXPRESS_FIELDS = \
78    ["keywords",
79     "accession",
80     "array",
81     "ef",
82     "efv",
83     "expdesign",
84     "exptype",
85     "gxa",
86     "pmid",
87     "sa",
88     "species",
89     "expandefo",
90     "directsub",
91     "assaycount",
92     "efcount",
93     "samplecount",
94     "sacount",
95     "rawcount",
96     "fgemcount",
97     "miamescore",
98     "date",
99     "wholewords",
100    ]
101   
102
103class ArrayExpressConnection(object):
104    """ A connection to the ArrayExpress. Used to construct a REST query
105    and run it.
106   
107    .. todo:: Implement user login.
108   
109    """
110   
111    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
112    DEFAULT_FORMAT = "json"
113    DEFAULT_CACHE = orngServerFiles.localpath("ArrayExpress", "ArrayExpressCache.shelve")
114    # Order of arguments in the query
115    _ARGS_ORDER = ["keywords", "species", "array"]
116   
117    def __init__(self, address=None, timeout=30, cache=None,
118                 username=None, password=None):
119        """ Initialize the connection object.
120       
121        :param address: Address of the ArrayExpress API
122        :param timeout: Timeout for the socket connection
123       
124        .. todo:: Implement user login (see Accessing Private Data in API docs)
125       
126        """
127        self.address = address if address is not None else self.DEFAULT_ADDRESS
128        self.timeout = timeout
129        self.cache = cache if cache is not None else self.DEFAULT_CACHE
130        self.username = username
131        self.password = password
132       
133    def format_query(self, **kwargs):
134        """ Format the query arguments.
135       
136        Example ::
137       
138            >>> conn.format_query(gxa=True, efcount=(1, 5))
139            'efcount=[1 TO 5]&gxa=true'
140           
141        """
142        # Formaters:
143        def format_default(val):
144            if isinstance(val, basestring):
145                return val
146            else:
147                return "+".join(val)
148        def format_species(val):
149            return '"%s"' % val.lower()
150        def format_gxa(val):
151            if val:
152                return "true"
153            else:
154                raise ValueError("gxa={0}".format(val))
155        def format_expandefo(val):
156            if val:
157                return "on"
158            else:
159                raise ValueError("expandefo={0}".format(val))
160        def format_true_false(val):
161            return "true" if val else "false"
162        def format_interval(val):
163            if isinstance(val, tuple):
164                return "[{0} TO {1}]".format(*val)
165            else:
166                raise ValueError("Must be an interval argument (min, max)!")
167        def format_date(val):
168            # TODO check if val contains a datetime.date object, assert proper format
169            return format_interval(val)
170        def format_wholewords(val):
171            if val:
172                return "on"
173            else:
174                raise ValueError("wholewords={0}".format(val))
175       
176        formaters = {"species": format_species,
177                     "gxa": format_gxa,
178                     "expandefo": format_expandefo,
179                     "directsub": format_true_false,
180                     "assaycount": format_interval,
181                     "efcount": format_interval,
182                     "samplecount": format_interval,
183                     "sacount": format_interval,
184                     "rawcount": format_interval,
185                     "fgemcount": format_interval,
186                     "miamescore": format_interval,
187                     "date": format_date,
188                     "wholewords": format_wholewords,
189                     }
190        parts = []
191        arg_items = kwargs.items()
192        ordered = sorted(arg_items, key=lambda arg: self._ARGS_ORDER.index(arg[0]) \
193                         if arg[0] in self._ARGS_ORDER else 100)
194       
195        for key, value in kwargs.iteritems():
196            if key == "format":
197                continue # format is handled in query_url
198            if key not in ARRAYEXPRESS_FIELDS:
199                raise ValueError("Invalid argument name: '{0}'".format(key))
200            if value is not None and value != []:
201                fmt = formaters.get(key, format_default)
202                value = fmt(value)
203                parts.append("{0}={1}".format(key, value))
204                 
205        return "&".join(parts)
206       
207    def query_url(self, what="experiments", **kwargs):
208        """ Return a formated query URL for the query arguments
209       
210        Example ::
211            >>> conn.query_url(accession="E-MEXP-31")
212            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'
213           
214        """
215        query = self.format_query(**kwargs)
216        url = posixpath.join(self.address, what)
217        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
218        url = url + ("?" + query if query else "")
219        url = url.replace(" ", "%20")
220        return url
221   
222    def query_url_experiments(self, **kwargs):
223        """ Return a formated experiments query url for the calls arguments
224        """
225        return self.query_url("experiments", **kwargs)
226   
227    def query_url_files(self, **kwargs):
228        """ Return a formated experiments query url for the calls arguments
229        """
230        return self.query_url("files", **kwargs)
231   
232    def query_experiment(self, **kwargs):
233        """ Return an open stream to the experiments query results.
234       
235        .. note:: This member function takes the same arguments as the module
236                  level `query_experiemnts` function.
237         
238        """
239        url = self.query_url_experiments(**kwargs)
240        stream = self._cache_urlopen(url, timeout=self.timeout)
241        return stream
242   
243    def query_files(self, **kwargs):
244        """ Return an open stream to the files query results.
245       
246        .. note:: This member function takes the same arguments as the module
247                  level `query_files` function.
248        """
249        url = self.query_url_files(**kwargs)
250        stream = self._cache_urlopen(url, timeout=self.timeout)
251        return stream
252   
253    def open_file(self, accession, kind="raw", ext=None):
254        """ Return a file handle to experiment data.
255        Possible values for kind:
256            - raw: return the raw data if available
257            - fgem: return the processed data if available
258            - biosamples: a png or svg design image
259            - idf: investigation description
260            - adf: array design description
261            - mageml: MAGE-ML file
262           
263        Example ::
264       
265            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
266            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")
267             
268        """
269        stream = self.query_files(accession=accession, format="xml")
270        tree = ElementTree(file=stream)
271        files = tree.findall("experiment/file")
272        for file in files:
273            filekind = file.find("kind").text
274            fileext = file.find("extension").text
275            if filekind.strip() == kind and (fileext.strip() == ext or ext is None): 
276                url = file.find("url").text
277                return self._cache_urlopen(url.strip(), timeout=self.timeout)
278           
279    def _cache_urlopen(self, url, timeout=30):
280        if self.cache is not None:
281            with self.open_cache() as cache:
282                cached = url in cache
283            if not cached:
284                stream = urllib2.urlopen(url, timeout=timeout)
285                data = stream.read()
286                with self.open_cache() as cache:
287                    cache[url] = data
288            else:
289                with self.open_cache() as cache:
290                    data = cache[url]
291            return StringIO(data)
292        else:
293            return urllib2.urlopen(url, timeout=timeout)
294       
295    def open_cache(self):
296        if isinstance(self.cache, basestring):
297            return closing(shelve.open(self.cache))
298        elif hasattr(self.cache, "close"):
299            return closing(self.cache)
300        elif self.cache is None:
301            return fake_closing({})
302        else:
303            return fake_closing(self.cache)
304       
305from contextlib import contextmanager
306
307@contextmanager
308def fake_closing(obj):
309    yield obj
310   
311def query_experiments(keywords=None, accession=None, array=None, ef=None,
312                      efv=None, expdesign=None, exptype=None,
313                      gxa=None, pmid=None, sa=None, species=None,
314                      expandefo=None, directsub=None, assaycount=None,
315                      efcount=None, samplecount=None, rawcount=None,
316                      fgemcount=None, miamescore=None, date=None, 
317                      format="json", wholewords=None, connection=None):
318    """ Query Array Express experiments.
319   
320    :param keywords: A list of keywords to search (e.g. ['gliobastoma']
321    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
322    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
323    :param ef: Experimental factor (names of main variables of experiments)
324    :param efv: Experimental factor value (Has EFO expansion)
325    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
326    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
327    :param gxa: If True limit the results to experiments from the Gene
328        Expreission Atlas only.
329    :param pmid: Search by PubMed identifier
330    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
331    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)
332   
333    :param expandefo: If True expand the search terms with all its child terms
334        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
335        will be expanded to include for synonyms and sub types of cancer).
336    :param directsub: If True return only experiments submited directly to
337        Array Express else if False return only experiments imported from GEO
338        database (default None, return both)
339    :param assaycount: A two tuple (min, max) for filter on the number of
340        assays (e.g. (1, 5) will return only experiments with at least one
341        and no more then 5 assays).
342    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
343    :param sacount: Filter on the number of sample attribute categories
344    :param rawcount: Filter on the number or raw files
345    :param fgemcount: Filter on the number of final gene expression matrix
346        (processed data) files
347    :param miamescore: Filter on the MIAME complience score (max 5)
348    :param date: Filter by release date
349   
350    Example ::
351   
352        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
353        {u'experiments': ...
354       
355    .. _EFO: http://www.ebi.ac.uk/efo/
356   
357    """
358    if connection is None:
359        connection = ArrayExpressConnection()
360       
361    stream = connection.query_experiment(keywords=keywords, accession=accession,
362                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
363                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
364                directsub=directsub, assaycount=assaycount, efcount=efcount,
365                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
366                miamescore=miamescore, date=date,  format=format,
367                wholewords=wholewords)
368   
369    if format == "json":
370        return parse_json(stream)
371    else:
372        return parse_xml(stream)
373   
374   
375def query_files(keywords=None, accession=None, array=None, ef=None,
376               efv=None, expdesign=None, exptype=None,
377               gxa=None, pmid=None, sa=None, species=None,
378               expandefo=None, directsub=None, assaycount=None,
379               efcount=None, samplecount=None, rawcount=None,
380               fgemcount=None, miamescore=None, date=None, 
381               format="json", wholewords=None, connection=None):
382    """ Query Array Express files.
383   
384    This function accepts the same arguments as `query_experiments`.
385   
386    Example ::
387   
388        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
389        <xml.etree.ElementTree.ElementTree object ...
390       
391    .. todo:: why does the json interface not work.
392                       
393    """
394    if connection is None:
395        connection = ArrayExpressConnection()
396   
397    stream = connection.query_files(keywords=keywords, accession=accession,
398                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
399                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
400                directsub=directsub, assaycount=assaycount, efcount=efcount,
401                samplecount=samplecount, rawcount=rawcount, fgemcount=fgemcount,
402                miamescore=miamescore, date=date,  format=format,
403                wholewords=wholewords)
404   
405    if format == "json":
406        return parse_json(stream)
407    else:
408        return parse_xml(stream)
409   
410   
411def open_file(accession, kind="raw", ext=None, repo_dir=None):
412    """ Open a file for the experiment.
413     
414    Example ::
415   
416        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")
417       
418    """
419    raise NotImplementedError
420
421"""\
422MAGE-TAB convinence functions, classes
423======================================
424"""
425
426IDF_SINGLE_TAGS = \
427    ["Investigation Title",
428     "Date of Experiment",
429     "Public Release Date",
430     "Experiment Description",
431    ]
432
433def parse_idf(file):
434    """ Parse an idf.txt (Investigation Description  Format) formated file.
435    Return a list of tuples where the first element is the tag (first column)
436    and the second element is a list of all tag values.
437   
438    """
439    if isinstance(file, basestring):
440        file = open(file, "rb")
441    data = file.read()
442    lines = data.splitlines()
443    lines = [line.split("\t") for line in lines if line and not line.startswith("#")]
444    parsed = [(line[0], line[1:]) for line in lines]
445    return parsed
446   
447def parse_sdrf(file):
448    """ Parse an sdfr formated file. Return a tuple with the first element
449    a list of header values and the second element is a list of all rows
450    (a row is a list of string values).
451   
452    """
453    if isinstance(file, basestring):
454        file = open(file, "rb")
455    data = file.read()
456    lines = data.splitlines()
457    lines = [line.split("\t") for line in lines if line.strip() and not line.startswith("#")]
458    header = [h for h in lines[0] if h]
459    rows = [line[:len(header)] for line in lines[1:]]
460    assert(all([len(r) == len(header) for r in rows]))
461    return header, rows
462   
463def parse_adf(file):
464    pass
465
466def parse_data_matrix(file):
467    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
468    elements are:
469        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
470        - a list of quantitations for header values (e.g. ["log2 ratio", "log2 ratio", ...])
471        - a (row REF, row names list) tuple ("e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
472        - a list of list matrix with values (as strings)
473       
474    """
475    if isinstance(file, basestring):
476        file = open(file, "rb")
477    data = file.read()
478    lines = data.splitlines()
479    lines = [line.split("\t") for line in lines if line.strip()]
480    header = lines[0]
481    header_ref, header = header[0], header[1:]
482    line2 = lines[1]
483    row_ref, quanifications = line2[0], line2[1:]
484    row_names, rows = [], []
485    for line in lines[2:]:
486        r_name, row = line[0], line[1:]
487        row_names.append(r_name)
488        rows.append(row)
489       
490    return ((header_ref, header),
491            quanifications,
492            (row_ref, row_names),
493            rows) 
494   
495class InvestigationDesign(dict):
496    """ Investigation design (contains the contents of the .idf).
497   
498    Example ::
499   
500        >>> idf = InvestigationDesign("foobar.idf")
501        >>> print idf.investigation_title
502        foo investigation
503        >>> print idf.experimental_design
504        ['fubar', 'snafu']
505        >>> print idf.sdrf_file
506        ['foobar.sdrf']
507       
508    """
509    def __init__(self, idf_file=None):
510        idf = parse_idf(idf_file)
511        self._idf = idf
512        self.update(dict(idf))
513        for tag, values in idf:
514            if tag in IDF_SINGLE_TAGS:
515                values = values[0] if values else None
516            ttag = self.transform_tag(tag)
517            setattr(self, ttag, values)
518       
519    def transform_tag(self, tag):
520        """ Transform the tag into a proper python attribute name by
521        replacing all spaces and special characters (e.g '[', ']' into
522        underscores).
523       
524        """
525        toreplace = [" ", "-", "[", "]"]
526        for s in toreplace:
527            tag = tag.replace(s, "_")
528        return tag.lower()
529           
530    def __getitem__(self, tag):
531        """ Return the tag values
532       
533        Example ::
534       
535            >>> idf["Investigation Title"]
536            'foo investigation'
537           
538        """
539        try:
540            return self._idf_dict[tag]
541        except KeyError:
542            pass
543       
544        ttag = self.transform_tag(tag)
545        if hasattr(self, ttag):
546            return getattr(self, ttag)
547        else:
548            raise KeyError(tag)
549       
550class SampleDataRelationship(object):
551    """ Sample-Data Relationship (contains the contents of the .sdrf file).
552   
553    Example ::
554   
555        >>> sdr = SampleDataRelationship("foobar.sdrf")
556        >>> sdr.source_name
557        ['foo', ...
558       
559        >>> sdr.sample_name
560        ['sampled foo', ...
561       
562        >>> sdr.extract_protocol_ref
563        ['bar the foo', ...
564       
565        >>> sdr.derived_array_data_matrix_file
566        ['foobar.data.txt', ...
567       
568        >>>
569         
570    """
571   
572    # Nodes an edges
573    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
574                   "Labeled Extract Name", "Hybridization Name",
575                   "Assay Name", "Scan Name", "Normalization Name",
576                   "Array Data File", "Derived Array Data File",
577                   "Array Data Matrix File", "Derived Array Data Matrix File",
578                   "Image File", "Protocol REF"]
579   
580    # Attributes for nodes and edges
581    NODE_EDGE_ATTRIBUTES = \
582    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
583     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
584     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
585     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
586     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
587     "Assay Name": ["Technology Type", "Comment"],
588     "Scan Name": ["Comment"],
589     "Normalization Name": ["Comment"],
590     "Array Data File": ["Comment"],
591     "Derived Array Data File": ["Comment"],
592     "Array Data Matrix File": ["Comment"],
593     "Derived Array Data Matrix File": ["Comment"],
594     "Image File": ["Comment"],
595     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
596     }
597   
598    # Attributes
599    ATTRIBUTE_COLUMNS = \
600    {"Characteristics []": ["Unit", "Term Source REF"],
601     "Provider": ["Comment"],
602     "Material Type": ["Term Source REF"],
603     "Label": ["Term Source REF"],
604     "Array Design File": ["Comment"],
605     "Array Design REF": ["Term Source REF", "Comment"],   
606     "Technology Type": ["Term Source REF"],
607     "Factor Value [] ()": ["Unit", "Term Source REF"],
608     "Performer": ["Comment"],
609     "Date": [],
610     "Parameter Value []": ["Unit", "Comment"],
611     "Unit []": ["Term Source REF"],
612     "Description": [],
613     "Term Source REF": ["Term Accession Number"],
614     "Term Accession Number": [],
615     "Comment []": []
616     }
617    def __init__(self, sdrf_file=None):
618        header, rows = parse_sdrf(sdrf_file)
619        self.header = header
620        self.rows = rows
621       
622    def transform_tag(self, tag):
623        """ Transform the tag into a proper python attribute name by
624        replacing all spaces and special characters (e.g '[', ']' into
625        underscores).
626       
627        """
628        toreplace = [" ", "-", "[", "]"]
629        for s in toreplace:
630            tag = tag.replace(s, "_")
631        return tag.lower()
632   
633    def _subsection(self, name):
634        """ Return the named subsection (name must be from the
635        NODES_EDGES list).
636       
637        """
638        idx = self.NODES_EDGES.index(name)
639        start = self.header.index(name)
640        end = -1
641        for name in self.NODES_EDGES[idx + 1:]:
642            if name in self.header[start + 1:]:
643                end = self.header.index(name, start + 1)
644                break
645        return self.header[start:end], [r[start:end] for r in self.rows]
646   
647    def _column(self, name):
648        """ Return the named column.
649        """
650        index = self.header.index(name)
651        return [r[index] for r in self.rows]
652       
653    def source(self):
654        """ Return the Source subsection
655        """
656        return self._subsection("Source Name")
657       
658    def source_name(self):
659        """ Return the Source Name subsection
660        """
661        return self._column("Source Name")
662       
663    def sample(self):
664        """ Return the Sample subsection
665        """
666        return self._subsection("Sample Name")
667       
668    def sample_name(self):
669        """ Return the Sample Name subsection
670        """
671        return self._column("Sample Name")
672       
673    def extract(self):
674        """ Return the Extract subsection
675        """
676        return self._subsection("Extract Name")
677       
678    def extract_name(self):
679        """ Return the Extract Name subsection
680        """
681        return self._column("Extract Name")
682       
683    def labeled_extract(self):
684        """ Return the Labeled Extract subsection
685        """
686        return self._subsection("Labeled Extract Name")
687       
688    def labeled_extract_name(self):
689        """ Return the Labeled Extract Name subsection
690        """
691        return self._column("Labeled Extract Name")
692       
693    def hybridization(self):
694        """ Return the Hibridization subsection.
695        """
696        return self._subsection("Hibridization Name")
697       
698    def hybridization_name(self):
699        """ Return the Hibridization Name subsection.
700        """
701        return self._column("Hibridization Name")
702       
703    def assay(self):
704        """ Return the Assay subsection
705        """
706        return self._subsection("Assay Name")
707       
708    def assay_name(self):
709        """ Return the Assay Name subsection
710        """
711        return self._column("Assay Name")
712       
713    def scan(self):
714        """ Return the Scan subsection
715        """
716        return self._subsection("Scan Name")
717       
718    def scan_name(self):
719        """ Return the Scan name subsection
720        """
721        return self._column("Scan Name")
722       
723    def normalization(self):
724        """ Return the Normalization subsection.
725        """
726        return self._subsection("Normalization Name")
727       
728    def normalization_name(self):
729        """ Return the Normalization Name subsection.
730        """
731        return self._column("Normalization Name")
732         
733    def array_data(self):
734        """ Return the Array Data subsection
735        """
736        return self._subsection("Array Data File")
737   
738    def array_data_file(self):
739        """ Return the Array Data File subsection
740        """
741        return self._column("Array Data File")
742       
743    def derived_array_data(self):
744        """ Return the Derived Array Data subsection
745        """
746        return self._subsection("Derived Array Data File")
747   
748    def derived_array_data_file(self):
749        """ Return the Derived Array Data File subsection
750        """
751        return self._column("Derived Array Data File")
752       
753    def array_data_matrix(self):
754        """ Return the Array Data Matrix subsection.
755        """
756        return self._subsection("Array Data Matrix File")
757   
758    def array_data_matrix_file(self):
759        """ Return the Array Data Matrix File subsection.
760        """
761        return self._column("Array Data Matrix File")
762       
763    def derived_array_data_matrix(self):
764        """ Return the Derived Array Data Matrix subsection.
765        """
766        return self._subsection("Derived Array Data Matrix File")
767   
768    def derived_array_data_matrix_file(self):
769        """ Return the Derived Array Data Matrix File subsection.
770        """
771        return self._column("Derived Array Data Matrix File")
772       
773    def image(self):
774        """ Return the Image subsection
775        """
776        return self._subsection("Image File")
777   
778    def image_file(self):
779        """ Return the Image File subsection.
780        """
781        return self._column("Image File")
782       
783class ArrayDesign(object):
784    """ Arary design (contains the contents of the .adf file).
785    """
786    def __init__(self, adf_file=None):
787        adf = parse_adf(adf_file)
788        self._adf = adf
789   
790def _is_float(str):
791    try:
792        float(str)
793        return True
794    except ValueError:
795        return False
796   
797def _is_continuous(items, check_count=100):
798    """ Are the strings in items continous numbers.
799    """
800    count = 0
801    i = 0
802    for i, item in enumerate(items):
803        if _is_float(item):
804            count += 1
805        if i >= check_count:
806            break
807    return count >= i * 0.5
808   
809def processed_matrix_to_orange(matrix_file):
810    """ Load a single processed matrix file in to an Orange.data.Table
811    instance.
812    """
813    import numpy
814    import Orange
815   
816    if isinstance(matrix_file, basestring):
817        matrix_file = open(matrix_file, "rb")
818       
819#    data_matrix = matrix_file.read()
820    header, quantification, rows, matrix = parse_data_matrix(matrix_file)
821    header_ref, header = header
822    row_ref, rows = rows
823   
824    matrix = numpy.array(matrix, dtype=object)
825   
826   
827    features = []
828    is_float = numpy.frompyfunc(_is_float, 1, 1) # an numpy ufunc
829         
830    for header_name, quant, column in zip(header, quantification, matrix.T):
831        if _is_continuous(column):
832            feature = Orange.feature.Continuous(header_name)
833            column[numpy.where(1 - is_float(column))] = "?" # relace all non parsable floats with '?'
834        else:
835            values = set(column)
836            feature = Orange.feature.Discrete(header_name, values=sorted(values))
837        feature.attributes["quantification"] = quant
838        features.append(feature)
839       
840    row_ref_feature = Orange.feature.String(row_ref)
841    domain = Orange.data.Domain(features, None)
842    domain.addmeta(Orange.feature.Descriptor.new_meta_id(), row_ref_feature)
843   
844    table = Orange.data.Table(domain, [list(row) for row in matrix])
845   
846    # Add row identifiers
847    for instance, row in zip(table, rows):
848        instance[row_ref_feature] = row
849   
850    return table
851   
852
853def mage_tab_to_orange(idf_filename):
854    """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table
855    instance (assumes all the associated MAGE-TAB files are in the same
856    directory.
857   
858    .. todo:: Add Characteristics, Factor Values ... to the feature.attributes dict
859   
860    """
861    import Orange
862    dirname = os.path.dirname(idf_filename)
863    idf = InvestigationDesign(idf_filename)
864   
865    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
866    sdrf = SampleDataRelationship(sdrf_filename)
867   
868    data_matrices = set(sdrf.derived_array_data_matrix_file())
869    data_matrices = [name for name in data_matrices if name.strip()]
870   
871    tables = []
872    for filename in data_matrices:
873        matrix_file = os.path.join(dirname, filename)
874        table = processed_matrix_to_orange(matrix_file)
875        tables.append(table)
876       
877    return hstack_tables(tables)
878   
879def hstack_tables(tables):
880    """ Stack the tables horizontaly.
881    """
882    import Orange
883    max_len = max([len(table) for table in tables])
884    stacked_features = []
885    stacked_values = [[] for i in range(max_len)]
886    stacked_meta_features = []
887    stacked_meta_values = [{} for i in range(max_len)]
888   
889    for table in tables:
890        stacked_features.extend(table.domain.variables)
891        stacked_meta_features.extend(table.domain.getmetas().items())
892       
893        for i, instance in enumerate(table):
894            stacked_values[i].extend(list(instance))
895            stacked_meta_values[i].update(instance.getmetas())
896           
897        # Fill extra lines with unknowns
898        for i in range(len(table), max_len):
899            stacked_values[i].extend(["?"] * len(table.domain.variables))
900       
901    domain = Orange.data.Domain(stacked_features, tables[-1].domain.class_var)
902    domain.addmetas(dict(set(stacked_meta_features)))
903    table = Orange.data.Table(domain, stacked_values)
904   
905    # Add meta attributes
906    for instance, metas in zip(table, stacked_meta_values):
907        for m, val in metas.iteritems():
908            instance[m] = val
909           
910    return table
911   
912def _dictify(element):
913    """ 'Dictify' and xml.etree.Element.Element instance by taking all
914    subnode tags as keys and the corresponding text values as items i.e. turn
915    `<node><bla>foo</bla><a>b</b></node>` into a {"bla":"foo", "a":b"}
916       
917    """
918    if element is None:
919        element = []
920    dict = {}
921    strip = lambda s: s.strip() if s else s
922    for node in element:
923        dict[node.tag] = strip(getattr(node, "text", None))
924    return dict
925   
926class SearchableList(list):
927    """ A list with a `search` method
928    """
929    def search(self, **kwargs):
930        """ Search the list for elements with members that correspond the
931        supplied keyword arguments.
932       
933            >>> foo.bar = "foo"
934            >>> list = SearchableList([foo, bar])
935            >>> list.search(bar="foo") # Search for objects which have a member named "bar" and that member equals "foo"
936            [<__main__.foo object ...
937           
938        """
939        ret = []
940        for obj in self:
941            for member, value in kwargs.items():
942                if hasattr(obj, member) and getattr(obj, member) == value:
943                    ret.append(obj)
944        return ret
945   
946class ArrayExpressExperiment(object):
947    """ An convinience class representing an Array Express Experiment.
948   
949    Example ::
950   
951        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
952        >>> print ae.name
953        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples
954       
955        >>> for file in ae.files:
956        ...     print file["name"], file["url"]
957        E-MEXP-2917.sdrf.txt http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.sdrf.txt
958        ...
959       
960        >>> table = ae.fgem_to_table() # Retieve the experiment data table
961           
962    """
963   
964    def __init__(self, accession, connection=None):
965        self.accession = accession
966        self.connection = connection
967        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
968        experiments = tree.findall("experiment")
969        # find the exact match (more then one experiment can be listed in the query result)
970        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
971        self._experiment = experiment = experiments[0]
972       
973        self.species = [e.text for e in experiment.findall("species")]
974        bool_values = {"true": True, "false": False}
975        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available","false")]
976        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]
977       
978        self.sampleattributes = []
979        for sa in experiment.findall("sampleattribute"):
980            category = sa.find("category").text.strip()
981            values = [val.text for val in sa.findall("value")]
982            self.sampleattributes.append((category, values))
983           
984        self.experimentalfactors = []
985        for ef in experiment.findall("experimentalfactor"):
986            name = ef.find("name").text.strip()
987            values = [val.text.strip() for val in ef.findall("values")]
988            self.experimentalfactors.append((name, values))
989           
990        self.miamescores = _dictify(experiment.find("miamescores"))
991           
992        self.id = experiment.find("id").text
993        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
994        self.name = experiment.find("name").text
995        self.experimenttype = experiment.find("experimenttype").text.strip()
996        self.releasedate = experiment.find("releasedate").text
997        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
998        self.samples = int(experiment.find("samples").text)
999        self.assays = int(experiment.find("assays").text)
1000       
1001        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
1002           
1003        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
1004        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
1005        self.provider = [_dictify(e) for e in experiment.findall("provider")]
1006       
1007        self.experimentdesign = []
1008        for expd in experiment.findall("experimentdesign"):
1009            self.experimentdesign.append(expd.text)
1010           
1011        self.description = [_dictify(e) for e in experiment.findall("description")]
1012       
1013        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
1014        experiments = tree.findall("experiment")
1015        experiments = [e for e in experiments if e.find("accession").text.strip() == accession]
1016        experiment = experiments[0]
1017        files = experiment.findall("file")
1018        self.files = [_dictify(file) for file in files]
1019       
1020    def _download_processed(self):
1021        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
1022       
1023        .. todo:: What about the raw data files (we need converters for other file types)
1024        """
1025        assert(self.fgemdatafiles)
1026        exp_files = [(f["kind"], f) for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
1027        exp_files += [(f["kind"], f) for f in self.files if f.get("kind") == "fgem"]
1028        array_files = [(f["kind"], f) for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]
1029        assert(len(files) == 3)
1030       
1031        for type, file in files.iteritems():
1032            url = file["url"].strip()
1033            rest, basename = os.path.split(url)
1034            _, dirname = os.path.split(rest)
1035           
1036            repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1037            try:
1038                os.makedirs(repo_dir)
1039            except OSError:
1040                pass
1041            local_filename = os.path.join(repo_dir, basename)
1042            stream = urllib2.urlopen(url)
1043            shutil.copyfileobj(stream, open(local_filename, "wb"))
1044           
1045            if file["extension"] == "zip":
1046                import zipfile
1047                zfile = zlib.ZipFile(local_filename)
1048                zfile.extractall(repo_dir)
1049            elif file["extension"] == "gz":
1050                import gzip
1051                gzfile = gzip.open(local_filename)
1052                gzfile.extractall(repo_dir)
1053            elif file["extension"] in ["tgz", "tar"]:
1054                import tarfile
1055                tfile = tarfile.TarFile(local_filename)
1056                tfile.extractall(repo_dir)
1057            elif file["extension"] == "txt":
1058                pass
1059            else:
1060                raise ValueError("Unknown extension ('{0}').".format(basename))
1061           
1062    def _download_file(self, url, extract=True):
1063        """ Download the `file` from the ArrayExpress saving it to a local
1064        repository directory.
1065         
1066        """
1067        rest, basename = posixpath.split(url)
1068        dirname = posixpath.basename(rest)
1069        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
1070        try:
1071            os.makedirs(repo_dir)
1072        except OSError:
1073            pass
1074        stream = urllib2.urlopen(url)
1075        local_filename = os.path.join(repo_dir, basename)
1076        shutil.copyfileobj(stream, open(local_filename, "wb"))
1077       
1078        if extract:
1079            _, extension = os.path.splitext(local_filename)
1080            if extension == ".zip":
1081                import zipfile
1082                zfile = zipfile.ZipFile(local_filename)
1083                zfile.extractall(repo_dir)
1084            elif extension == ".gz":
1085                import gzip
1086                gzfile = gzip.open(local_filename)
1087                gzfile.extractall(repo_dir)
1088            elif extension in [".tgz"]:
1089                import tarfile
1090                tfile = tarfile.TarFile(local_filename)
1091                tfile.extractall(repo_dir)
1092            elif extension == ".txt":
1093                pass
1094            else:
1095                raise ValueError("Unknown extension ('{0}').".format(basename))
1096           
1097    def _is_local(self, url):
1098        """ Is the `url` stored in the local repository.
1099        """
1100        return os.path.exists(self._local_filepath(url))
1101   
1102    def _local_filepath(self, url):
1103        """ Return the local file path for url.
1104        """
1105        rest, basename = posixpath.split(url)
1106        dirname = posixpath.basename(rest)
1107        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))
1108   
1109    def _open(self, url):
1110        """ Return an open file like handle to url (ArrayExpress file).
1111        The file is cached in the local repository for future access.
1112       
1113        """
1114        if not self._is_local(url):
1115            self._download_file(url, extract=True)
1116        file = self._local_filepath(url)
1117        return open(file, "rb")
1118   
1119    def _search_files(self, kind=None, extension=None):
1120        """ Search files by `kind` and `extension`.
1121        """
1122        res = []
1123        for file in self.files:
1124            kind_match = kind == file.get("kind") or kind is None
1125            extension_match = extension == file.get("extension") or extension is None
1126           
1127            if kind_match and extension_match:
1128                res.append(file)
1129        return res
1130       
1131    def array_design(self):
1132        """ Return a list of `ArrayDesign` instances used in this experiment.
1133        """
1134        files = [f for f in self.files if f.get("kind") == "adf" and \
1135                 f.get("extension") == "txt"]
1136       
1137        array_design = []
1138        for file in files:
1139            url = file.get("url")
1140            if not self._is_local(url):
1141                self._download_file(url)
1142            array_design.append(ArrayDesign(self._open(url)))
1143        return array_design
1144       
1145    def investigation_design(self):
1146        """ Return an `InvestigationDesgin` instance for this experiment
1147        """
1148        files = [f for f in self.files if f.get("kind") == "idf" and \
1149                 f.get("extension") == "txt"]
1150        if not files:
1151            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
1152        file = files[0]
1153        return InvestigationDesign(self._open(file.get("url")))
1154       
1155       
1156    def sample_data_relationship(self):
1157        """ Return an `SampleDataRelationship` instance describing this experiment.
1158        """
1159        files = [f for f in self.files if f.get("kind") == "sdrf" and \
1160                 f.get("extension") == "txt"]
1161        if not files:
1162            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
1163        file = files[0]
1164        return SampleDataRelationship(self._open(file.get("url")))
1165       
1166    def fgem_to_table(self):
1167        """ Retrieve the processed matrix from the Array Express ftp
1168        server and convert it to an :class:`Orange.data.Table` instance.
1169         
1170        """
1171        assert(self.fgemdatafiles)
1172        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
1173        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
1174        sdrf = self._search_files("sdrf", "txt")
1175        if sdrf:
1176            sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
1177            if "Derived Array Data Matrix File" not in sdrf.header:
1178                twocol = self._search_files("twocolumn", "txt")
1179                if twocol:
1180                    sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
1181        matrix_file = self._search_files("fgem")[0]
1182        self._open(matrix_file.get("url")) 
1183        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))
1184       
1185        idf_file = self._search_files("idf", "txt")[0]
1186        self._open(idf_file.get("url")) # To download if not cached
1187        return mage_tab_to_orange(os.path.join(repo_dir, idf_file.get("name")))
1188       
1189   
1190"""\
1191Gene Expression Atlas
1192---------------------
1193
1194.. WARNING:: Deprecated, use ``obiGeneAtlas`` instead.
1195
1196`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1197gene expression experiments in Array Express Archive.
1198
1199Use `query_atlas_simple` for simple querys.
1200
1201Example (query human genes for experiments in which they are up regulated) ::
1202
1203    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1204    {u'...
1205   
1206Or use the `AtlasCondition` subclasses in this module to construct a more
1207advanced query and use the `query_atlas` function.
1208
1209Example (query human genes annotated to the GO term 'transporter activity'
1210that are up regulated in the liver in at least three experiments) ::
1211
1212    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1213    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1214    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1215    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1216    >>> query_atlas(cond_list)
1217    {u'...
1218   
1219"""
1220
1221class GeneExpressionAtlasConenction(object):
1222    """ A connection to Gene Expression Atlas database.
1223    """
1224    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"
1225    try:
1226        DEFAULT_CACHE = shelve.open(orngServerFiles.localpath("ArrayExpress", "GeneAtlasCache.shelve"))
1227    except Exception, ex:
1228        print ex
1229        DEFAULT_CACHE = {}
1230    def __init__(self, address=None, timeout=30, cache=None):
1231        """ Initialize the conenction.
1232       
1233        :param address: Address of the server.
1234        :param timeout: Socket timeout.
1235        :param cache : A dict like object to use as a cache.
1236       
1237        """
1238        self.address = address if address is not None else self.DEFAULT_ADDRESS
1239        self.timeout = timeout
1240        self.cache = cache if cache is not None else self.DEFAULT_CACHE
1241   
1242    def query(self, condition, format="json", start=None, rows=None, indent=False):
1243        url = self.address + "api/vx?" + condition.rest()
1244        if start is not None and rows is not None:
1245            url += "&start={0}&rows={1}".format(start, rows)
1246        url += "&format={0}".format(format)
1247        if indent:
1248            url += "&indent"
1249#        print url
1250        if self.cache is not None:
1251            return self._query_cached(url)
1252        else:
1253            return urllib2.urlopen(url)
1254        return response
1255   
1256    def _query_cached(self, url):
1257        if self.cache is not None:
1258            if url not in self.cache:
1259                response = urllib2.urlopen(url)
1260                contents = response.read()
1261                self.cache[url] = contents
1262                if hasattr(self.cache, "sync"):
1263                    self.cache.sync()
1264            return StringIO(self.cache[url])
1265        else:
1266            return urllib2.urlopen(url)
1267               
1268   
1269# Names of all Gene Property filter names
1270GENE_FILTERS = \
1271    ["Name", # Gene name
1272     "Goterm", #Gene Ontology Term
1273     "Interproterm", #InterPro Term
1274     "Disease", #Gene-Disease Assocation
1275     "Keyword", #Gene Keyword
1276     "Protein", #Protein
1277
1278     "Dbxref", #Other Database Cross-Refs
1279     "Embl", #EMBL-Bank ID
1280     "Ensfamily", #Ensembl Family
1281     "Ensgene", #Ensembl Gene ID
1282
1283     "Ensprotein", #Ensembl Protein ID
1284     "Enstranscript", #Ensembl Transcript ID
1285     "Goid", #Gene Ontology ID
1286     "Image", #IMAGE ID
1287     "Interproid", #InterPro ID
1288     "Locuslink", #Entrez Gene ID
1289
1290     "Omimid", #OMIM ID
1291     "Orf", #ORF
1292     "Refseq", #RefSeq ID
1293     "Unigene", #UniGene ID
1294     "Uniprot", #UniProt Accession
1295
1296     "Hmdb", #HMDB ID
1297     "Chebi", #ChEBI ID
1298     "Cas", #CAS
1299     "Uniprotmetenz", #Uniprotmetenz
1300     "Gene", #Gene Name or Identifier
1301     "Synonym", #Gene Synonym
1302     ]
1303   
1304# Valid Gene Property filter qualifiers
1305GENE_FILTER_QUALIFIERS =\
1306    ["Is",
1307     "IsNot"
1308     ]
1309
1310# Organisms in the Atlas
1311ATLAS_ORGANISMS = \
1312    ["Anopheles gambiae",
1313     "Arabidopsis thaliana",
1314     "Bos taurus",
1315     "Caenorhabditis elegans",
1316     "Danio rerio",
1317     "Drosophila melanogaster",
1318     "Epstein barr virus",
1319     "Gallus gallus",
1320     "Homo sapiens",
1321     "Human cytomegalovirus",
1322     "Kaposi sarcoma-associated herpesvirus",
1323     "Mus musculus",
1324     "Rattus norvegicus",
1325     "Saccharomyces cerevisiae",
1326     "Schizosaccharomyces pombe",
1327#     "Unknown",
1328     "Xenopus laevis"
1329     ]
1330   
1331#_COMMON_TAXIDS = \
1332#    {"Anopheles gambiae",
1333#     "Arabidopsis thaliana",
1334#     "Bos taurus",
1335#     "Caenorhabditis elegans",
1336#     "Danio rerio",
1337#     "Drosophila melanogaster",
1338#     "Epstein barr virus",
1339#     "Gallus gallus",
1340#     "Homo sapiens",
1341#     "Human cytomegalovirus",
1342#     "Kaposi sarcoma-associated herpesvirus",
1343#     "Mus musculus",
1344#     "Rattus norvegicus",
1345#     "Saccharomyces cerevisiae",
1346#     "Schizosaccharomyces pombe",
1347##     "Unknown",
1348#     "Xenopus laevis"
1349#     }
1350   
1351def ef_ontology():
1352    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology
1353    """
1354    import obiOntology
1355#    return obiOntology.OBOOntology(urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo"))
1356    import orngServerFiles
1357    # Should this be in the OBOFoundry (Ontology) domain
1358    try:
1359        file = open(orngServerFiles.localpath_download("ArrayExpress", "efo.obo"), "rb")
1360    except urllib2.HTTPError:
1361        file = urllib2.urlopen("http://efo.svn.sourceforge.net/svnroot/efo/trunk/src/efoinobo/efo.obo")
1362    return obiOntology.OBOOntology(file)
1363
1364
1365class AtlasCondition(object):
1366    """ Base class for Gene Expression Atlas query condition
1367    """
1368    def validate(self):
1369        """ Validate condition in a subclass.
1370        """
1371        raise NotImplementedError
1372   
1373    def rest(self):
1374        """ Return a REST query part in a subclass.
1375        """
1376        raise NotImplementedError
1377   
1378   
1379class AtlasConditionList(list, AtlasCondition):
1380    """ A list of AtlasCondition instances.
1381    """ 
1382    def validate(self):
1383        for item in self:
1384            item.validate()
1385       
1386    def rest(self):
1387        return "&".join(cond.rest() for cond in self)
1388
1389
1390class AtlasConditionGeneProperty(AtlasCondition):
1391    """ An atlas gene filter condition.
1392   
1393    :param property: Property of the gene. If None or "" all properties
1394        will be searched.
1395    :param qualifier: Qualifier can be 'Is' or 'IsNot'
1396    :param value: The value to search for.
1397   
1398    Example ::
1399   
1400        >>> # Condition on a gene name
1401        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
1402        >>> # Condition on genes from a GO Term
1403        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1404        >>> # Condition on disease association
1405        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")
1406       
1407    """
1408    def __init__(self, property, qualifier, value):
1409        self.property = property or ""
1410        self.qualifier = qualifier
1411        if isinstance(value, basestring):
1412            self.value = value.replace(" ", "+")
1413        elif isinstance(value, list):
1414            self.value = "+".join(value)
1415        else:
1416            raise ValueError(value)
1417       
1418        self.validate()
1419       
1420    def validate(self):
1421        assert(self.property in GENE_FILTERS + [""])
1422        assert(self.qualifier in GENE_FILTER_QUALIFIERS + [""])
1423       
1424    def rest(self):
1425        return "gene{property}{qualifier}={value}".format(**self.__dict__)
1426       
1427       
1428class AtlasConditionExperimentalFactor(AtlasCondition):
1429    """ An atlas experimental factor filter condition.
1430   
1431    :param factor: EFO experiamntal factor
1432    :param regulation: "up", "down", "updown", "any" or "none"
1433    :param n: Minimum number of of experimants with this condition
1434    :param value: Experimantal factor value
1435   
1436    Example ::
1437   
1438        >>> # Any genes up regulated in at least 3 experiments involving cancer.
1439        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
1440        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
1441        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")
1442       
1443    """
1444    def __init__(self, factor, regulation, n, value):
1445        self.factor = factor
1446        self.regulation = regulation
1447        self.n = n
1448        self.value = value
1449        self.validate()
1450       
1451    def validate(self):
1452        # TODO: validate the factor and value
1453#        assert(self.factor in ef_ontology())
1454        assert(self.regulation in ["up", "down", "updown"])
1455       
1456    def rest(self):
1457        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1458       
1459class AtlasConditionOrganism(AtlasCondition):
1460    """ Condition on organism.
1461    """
1462    def __init__(self, organism):
1463        self.organism = organism
1464        self.validate()
1465       
1466    def validate(self):
1467        assert(self.organism in ATLAS_ORGANISMS)
1468       
1469    def rest(self):
1470        return "species={0}".format(self.organism.replace(" ", "+").lower())
1471       
1472       
1473class AtlasConditionExperiment(AtlasCondition):
1474    """ Condition on experiement
1475   
1476    :param property: Property of the experiment. If None or "" all properties
1477        will be searched.
1478    :param qualifier: Qualifier can be 'Has' or 'HasNot'
1479    :param value: The value to search for.
1480   
1481    Example ::
1482   
1483        >>> # Condition on a experiemnt acession
1484        >>> condition = AtlasConditionExperiment("", "", "E-GEOD-24283")
1485        >>> # Condition on experiments involving lung
1486        >>> condition = AtlasConditionExperiment("Organism_part", "Has", "lung")
1487       
1488    """
1489#    EXPERIMENT_FILTERS = [
1490#                "Organism"
1491#                "Factor"]
1492   
1493    EXPERIMENT_FILTER_QUALIFIERS = [
1494                "Has",
1495                "HasNot"]
1496   
1497    def __init__(self, property, qualifier, value):
1498        self.property = property
1499        self.qualifier = qualifier
1500        if isinstance(value, basestring):
1501            self.value = value.replace(" ", "+")
1502        elif isinstance(value, list):
1503            self.value = "+".join(value)
1504        else:
1505            raise ValueError(value)
1506       
1507        self.validate()
1508       
1509    def validate(self):
1510        # TODO: check to EFO factors
1511#        assert(self.property in EXPERIMENT_FILTERS + [""])
1512        assert(self.qualifier in self.EXPERIMENT_FILTER_QUALIFIERS + [""])
1513       
1514    def rest(self):
1515        return "experiment{property}{qualifier}={value}".format(**self.__dict__)
1516       
1517       
1518class GeneAtlasError(ValueError):
1519    """ An error response from the Atlas server.
1520    """
1521    pass
1522   
1523   
1524def __check_atlas_error_json(response):
1525    if "error" in response:
1526        raise GeneAtlasError(response["error"])
1527    return response
1528 
1529     
1530def __check_atlas_error_xml(response):
1531    error = response.find("error")
1532    if error is not None:
1533        raise GeneAtlasError(error.text)
1534    return response
1535   
1536       
1537def query_atlas_simple(genes=None, regulation=None, organism=None,
1538                       condition=None, format="json", start=None,
1539                       rows=None):
1540    """ A simple Atlas query.
1541   
1542    :param genes: A list of gene names to search for.
1543    :param regulation: Search for experiments in which `genes` are "up",
1544        "down", "updown" or "none" regulated. If None all experiments
1545        are searched.
1546    :param organism: Search experiments for organism. If None all experiments
1547        are searched.
1548    :param condition: An EFO factor value (e.g. "brain")
1549   
1550    Example ::
1551       
1552        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
1553        {u'...
1554       
1555        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
1556        {u'...
1557       
1558        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
1559        {u'...
1560       
1561    """
1562    import warnings
1563    warnings.warn("Use 'obiGeneAtlas.run_simple_query' instead.", DeprecationWarning)
1564    conditions = AtlasConditionList()
1565    if genes:
1566        conditions.append(AtlasConditionGeneProperty("Gene", "Is", genes))
1567    if regulation or condition:
1568        regulation = "any" if regulation is None else regulation
1569        condition = "" if condition is None else condition
1570        conditions.append(AtlasConditionExperimentalFactor("", regulation, 1, condition))
1571    if organism:
1572        conditions.append(AtlasConditionOrganism(organism))
1573       
1574    connection = GeneExpressionAtlasConenction()
1575    results = connection.query(conditions, format=format, start=start,
1576                               rows=rows)
1577    if format == "json":
1578        return parse_json(results)
1579    else:
1580        return parse_xml(results)
1581
1582"""\
1583.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1584      Need a full list of accepted factors
1585"""
1586
1587def query_atlas(condition, format="json", start=None, rows=None, indent=False, connection=None):
1588    """ Query Atlas based on a `condition` (instance of AtlasCondition)
1589   
1590    Example ::
1591       
1592        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
1593        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
1594        >>> condition = AtlasConditionList([condition1, condition2])
1595        >>> query_atlas(condition)
1596        {u'...
1597       
1598    """
1599    import warnings
1600    warnings.warn("Use 'obiGeneAtlas.run_query' instead.", DeprecationWarning)
1601    if connection is None:
1602        connection = GeneExpressionAtlasConenction()
1603    results = connection.query(condition, format=format, start=start,
1604                               rows=rows, indent=indent)
1605    if format == "json":
1606        response = parse_json(results)
1607        return __check_atlas_error_json(response)
1608    else:
1609        response = parse_xml(results)
1610        return __check_atlas_error_xml(response)
1611
1612
1613def get_atlas_summary(genes, organism, connection=None):
1614    """ Return 3 dictionaries containing a summary of atlas information
1615    about three experimental factors:
1616   
1617        - Organism Part (OP)
1618        - Disease State (DS)
1619        - Cell type (CT)
1620   
1621    Each dictionary contains query genes as keys. Values are dictionaries
1622    mapping factor values to a 2-tuple containig the count of up regulated
1623    and down regulated experiments.
1624   
1625    Example ::
1626   
1627        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
1628        ({u'RUNX1': ...
1629       
1630    """
1631    import warnings
1632    warnings.warn("Use 'obiGeneAtlas.get_atlas_summary' instead.", DeprecationWarning)
1633    genes_condition = AtlasConditionGeneProperty("Gene", "Is", genes)
1634    org_condition = AtlasConditionOrganism(organism)
1635    condition = AtlasConditionList([genes_condition, org_condition])
1636    result = query_atlas(condition, format="json", connection=connection)
1637   
1638    org_part = collect_ef_summary(result, "organism_part")
1639    disease_state = collect_ef_summary(result, "disease_state")
1640    cell_type = collect_ef_summary(result, "cell_type")
1641   
1642    return dict(org_part), dict(disease_state), dict(cell_type)
1643   
1644   
1645def collect_ef_summary(info, ef, summary=None):
1646    """ Collect the results summary from query_atlas, result for experimental
1647    factor `ef`.
1648    """
1649    if summary is None:
1650        summary = defaultdict(dict)
1651       
1652    results = info["results"]
1653    for res in results:
1654        gene = res["gene"]
1655        expressions = res["expressions"] 
1656        for expression in expressions:
1657            if expression["ef"] == ef:
1658                efv = expression["efv"]
1659                updown = (expression["upExperiments"],
1660                          expression["downExperiments"]
1661                          )
1662               
1663                if any(updown):
1664                    summary[gene["name"]][efv] = updown
1665   
1666    return summary
1667   
1668   
1669def test():   
1670    conn = ArrayExpressConnection()
1671    import doctest
1672    foo = type("foo", (object,), {})()
1673    bar = type("bar", (object,), {})()
1674   
1675    doctest.testmod(optionflags=doctest.ELLIPSIS,
1676                    extraglobs={"conn": conn, "foo": foo,"bar": bar})
1677   
1678   
1679if __name__ == "__main__":
1680    test()
1681   
Note: See TracBrowser for help on using the repository browser.