source: orange-bioinformatics/obiArrayExpress.py @ 1353:cbffc94cb604

Revision 1353:cbffc94cb604, 50.7 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Added ArrayExpressExperiment class.
Added classes and functions for parsing MAGE-TAB standard files (this still needs lots of work).
Changed the xml parsing code to use the xml.etree parser.

Line 
1"""
2obiArrayExpress
3===============
4
5A python module for accessing the ArrayExpress and GeneExpressionAtlas
6web services.
7
8
9Array Express
10-------------
11
12`Array Express Archive <http://www.ebi.ac.uk/arrayexpress/>`_ is a database of gene expression experiments that you
13can query and download.
14
15Example of an Array Express query ::
16
17    >>> import obiArrayExpress
18    >>> obiArrayExpress.query_experiments(accession='E-MEXP-31')
19    {u'experiments': ...
20   
21    >>> obiArrayExpress.query_files(accession='E-MEXP-32', format="xml")
22    <xml.etree.ElementTree.ElementTree instance...
23   
24.. note:: Currently querying ArrayExpress files only works with the xml format.
25
26.. note:: See the documentation of `query_experiments` for a full set of
27          parameters that these functions accept.
28
29"""
30
31import os, sys
32import urllib2
33
34import orngEnviron
35import orngServerFiles
36import warnings
37import posixpath
38import shelve
39import shutil
40import posixpath
41import json
42from xml.etree.ElementTree import ElementTree
43
44from collections import defaultdict
45
# Parse an open JSON stream (a file-like object) into python objects.
parse_json = json.load
47
def parse_xml(stream):
    """ Read the xml document from `stream` and return it wrapped in an
    xml.etree.ElementTree.ElementTree instance.
    """
    tree = ElementTree(file=stream)
    return tree
54
# Names of every searchable ArrayExpress field (the individual fields
# are described in the query_experiments docstring).
ARRAYEXPRESS_FIELDS = [
    "keywords", "accession", "array", "ef", "efv",
    "expdesign", "exptype", "gxa", "pmid", "sa",
    "species", "expandefo", "directsub", "assaycount", "efcount",
    "samplecount", "sacount", "rawcount", "fgemcount", "miamescore",
    "date", "wholewords",
]
81
class forgetfull(dict):
    """ A dictionary that discards every item assignment.
    """
    def __setitem__(self, key, value):
        """ Ignore the assignment; the key is not stored.
        """
        pass
89   
90
class ArrayExpressConnection(object):
    """ A connection to the ArrayExpress. Used for query construction,
    and user login.

    .. todo:: Implement user login.
    """

    DEFAULT_ADDRESS = "http://www.ebi.ac.uk/arrayexpress/{format}/v2/"
    DEFAULT_FORMAT = "json"

    # Response cache shared by all connections that do not supply their own;
    # falls back to a plain in-memory dict if the shelve cannot be opened.
    try:
        DEFAULT_CACHE = shelve.open(orngServerFiles.localpath("ArrayExpress", "ArrayExpressCache.shelve"))
    except Exception:
        DEFAULT_CACHE = {}
    # Order of arguments in the query
    _ARGS_ORDER = ["keywords", "species", "array"]

    def __init__(self, address=None, timeout=30, cache=None,
                 username=None, password=None):
        """ Initialize the connection object.

        :param address: Address of the ArrayExpress API (must contain a
            '{format}' placeholder, see `DEFAULT_ADDRESS`)
        :param timeout: Timeout for the socket connection
        :param cache: A dict like object mapping urls to response contents
            (`DEFAULT_CACHE` is used if None)
        :param username: Stored on the instance (not used yet)
        :param password: Stored on the instance (not used yet)

        .. todo:: Implement user login (see Accessing Private Data in API docs)

        """
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        self.timeout = timeout
        self.cache = cache if cache is not None else self.DEFAULT_CACHE
        self.username = username
        self.password = password

    def format_query(self, **kwargs):
        """ Format the query arguments.

        Arguments named in `_ARGS_ORDER` come first (in that order), the
        remaining ones follow sorted by name, so equal queries always
        produce the same string (the resulting url is used as a cache key).
        Arguments with a None or [] value are left out. A `format` argument
        is ignored here (it is handled in `query_url`).

        Raise a ValueError for an unknown argument name or a value that
        can not be formated.

        Example ::

            >>> conn.format_query(gxa=True, efcount=(1, 5))
            'efcount=[1 TO 5]&gxa=true'

        """
        # Formaters:
        def format_default(val):
            if isinstance(val, basestring):
                return val
            else:
                return "+".join(val)
        def format_species(val):
            return '"%s"' % val.lower()
        def format_gxa(val):
            if val:
                return "true"
            else:
                raise ValueError("gxa={0}".format(val))
        def format_expandefo(val):
            if val:
                return "on"
            else:
                raise ValueError("expandefo={0}".format(val))
        def format_true_false(val):
            return "true" if val else "false"
        def format_interval(val):
            if isinstance(val, tuple):
                return "[{0} TO {1}]".format(*val)
            else:
                raise ValueError("Must be an interval argument (min, max)!")
        def format_date(val):
            return val
        def format_wholewords(val):
            if val:
                return "on"
            else:
                raise ValueError("wholewords={0}".format(val))

        formaters = {"species": format_species,
                     "gxa": format_gxa,
                     "expandefo": format_expandefo,
                     "directsub": format_true_false,
                     "assaycount": format_interval,
                     "efcount": format_interval,
                     "samplecount": format_interval,
                     "sacount": format_interval,
                     "rawcount": format_interval,
                     "fgemcount": format_interval,
                     "miamescore": format_interval,
                     "date": format_date,
                     "wholewords": format_wholewords,
                     }

        def sort_key(item):
            # (rank in _ARGS_ORDER, name); unranked names sort after the
            # ranked ones and alphabetically among themselves.
            key = item[0]
            if key in self._ARGS_ORDER:
                return (self._ARGS_ORDER.index(key), key)
            return (len(self._ARGS_ORDER), key)

        parts = []
        for key, value in sorted(kwargs.items(), key=sort_key):
            if key == "format":
                continue # format is handled in query_url
            if key not in ARRAYEXPRESS_FIELDS:
                raise ValueError("Invalid argument name: '{0}'".format(key))
            if value is not None and value != []:
                fmt = formaters.get(key, format_default)
                parts.append("{0}={1}".format(key, fmt(value)))

        return "&".join(parts)

    def query_url(self, what="experiments", **kwargs):
        """ Return a formated query URL for the query arguments

        :param what: The last path component of the url
            ("experiments" or "files")

        Example ::

            >>> conn.query_url(accession="E-MEXP-31")
            'http://www.ebi.ac.uk/arrayexpress/json/v2/experiments?accession=E-MEXP-31'

        """
        query = self.format_query(**kwargs)
        url = posixpath.join(self.address, what)
        url = url.format(format=kwargs.get("format", self.DEFAULT_FORMAT))
        url = url + ("?" + query if query else "")
        # Only spaces are escaped
        url = url.replace(" ", "%20")
        return url

    def query_url_experiments(self, **kwargs):
        """ Return a formated experiments query url for the calls arguments
        """
        return self.query_url("experiments", **kwargs)

    def query_url_files(self, **kwargs):
        """ Return a formated files query url for the calls arguments
        """
        return self.query_url("files", **kwargs)

    def query_experiment(self, **kwargs):
        """ Return an open stream to the experiments query results.

        .. note:: This member function takes the same arguments as the module
                  level `query_experiments` function.

        """
        url = self.query_url_experiments(**kwargs)
        return self._cache_urlopen(url, timeout=self.timeout)

    def query_files(self, **kwargs):
        """ Return an open stream to the files query results.

        .. note:: This member function takes the same arguments as the module
                  level `query_files` function.
        """
        url = self.query_url_files(**kwargs)
        return self._cache_urlopen(url, timeout=self.timeout)

    def open_file(self, accession, kind="raw", ext=None):
        """ Return a file handle to experiment data.
        Possible values for kind:
            - raw: return the raw data if available
            - fgem: return the processed data if available
            - biosamples: a png or svg design image
            - idf: investigation description
            - adf: array design description
            - mageml: MAGE-ML file

        If `ext` is not None the file extension must match it too.
        Return None if the experiment has no matching file.

        Example ::

            >>> raw_file = conn.open_file("E-TABM-1087", kind="raw")
            >>> processed_file = conn.open_file("E-TABM-1087", kind="fgem")

        """
        def child_text(node, tag):
            # Stripped text of the child element (an empty string if the
            # child is missing or has no text).
            text = getattr(node.find(tag), "text", None)
            return text.strip() if text else ""

        stream = self.query_files(accession=accession, format="xml")
        tree = ElementTree(file=stream)
        for entry in tree.findall("experiment/file"):
            if child_text(entry, "kind") != kind:
                continue
            if ext is not None and child_text(entry, "extension") != ext:
                continue
            url = child_text(entry, "url")
            if url:
                return self._cache_urlopen(url, timeout=self.timeout)
        return None

    def _cache_urlopen(self, url, timeout=30):
        """ Return a stream with the contents of `url`.

        If `self.cache` is None the url is opened directly, otherwise
        the contents are looked up in the cache first and stored in it
        after a download.

        """
        from StringIO import StringIO
        if self.cache is None:
            return urllib2.urlopen(url, timeout=timeout)
        if url in self.cache:
            return StringIO(self.cache[url])
        stream = urllib2.urlopen(url, timeout=timeout)
        try:
            data = stream.read()
        finally:
            stream.close()
        self.cache[url] = data
        # Use the downloaded data (not self.cache[url]) because the cache
        # is allowed to drop the assignment (see `forgetfull`).
        return StringIO(data)
286   
def query_experiments(keywords=None, accession=None, array=None, ef=None,
                      efv=None, expdesign=None, exptype=None,
                      gxa=None, pmid=None, sa=None, species=None,
                      expandefo=None, directsub=None, assaycount=None,
                      efcount=None, samplecount=None, rawcount=None,
                      fgemcount=None, miamescore=None, date=None,
                      format="json", wholewords=None, connection=None,
                      sacount=None):
    """ Query Array Express experiments.

    :param keywords: A list of keywords to search (e.g. ['gliobastoma'])
    :param accession: Search by experiment accession (e.g. 'E-MEXP-31')
    :param array: Search by array design name or accession (e.g. 'A-AFFY-33')
    :param ef: Experimental factor (names of main variables of experiments)
    :param efv: Experimental factor value (Has EFO expansion)
    :param expdesign: Experiment design type. (e.g. ["dose", "response"])
    :param exptype: Experiment type (e.g. 'RNA-Seq', has EFO expansion)
    :param gxa: If True limit the results to experiments from the Gene
        Expression Atlas only.
    :param pmid: Search by PubMed identifier
    :param sa: Sample attribute values (e.g. 'fibroblast', has EFO expansion)
    :param species: Search by species (e.g. 'Homo sapiens', has EFO expansion)

    :param expandefo: If True expand the search terms with all its child terms
        in the Experimental Factor Ontology (EFO_) (e.g. keywords="cancer"
        will be expanded to include for synonyms and sub types of cancer).
    :param directsub: If True return only experiments submitted directly to
        Array Express else if False return only experiments imported from GEO
        database (default None, return both)
    :param assaycount: A two tuple (min, max) for filter on the number of
        assays (e.g. (1, 5) will return only experiments with at least one
        and no more than 5 assays).
    :param efcount: Filter on the number of experimental factors (e.g. (1, 5))
    :param samplecount: A (min, max) tuple filter on the `samplecount` field
    :param sacount: Filter on the number of sample attribute categories
    :param rawcount: Filter on the number or raw files
    :param fgemcount: Filter on the number of final gene expression matrix
        (processed data) files
    :param miamescore: Filter on the MIAME compliance score (max 5)
    :param date: Filter by release date
    :param wholewords: If True the query is sent with `wholewords=on`

    :param format: Format of the query result. The result is parsed as
        json if "json" (default), otherwise it is parsed as xml.
    :param connection: The ArrayExpressConnection instance to use (a new
        connection is created if None)

    Return the parsed json contents if `format` is "json", else an
    xml.etree.ElementTree.ElementTree instance.

    Example ::

        >>> query_experiments(species="Homo sapiens", ef="organism_part", efv="liver")
        {u'experiments': ...

    .. _EFO: http://www.ebi.ac.uk/efo/

    """
    if connection is None:
        connection = ArrayExpressConnection()

    stream = connection.query_experiment(keywords=keywords, accession=accession,
                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
                directsub=directsub, assaycount=assaycount, efcount=efcount,
                samplecount=samplecount, sacount=sacount, rawcount=rawcount,
                fgemcount=fgemcount, miamescore=miamescore, date=date,
                format=format, wholewords=wholewords)

    if format == "json":
        return parse_json(stream)
    else:
        return parse_xml(stream)
349   
350   
def query_files(keywords=None, accession=None, array=None, ef=None,
               efv=None, expdesign=None, exptype=None,
               gxa=None, pmid=None, sa=None, species=None,
               expandefo=None, directsub=None, assaycount=None,
               efcount=None, samplecount=None, rawcount=None,
               fgemcount=None, miamescore=None, date=None,
               format="json", wholewords=None, connection=None,
               sacount=None):
    """ Query Array Express files.

    This function accepts the same arguments as `query_experiments`.

    Return the parsed json contents if `format` is "json", else an
    xml.etree.ElementTree.ElementTree instance.

    Example ::

        >>> query_files(species="Mus musculus", ef="developmental_stage", efv="embryo", format="xml")
        <xml.etree.ElementTree.ElementTree instance...

    .. todo:: why does the json interface not work.

    """
    if connection is None:
        connection = ArrayExpressConnection()

    stream = connection.query_files(keywords=keywords, accession=accession,
                array=array, ef=ef, efv=efv, expdesign=expdesign, exptype=exptype,
                gxa=gxa, pmid=pmid, sa=sa, species=species, expandefo=expandefo,
                directsub=directsub, assaycount=assaycount, efcount=efcount,
                samplecount=samplecount, sacount=sacount, rawcount=rawcount,
                fgemcount=fgemcount, miamescore=miamescore, date=date,
                format=format, wholewords=wholewords)

    if format == "json":
        return parse_json(stream)
    else:
        return parse_xml(stream)
385   
386   
def open_file(accession, kind="raw", ext=None, repo_dir=None):
    """ Open a file for the experiment (not implemented yet, always
    raises a NotImplementedError).

    Example ::

        >>> file = open_file("E-MTAB-369", kind="raw", repo_dir="~/.arrayexpress/")

    """
    raise NotImplementedError()
396
397"""\
MAGE-TAB convenience functions, classes
=======================================
400"""
401
# IDF tags that hold a single value (InvesigationDesign stores the first
# value of these tags instead of the whole value list).
IDF_SINGLE_TAGS = [
    "Investigation Title",
    "Date of Experiment",
    "Public Release Date",
    "Experiment Description",
]
408
def parse_idf(file):
    """ Parse an idf.txt (Investigation Description Format) formated file.
    Return a list of tuples where the first element is the tag (first column)
    and the second element is a list of all tag values.

    :param file: A file name or an open file like object (a file opened
        here from a file name is closed before returning)

    Blank lines and lines starting with '#' are skipped.

    """
    if isinstance(file, basestring):
        # We opened the file, so we must also close it.
        with open(file, "rb") as stream:
            data = stream.read()
    else:
        data = file.read()

    parsed = []
    for line in data.splitlines():
        if not line.strip() or line.startswith("#"):
            continue
        fields = line.split("\t")
        parsed.append((fields[0], fields[1:]))
    return parsed
422   
def parse_sdrf(file):
    """ Parse an sdrf formated file. Return a tuple with the first element
    a list of header values and the second element is a list of all rows
    (a row is a list of string values).

    :param file: A file name or an open file like object (a file opened
        here from a file name is closed before returning)

    Blank lines and lines starting with '#' are skipped. Empty header
    cells are dropped. An input with no data lines returns ([], []).

    """
    if isinstance(file, basestring):
        # We opened the file, so we must also close it.
        with open(file, "rb") as stream:
            data = stream.read()
    else:
        data = file.read()

    lines = [line.split("\t") for line in data.splitlines()
             if line.strip() and not line.startswith("#")]
    if not lines:
        return [], []
    header = [h for h in lines[0] if h]
    rows = lines[1:]
    # Every row must have exactly one value per header column
    assert(all(len(r) == len(header) for r in rows))
    return header, rows
438   
def parse_adf(file):
    """ Placeholder for an adf (array design) file parser; does nothing
    and returns None.
    """
    return None
441
def parse_data_matrix(file):
    """ Parse the MAGE-TAB processed data matrix. Return a tuple where the
    elements are:
        - a (header REF, header values) tuple (e.g. ("Hybridization REF", ["Stage1", "Stage2", ...]) )
        - a list of quantitations for header values (e.g. ["log2 ratio", "log2 ratio", ...])
        - a (row REF, row names list) tuple (e.g. ("Reporter REF", ["Probe 1", "Probe 2", ...]) )
        - a list of list matrix with values (as strings)

    :param file: A file name or an open file like object (a file opened
        here from a file name is closed before returning)

    Blank lines are skipped. The first two remaining lines are the
    header and the quantitations line.

    """
    if isinstance(file, basestring):
        # We opened the file, so we must also close it.
        with open(file, "rb") as stream:
            data = stream.read()
    else:
        data = file.read()

    lines = [line.split("\t") for line in data.splitlines() if line.strip()]
    header_ref, header = lines[0][0], lines[0][1:]
    row_ref, quantifications = lines[1][0], lines[1][1:]
    row_names, rows = [], []
    for line in lines[2:]:
        row_names.append(line[0])
        rows.append(line[1:])

    return ((header_ref, header),
            quantifications,
            (row_ref, row_names),
            rows)
470   
class InvesigationDesign(dict):
    """ Investigation design (contains the contents of the .idf).

    The dictionary maps the tags to lists of tag values. The values are
    also available as attributes named by the transformed tags (see
    `transform_tag`). For tags listed in IDF_SINGLE_TAGS the attribute
    holds only the first value (or None if there are no values).

    Example ::

        >>> idf = InvesigationDesign("foobar.idf")
        >>> print idf.investigation_title
        foo investigation
        >>> print idf.experimental_design
        ['fubar', 'snafu']
        >>> print idf.sdrf_file
        ['foobar.sdrf']

    """
    def __init__(self, idf_file=None):
        idf = parse_idf(idf_file)
        self._idf = idf
        self.update(dict(idf))
        for tag, values in idf:
            if tag in IDF_SINGLE_TAGS:
                values = values[0] if values else None
            setattr(self, self.transform_tag(tag), values)

    def transform_tag(self, tag):
        """ Transform the tag into a proper python attribute name by
        replacing all spaces and special characters (e.g '[', ']') with
        underscores and lower casing the result.

        """
        for s in [" ", "-", "[", "]"]:
            tag = tag.replace(s, "_")
        return tag.lower()

    def __getitem__(self, tag):
        """ Return the tag values. `tag` is first looked up as written in
        the idf file; if that fails the value of the attribute named by
        the transformed tag is returned. Raise a KeyError if neither exist.

        Example ::

            >>> idf["Investigation Title"]
            ['foo investigation']
            >>> idf["investigation_title"]
            'foo investigation'

        """
        try:
            return dict.__getitem__(self, tag)
        except KeyError:
            pass

        # Only look in the instance attributes so dict method names
        # (e.g. 'keys') are never returned as tag values.
        try:
            return vars(self)[self.transform_tag(tag)]
        except KeyError:
            raise KeyError(tag)


# Correctly spelled alias of the class (the original name is kept for
# backward compatibility).
InvestigationDesign = InvesigationDesign
525       
class SampleDataRelationship(object):
    """ Sample-Data Relationship (contains the contents of the .sdrf file).

    Example ::

        >>> sdr = SampleDataRelationship("foobar.sdrf")
        >>> sdr.source_name()
        ['foo', ...

        >>> sdr.sample_name()
        ['sampled foo', ...

        >>> sdr.derived_array_data_matrix_file()
        ['foobar.data.txt', ...

    """

    # Nodes an edges
    NODES_EDGES = ["Source Name", "Sample Name", "Extract Name",
                   "Labeled Extract Name", "Hybridization Name",
                   "Assay Name", "Scan Name", "Normalization Name",
                   "Array Data File", "Derived Array Data File",
                   "Array Data Matrix File", "Derived Array Data Matrix File",
                   "Image File", "Protocol REF"]

    # Attributes for nodes and edges
    NODE_EDGE_ATTRIBUTES = \
    {"Source Name": ["Characteristics", "Provider", "Material Type", "Description", "Comment"],
     "Sample Name": ["Characteristics", "Material Type", "Description", "Comment"],
     "Extract Name":["Characteristics", "Material Type", "Description", "Comment"],
     "Labeled Extract Name": ["Characteristics", "Material Type", "Description", "Label", "Comment"],
     "Hybridization Name": ["Array Design File", "Array Design REF", "Comment"],
     "Assay Name": ["Technology Type", "Comment"],
     "Scan Name": ["Comment"],
     "Normalization Name": ["Comment"],
     "Array Data File": ["Comment"],
     "Derived Array Data File": ["Comment"],
     "Array Data Matrix File": ["Comment"],
     "Derived Array Data Matrix File": ["Comment"],
     "Image File": ["Comment"],
     "Protocol REF": ["Term Source REF", "Parameter", "Performer", "Date", "Comment"]
     }

    # Attributes
    ATTRIBUTE_COLUMNS = \
    {"Characteristics []": ["Unit", "Term Source REF"],
     "Provider": ["Comment"],
     "Material Type": ["Term Source REF"],
     "Label": ["Term Source REF"],
     "Array Design File": ["Comment"],
     "Array Design REF": ["Term Source REF", "Comment"],
     "Technology Type": ["Term Source REF"],
     "Factor Value [] ()": ["Unit", "Term Source REF"],
     "Performer": ["Comment"],
     "Date": [],
     "Parameter Value []": ["Unit", "Comment"],
     "Unit []": ["Term Source REF"],
     "Description": [],
     "Term Source REF": ["Term Accession Number"],
     "Term Accession Number": [],
     "Comment []": []
     }

    def __init__(self, sdrf_file=None):
        header, rows = parse_sdrf(sdrf_file)
        self.header = header
        self.rows = rows

    def transform_tag(self, tag):
        """ Transform the tag into a proper python attribute name by
        replacing all spaces and special characters (e.g '[', ']') with
        underscores and lower casing the result.

        """
        for s in [" ", "-", "[", "]"]:
            tag = tag.replace(s, "_")
        return tag.lower()

    def _subsection(self, name):
        """ Return the named subsection (name must be from the
        NODES_EDGES list) as a (header, rows) tuple.

        The subsection starts at the `name` column and extends up to
        the first column (after it) of a node that follows `name` in
        NODES_EDGES, or to the last column if there is no such column.
        Raise a ValueError if `name` is not in the header.

        """
        idx = self.NODES_EDGES.index(name)
        start = self.header.index(name)
        # Without a following node the subsection runs to the end
        end = len(self.header)
        for following in self.NODES_EDGES[idx + 1:]:
            try:
                end = self.header.index(following, start + 1)
            except ValueError:
                continue
            break
        return self.header[start:end], [r[start:end] for r in self.rows]

    def _column(self, name):
        """ Return the named column (a list with one value per row).
        Raise a ValueError if `name` is not in the header.
        """
        index = self.header.index(name)
        return [r[index] for r in self.rows]

    def source(self):
        """ Return the Source subsection
        """
        return self._subsection("Source Name")

    def source_name(self):
        """ Return the Source Name column
        """
        return self._column("Source Name")

    def sample(self):
        """ Return the Sample subsection
        """
        return self._subsection("Sample Name")

    def sample_name(self):
        """ Return the Sample Name column
        """
        return self._column("Sample Name")

    def extract(self):
        """ Return the Extract subsection
        """
        return self._subsection("Extract Name")

    def extract_name(self):
        """ Return the Extract Name column
        """
        return self._column("Extract Name")

    def labeled_extract(self):
        """ Return the Labeled Extract subsection
        """
        return self._subsection("Labeled Extract Name")

    def labeled_extract_name(self):
        """ Return the Labeled Extract Name column
        """
        return self._column("Labeled Extract Name")

    def hybridization(self):
        """ Return the Hybridization subsection.
        """
        return self._subsection("Hybridization Name")

    def hybridization_name(self):
        """ Return the Hybridization Name column.
        """
        return self._column("Hybridization Name")

    def assay(self):
        """ Return the Assay subsection
        """
        return self._subsection("Assay Name")

    def assay_name(self):
        """ Return the Assay Name column
        """
        return self._column("Assay Name")

    def scan(self):
        """ Return the Scan subsection
        """
        return self._subsection("Scan Name")

    def scan_name(self):
        """ Return the Scan Name column
        """
        return self._column("Scan Name")

    def normalization(self):
        """ Return the Normalization subsection.
        """
        return self._subsection("Normalization Name")

    def normalization_name(self):
        """ Return the Normalization Name column.
        """
        return self._column("Normalization Name")

    def array_data(self):
        """ Return the Array Data subsection
        """
        return self._subsection("Array Data File")

    def array_data_file(self):
        """ Return the Array Data File column
        """
        return self._column("Array Data File")

    def derived_array_data(self):
        """ Return the Derived Array Data subsection
        """
        return self._subsection("Derived Array Data File")

    def derived_array_data_file(self):
        """ Return the Derived Array Data File column
        """
        return self._column("Derived Array Data File")

    def array_data_matrix(self):
        """ Return the Array Data Matrix subsection.
        """
        return self._subsection("Array Data Matrix File")

    def array_data_matrix_file(self):
        """ Return the Array Data Matrix File column.
        """
        return self._column("Array Data Matrix File")

    def derived_array_data_matrix(self):
        """ Return the Derived Array Data Matrix subsection.
        """
        return self._subsection("Derived Array Data Matrix File")

    def derived_array_data_matrix_file(self):
        """ Return the Derived Array Data Matrix File column.
        """
        return self._column("Derived Array Data Matrix File")

    def image(self):
        """ Return the Image subsection
        """
        return self._subsection("Image File")

    def image_file(self):
        """ Return the Image File column.
        """
        return self._column("Image File")
759       
class ArrayDesign(object):
    """ Array design (holds the result of parsing the .adf file).
    """
    def __init__(self, adf_file=None):
        # Keep whatever parse_adf returns for the file
        self._adf = parse_adf(adf_file)
766   
767def _is_float(str):
768    try:
769        float(str)
770        return True
771    except ValueError:
772        return False
773   
def _is_continuous(items, check_count=100):
    """ Are the strings in items continuous numbers.

    Only the first `check_count` items are inspected. Return True if
    at least half of the inspected items can be converted to a float
    (this is also True if there are no items to inspect).

    """
    from itertools import islice
    checked = 0
    numeric = 0
    for item in islice(items, check_count):
        checked += 1
        if _is_float(item):
            numeric += 1
    # Compare against the number of inspected items, not the last index
    return numeric >= checked * 0.5
785   
def mage_tab_to_orange(idf_filename):
    """ Convert an MAGE-TAB annotated experiment into an Orange.data.Table
    instance (assumes all the associated MAGE-TAB files are in the same
    directory).

    :param idf_filename: File name of the experiment's idf file. The sdrf
        file and the derived array data matrix file it refers to are read
        from the same directory.

    Each column of the data matrix becomes a feature (continuous if
    `_is_continuous` says so, discrete otherwise) with the column
    quantification stored in its `attributes` dict. The row names are
    stored in a string meta attribute.

    .. todo:: Add Characteristics, Factor Values ... to the feature.attributes dict

    """
    import Orange
    import numpy

    dirname = os.path.dirname(idf_filename)
    idf = InvesigationDesign(idf_filename)

    sdrf_filename = os.path.join(dirname, idf.sdrf_file[0])
    sdrf = SampleDataRelationship(sdrf_filename)

    data_matrices = set(sdrf.derived_array_data_matrix_file())
    assert(len(data_matrices) == 1) # How to handle multiple array designs.
    # Close the data matrix file as soon as it is parsed
    with open(os.path.join(dirname, data_matrices.pop()), "rb") as data_matrix:
        header, quantification, rows, matrix = parse_data_matrix(data_matrix)
    header_ref, header = header
    row_ref, rows = rows

    matrix = numpy.array(matrix, dtype=str)

    features = []
    for header_name, quant, column in zip(header, quantification, matrix.T):
        if _is_continuous(column):
            feature = Orange.data.variable.Continuous(header_name)
        else:
            values = set(column)
            feature = Orange.data.variable.Discrete(header_name, values=sorted(values))
        feature.attributes["quantification"] = quant
        features.append(feature)

    row_ref_feature = Orange.data.variable.String(row_ref)
    domain = Orange.data.Domain(features, None)
    domain.addmeta(Orange.data.new_meta_id(), row_ref_feature)

    # "NA" cells are replaced with "?" before the table is built
    matrix[numpy.where(matrix == "NA")] = "?"

    table = Orange.data.Table(domain, [list(row) for row in matrix])

    for instance, row in zip(table, rows):
        instance[row_ref_feature] = row

    return table
834   
835def _dictify(element):
836    """ Dictify and xml.etree.Element.Element instance.
837    """
838    if element is None:
839        element = []
840    dict = {}
841    strip = lambda s: s.strip() if s else s
842    for node in element:
843        dict[node.tag] = strip(getattr(node, "text", None))
844    return dict
845   
846   
class ArrayExpressExperiment(object):
    """ A convenience class representing an Array Express Experiment.

    The experiment description and the list of its files are retrieved
    with `query_experiments` and `query_files` when the instance is
    created; the parsed values are stored as instance attributes.

    :param accession: Accession of the experiment (e.g. "E-MEXP-2917").
    :param connection: Passed unchanged to `query_experiments` and
        `query_files`.

    Example ::

        >>> ae = ArrayExpressExperiment("E-MEXP-2917")
        >>> print ae.name
        Characterization of Data Variation in Gene Expression Profiling of Human Peripheral Blood Samples

        >>> for file in ae.files:
        ...     print file["name"], file["url"]
        E-MEXP-2917.biosamples.svg http://www.ebi.ac.uk/arrayexpress/files/E-MEXP-2917/E-MEXP-2917.biosamples.svg
        ...

    """

    def __init__(self, accession, connection=None):
        self.accession = accession
        self.connection = connection
        self._etree = tree = query_experiments(accession=accession, connection=self.connection, format="xml")
        # More than one experiment can be listed in the query result,
        # keep only the exact accession match.
        self._experiment = experiment = self._find_experiment(tree, accession)

        self.species = [e.text for e in experiment.findall("species")]
        bool_values = {"true": True, "false": False}
        self.rawdatafiles = bool_values[experiment.find("rawdatafiles").get("available", "false")]
        self.fgemdatafiles = bool_values[experiment.find("fgemdatafiles").get("available", "false")]

        # List of (category, [values]) tuples.
        self.sampleattributes = []
        for sa in experiment.findall("sampleattribute"):
            category = sa.find("category").text.strip()
            values = [val.text for val in sa.findall("value")]
            self.sampleattributes.append((category, values))

        # List of (name, [values]) tuples.
        # NOTE(review): sample attributes above read "value" children while
        # this reads "values" - confirm the tag name against the service's
        # xml output.
        self.experimentalfactors = []
        for ef in experiment.findall("experimentalfactor"):
            name = ef.find("name").text.strip()
            values = [val.text.strip() for val in ef.findall("values")]
            self.experimentalfactors.append((name, values))

        self.miamescores = _dictify(experiment.find("miamescores"))

        self.id = experiment.find("id").text
        # Optional elements, None if not present.
        self.secondaryaccession = getattr(experiment.find("secondaryaccession"), "text", None)
        self.name = experiment.find("name").text
        self.experimenttype = experiment.find("experimenttype").text.strip()
        self.releasedate = experiment.find("releasedate").text
        self.lastupdatedate = getattr(experiment.find("lastupdatedate"), "text", None)
        self.samples = int(experiment.find("samples").text)
        self.assays = int(experiment.find("assays").text)

        self.arraydesign = [_dictify(e) for e in experiment.findall("arraydesign")]
        self.bioassaydatagroups = [_dictify(group) for group in experiment.findall("bioassaydatagroup")]
        self.bibliography = [_dictify(e) for e in experiment.findall("bibliography")]
        self.provider = [_dictify(e) for e in experiment.findall("provider")]
        self.experimentdesign = [expd.text for expd in experiment.findall("experimentdesign")]
        self.description = [_dictify(e) for e in experiment.findall("description")]

        # The file listing comes from a separate query.
        tree = query_files(accession=self.accession, format="xml", connection=self.connection)
        experiment = self._find_experiment(tree, accession)
        self.files = [_dictify(file) for file in experiment.findall("file")]

    @staticmethod
    def _find_experiment(tree, accession):
        """ Return the "experiment" element of `tree` whose accession
        matches `accession` exactly.

        :raises IndexError: If no experiment in `tree` matches.

        """
        matches = [e for e in tree.findall("experiment")
                   if e.find("accession").text.strip() == accession]
        if not matches:
            raise IndexError("No experiment with accession '{0}' in the query result.".format(accession))
        return matches[0]

    @staticmethod
    def _extract(local_filename, repo_dir):
        """ Unpack the downloaded `local_filename` into `repo_dir`.

        Tar archives (.tgz, .tar.gz) and zip archives are extracted, a
        plain .gz file is decompressed to a file of the same name without
        the ".gz" suffix, .txt files are left as they are.

        :raises ValueError: If the file extension is not recognized.

        """
        import gzip
        import tarfile
        import zipfile

        basename = os.path.basename(local_filename)
        # NOTE: extractall trusts the member paths stored in the archive.
        if local_filename.endswith((".tgz", ".tar.gz")):
            # tarfile.open detects the compression, TarFile() does not.
            tfile = tarfile.open(local_filename)
            try:
                tfile.extractall(repo_dir)
            finally:
                tfile.close()
            return

        _, extension = os.path.splitext(local_filename)
        if extension == ".zip":
            zfile = zipfile.ZipFile(local_filename)
            try:
                zfile.extractall(repo_dir)
            finally:
                zfile.close()
        elif extension == ".gz":
            # A gzip file is a single compressed stream, not an archive.
            target = os.path.join(repo_dir, basename[:-len(".gz")])
            gzfile = gzip.open(local_filename, "rb")
            try:
                with open(target, "wb") as output:
                    shutil.copyfileobj(gzfile, output)
            finally:
                gzfile.close()
        elif extension == ".txt":
            pass
        else:
            raise ValueError("Unknown extension ('{0}').".format(basename))

    def _download_processed(self):
        """ Download the processed matrix file, and associated MAGE-TAB files (idf, sdrf, adf)
        """
        assert(self.fgemdatafiles)
        files = [f for f in self.files if f.get("kind") in ["idf", "sdrf"] and f.get("extension") == "txt"]
        files += [f for f in self.files if f.get("kind") == "fgem"]
        files += [f for f in self.files if f.get("kind") == "adf" and f.get("extension") == "txt"]

        for file in files:
            self._download_file(file["url"].strip(), extract=True)

    def _download_file(self, url, extract=True):
        """ Download the file at `url` from the ArrayExpress saving it to
        a local repository directory.

        :param url: Url of the file. The last two path components are used
            as the local directory and file name.
        :param extract: If True, unpack the downloaded file (see `_extract`).
        :raises ValueError: If `extract` is True and the file extension is
            not recognized (the file is downloaded nevertheless).

        """
        rest, basename = posixpath.split(url)
        dirname = posixpath.basename(rest)
        repo_dir = orngServerFiles.localpath("ArrayExpress", dirname)
        try:
            os.makedirs(repo_dir)
        except OSError:
            # Fine if the directory is already there, otherwise report.
            if not os.path.isdir(repo_dir):
                raise
        stream = urllib2.urlopen(url)
        local_filename = os.path.join(repo_dir, basename)
        try:
            with open(local_filename, "wb") as local_file:
                shutil.copyfileobj(stream, local_file)
        finally:
            stream.close()

        if extract:
            self._extract(local_filename, repo_dir)

    def _is_local(self, url):
        """ Is the `url` stored in the local repository.
        """
        return os.path.exists(self._local_filepath(url))

    def _local_filepath(self, url):
        """ Return the local file path for url.
        """
        rest, basename = posixpath.split(url)
        dirname = posixpath.basename(rest)
        return orngServerFiles.localpath("ArrayExpress", os.path.join(dirname, basename))

    def _open(self, url):
        """ Return an open file like handle to url (ArrayExpress file).
        The file is cached in the local repository for future access.
        The caller is responsible for closing the returned handle.

        """
        if not self._is_local(url):
            self._download_file(url, extract=True)
        file = self._local_filepath(url)
        return open(file, "rb")

    def _search_files(self, kind=None, extension=None):
        """ Search files by `kind` and `extension`.

        None matches any value. Return a list of the matching entries of
        `self.files`, in their original order.

        """
        res = []
        for file in self.files:
            kind_match = kind == file.get("kind") or kind is None
            extension_match = extension == file.get("extension") or extension is None

            if kind_match and extension_match:
                res.append(file)
        return res

    def array_design(self):
        """ Return a list of `ArrayDesign` instances used in this experiment.
        """
        # `_open` downloads the file first if it is not cached locally.
        return [ArrayDesign(self._open(file.get("url")))
                for file in self._search_files("adf", "txt")]

    def investigation_design(self):
        """ Return an `InvestigationDesign` instance for this experiment

        :raises ValueError: If the experiment has no idf file.

        """
        files = self._search_files("idf", "txt")
        if not files:
            raise ValueError("The experiment '{0}' does not have an investigation design file".format(self.accession))
        file = files[0]
        # NOTE(review): the class name is spelled 'InvesigationDesign' here;
        # confirm it matches the class definition elsewhere in this module.
        return InvesigationDesign(self._open(file.get("url")))

    def sample_data_relationship(self):
        """ Return an `SampleDataRelationship` instance describing this experiment.

        :raises ValueError: If the experiment has no sdrf file.

        """
        files = self._search_files("sdrf", "txt")
        if not files:
            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
        file = files[0]
        return SampleDataRelationship(self._open(file.get("url")))

    def fgem_to_table(self):
        """ Return the result of `mage_tab_to_orange` called on the local
        path of this experiment's idf file.

        The processed data matrix file is downloaded first if necessary.

        :raises ValueError: If the experiment has no sdrf file.

        """
        assert(self.fgemdatafiles)
        repo_dir = orngServerFiles.localpath("ArrayExpress", self.accession)
        # Find the file listing the data matrix files (should be in sdrf but sometimes it is in 2column file only, why?)
        sdrf = self._search_files("sdrf", "txt")
        if not sdrf:
            raise ValueError("The experiment '{0}' does not have an sample and data relationship file".format(self.accession))
        sdrf = SampleDataRelationship(self._open(sdrf[0].get("url")))
        if "Derived Array Data Matrix File" not in sdrf.header:
            twocol = self._search_files("twocolumn", "txt")
            if twocol:
                sdrf = SampleDataRelationship(self._open(twocol[0].get("url")))
        matrix_file = self._search_files("fgem")[0]
        # Opened only to make sure the file is downloaded and unpacked.
        self._open(matrix_file.get("url")).close()
        # NOTE(review): the listed matrix files are not used below.
        matrix_files = sorted(set(sdrf.derived_array_data_matrix_file()))

        return mage_tab_to_orange(os.path.join(repo_dir, self.accession + ".idf.txt"))
1083       
1084   
1085__doc__ += """\
1086Gene Expression Atlas
1087---------------------
1088
1089`Gene Expression Atlas <http://www.ebi.ac.uk/gxa/>`_ is a curated subset of
1090gene expression experiments in Array Express Archive.
1091
1092Use `query_atlas_simple` for simple queries.
1093
1094Example (query human genes for experiments in which they are up regulated) ::
1095
1096    >>> obiArrayExpress.query_atlas_simple(genes=["SORL1", "PSIP1", "CDKN1C"], regulation="up", organism="Homo sapiens")
1097    {u'...
1098   
1099Or use the `AtlasCondition` subclasses in this module to construct a more
1100advanced query and use the `query_atlas` function.
1101
1102Example (query human genes annotated to the GO term 'transporter activity'
1103that are up regulated in the liver in at least three experiments) ::
1104
1105    >>> go_cond = AtlasConditionGeneProperty("Goterm", "Is", "transporter activity")
1106    >>> liver_cond = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "liver")
1107    >>> org_cond = AtlasConditionOrganism("Homo sapiens")
1108    >>> cond_list = AtlasConditionList([go_cond, liver_cond, org_cond])
1109    >>> query_atlas(cond_list)
1110    {u'...
1111   
1112"""
1113
class GeneExpressionAtlasConenction(object):
    """ A connection to Gene Expression Atlas database.
    """
    DEFAULT_ADDRESS = "http://www.ebi.ac.uk:80/gxa/"

    def __init__(self, address=None, timeout=30):
        """ Initialize the connection.

        :param address: Address of the server (`DEFAULT_ADDRESS` if None).
        :param timeout: Socket timeout.

        """
        self.address = address if address is not None else self.DEFAULT_ADDRESS
        # NOTE(review): the timeout is stored but `query` does not pass it
        # to urlopen.
        self.timeout = timeout

    def _query_url(self, condition, format="json", start=None, rows=None, indent=False):
        """ Return the url of the REST query for `condition`.
        """
        url = self.address + "api?" + condition.rest()
        # Test against None so that start=0 (the first page) is not dropped.
        if start is not None and rows is not None:
            url += "&start={0}&rows={1}".format(start, rows)
        url += "&format={0}".format(format)
        if indent:
            url += "&indent"
        return url

    def query(self, condition, format="json", start=None, rows=None, indent=False):
        """ Query the server and return the open response stream.

        :param condition: An object with a `rest()` method returning the
            REST query string (e.g. an `AtlasCondition`).
        :param format: Requested result format.
        :param start: Index of the first result; only sent together
            with `rows`.
        :param rows: Number of results; only sent together with `start`.
        :param indent: If True add the "indent" flag to the query.

        """
        url = self._query_url(condition, format=format, start=start,
                              rows=rows, indent=indent)
        response = urllib2.urlopen(url)
        return response
1139   
# Names of all Gene Property filters (the `property` accepted by
# AtlasConditionGeneProperty).
GENE_FILTERS = [
    "Name",           # Gene name
    "Goterm",         # Gene Ontology Term
    "Interproterm",   # InterPro Term
    "Disease",        # Gene-Disease Association
    "Keyword",        # Gene Keyword
    "Protein",        # Protein

    "Dbxref",         # Other Database Cross-Refs
    "Embl",           # EMBL-Bank ID
    "Ensfamily",      # Ensembl Family
    "Ensgene",        # Ensembl Gene ID

    "Ensprotein",     # Ensembl Protein ID
    "Enstranscript",  # Ensembl Transcript ID
    "Goid",           # Gene Ontology ID
    "Image",          # IMAGE ID
    "Interproid",     # InterPro ID
    "Locuslink",      # Entrez Gene ID

    "Omimid",         # OMIM ID
    "Orf",            # ORF
    "Refseq",         # RefSeq ID
    "Unigene",        # UniGene ID
    "Uniprot",        # UniProt Accession

    "Hmdb",           # HMDB ID
    "Chebi",          # ChEBI ID
    "Cas",            # CAS
    "Uniprotmetenz",  # Uniprotmetenz
    "Gene",           # Gene Name or Identifier
    "Synonym",        # Gene Synonym
]
1174   
# Qualifiers accepted by a Gene Property filter
GENE_FILTER_QUALIFIERS = ["Is", "IsNot"]
1180
# Organism names accepted by AtlasConditionOrganism
ATLAS_ORGANISMS = [
    "Anopheles gambiae",
    "Arabidopsis thaliana",
    "Bos taurus",
    "Caenorhabditis elegans",
    "Danio rerio",
    "Drosophila melanogaster",
    "Epstein barr virus",
    "Gallus gallus",
    "Homo sapiens",
    "Human cytomegalovirus",
    "Kaposi sarcoma-associated herpesvirus",
    "Mus musculus",
    "Rattus norvegicus",
    "Saccharomyces cerevisiae",
    "Schizosaccharomyces pombe",
    "Unknown",
    "Xenopus laevis",
]
1201   
def ef_ontology():
    """ Return the `EF <http://www.ebi.ac.uk/efo/>`_ (Experimental Factor) ontology

    The "efo.obo" file is obtained with
    `orngServerFiles.localpath_download` from the "ArrayExpress" domain
    and loaded into an `obiOntology.OBOOntology`.

    """
    import obiOntology
    import orngServerFiles
    # Should this be in the OBOFoundry (Ontology) domain
    filename = orngServerFiles.localpath_download("ArrayExpress", "efo.obo")
    return obiOntology.OBOOntology(open(filename, "rb"))
1211
1212
class AtlasCondition(object):
    """ Common base of the Gene Expression Atlas query conditions.

    Subclasses are expected to override both methods; the versions
    defined here only raise NotImplementedError.

    """
    def validate(self):
        """ Check the condition (to be overridden in a subclass).
        """
        raise NotImplementedError()

    def rest(self):
        """ Return the REST query part of the condition (to be overridden
        in a subclass).
        """
        raise NotImplementedError()
1225   
1226   
class AtlasConditionList(list, AtlasCondition):
    """ A list of AtlasCondition instances, itself usable as a condition.
    """
    def validate(self):
        """ Validate every condition in the list.
        """
        for cond in self:
            cond.validate()

    def rest(self):
        """ Return the REST query parts of all the conditions joined
        with "&".
        """
        parts = [cond.rest() for cond in self]
        return "&".join(parts)
1236
class AtlasConditionGeneProperty(AtlasCondition):
    """ A gene filter condition for an atlas query.

    :param property: Property of the gene (one of `GENE_FILTERS`). If None
        or "" all properties will be searched.
    :param qualifier: Qualifier can be 'Is' or 'IsNot'
    :param value: The value to search for; either a string (spaces are
        replaced with "+") or a list of strings (joined with "+").

    The condition is validated on construction.

    Example ::

        >>> # Condition on a gene name
        >>> condition = AtlasConditionGeneProperty("Name", "Is", "AS3MT")
        >>> # Condition on genes from a GO Term
        >>> condition = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
        >>> # Condition on disease association
        >>> condition = AtlasConditionGeneProperty("Disease", "Is", "cancer")

    """
    def __init__(self, property, qualifier, value):
        if isinstance(value, basestring):
            encoded = value.replace(" ", "+")
        elif isinstance(value, list):
            encoded = "+".join(value)
        else:
            raise ValueError(value)

        self.property = property or ""
        self.qualifier = qualifier
        self.value = encoded

        self.validate()

    def validate(self):
        """ Assert that the property and the qualifier are known names
        (or empty strings).
        """
        known_properties = GENE_FILTERS + [""]
        known_qualifiers = GENE_FILTER_QUALIFIERS + [""]
        assert self.property in known_properties
        assert self.qualifier in known_qualifiers

    def rest(self):
        """ Return the REST query part for this condition.
        """
        template = "gene{property}{qualifier}={value}"
        return template.format(property=self.property,
                               qualifier=self.qualifier,
                               value=self.value)
1273       
1274       
class AtlasConditionExperimentalFactor(AtlasCondition):
    """ An atlas experimental factor filter condition.

    :param factor: EFO experimental factor
    :param regulation: "up", "down", "updown", "any" or "none"
    :param n: Minimum number of experiments with this condition
    :param value: Experimental factor value

    The condition is validated on construction.

    Example ::

        >>> # Any genes up regulated in at least 3 experiments involving cancer.
        >>> condition = AtlasConditionExperimentalFactor("", "up", 3, "cancer")
        >>> # Only genes which are up/down regulated in the heart in at least one experiment.
        >>> condition = AtlasConditionExperimentalFactor("Organism_part", "updown", 1, "heart")

    """
    # All the regulation values documented above. "any" must be accepted
    # because `query_atlas_simple` uses it when no regulation is given.
    REGULATIONS = ["up", "down", "updown", "any", "none"]

    def __init__(self, factor, regulation, n, value):
        self.factor = factor
        self.regulation = regulation
        self.n = n
        self.value = value
        self.validate()

    def validate(self):
        """ Assert that the regulation is one of `REGULATIONS`.
        """
        # TODO: validate the factor and value
        assert(self.regulation in self.REGULATIONS)

    def rest(self):
        """ Return the REST query part for this condition.
        """
        return "{regulation}{n}In{factor}={value}".format(**self.__dict__)
1305       
class AtlasConditionOrganism(AtlasCondition):
    """ A condition restricting the query to one organism.

    :param organism: Name of the organism (one of `ATLAS_ORGANISMS`).

    The condition is validated on construction.

    """
    def __init__(self, organism):
        self.organism = organism
        self.validate()

    def validate(self):
        """ Assert that the organism is listed in `ATLAS_ORGANISMS`.
        """
        assert self.organism in ATLAS_ORGANISMS

    def rest(self):
        """ Return the REST query part for this condition (organism name
        with spaces replaced by "+", in lower case).
        """
        species = self.organism.replace(" ", "+").lower()
        return "species={0}".format(species)
1318       
1319   
def query_atlas_simple(genes=None, regulation=None, organism=None,
                       condition=None, format="json", start=None,
                       rows=None):
    """ Run a simple Atlas query and return the parsed result.

    :param genes: A list of gene names to search for.
    :param regulation: Search for experiments in which `genes` are "up",
        "down", "updown" or "none" regulated. If None all experiments
        are searched.
    :param organism: Search experiments for organism. If None all experiments
        are searched.
    :param condition: An EFO factor value (e.g. "brain")
    :param format: Result format; "json" results are parsed with
        `parse_json`, any other format with `parse_xml`.
    :param start: Passed to `GeneExpressionAtlasConenction.query`.
    :param rows: Passed to `GeneExpressionAtlasConenction.query`.

    Example ::

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], organism="Mus musculus")
        {u'...

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", organism="Mus musculus")
        {u'...

        >>> query_atlas_simple(genes=['Pou5f1', 'Dppa3'], regulation="up", condition="liver", organism="Mus musculus")
        {u'...

    """
    query = AtlasConditionList()

    if genes:
        query.append(AtlasConditionGeneProperty("Gene", "Is", genes))

    if regulation or condition:
        # Fill in the half of the factor condition that was not given.
        if regulation is None:
            regulation = "any"
        if condition is None:
            condition = ""
        factor_condition = AtlasConditionExperimentalFactor("", regulation, 1, condition)
        query.append(factor_condition)

    if organism:
        query.append(AtlasConditionOrganism(organism))

    response = GeneExpressionAtlasConenction().query(
        query, format=format, start=start, rows=rows)

    parse = parse_json if format == "json" else parse_xml
    return parse(response)
1362
1363"""\
1364.. todo:: can this be implemented query_atlas(organism="...", Locuslink="...", Chebi="...", up3InCompound="..." downInEFO="...")
1365      Need a full list of accepted factors
1366"""
1367
def query_atlas(condition, format="json", start=None, rows=None, indent=False):
    """ Query Atlas with `condition` (an instance of AtlasCondition) and
    return the parsed result.

    "json" results are parsed with `parse_json`, any other `format` with
    `parse_xml`. The remaining arguments are passed on to
    `GeneExpressionAtlasConenction.query`.

    Example ::

        >>> condition1 = AtlasConditionGeneProperty("Goterm", "Is", "p53 binding")
        >>> condition2 = AtlasConditionExperimentalFactor("Organism_part", "up", 3, "heart")
        >>> condition = AtlasConditionList([condition1, condition2])
        >>> query_atlas(condition)
        {u'...

    """
    atlas = GeneExpressionAtlasConenction()
    response = atlas.query(condition, format=format, start=start,
                           rows=rows, indent=indent)
    parse = parse_json if format == "json" else parse_xml
    return parse(response)
1387
1388
def get_atlas_summary(genes, organism):
    """ Return a 3-tuple of dictionaries summarizing the atlas information
    about three experimental factors (in this order):

        - Organism Part (OP)
        - Disease State (DS)
        - Cell type (CT)

    Each dictionary contains query genes as keys. Values are dictionaries
    mapping factor values to a 2-tuple containing the count of up regulated
    and down regulated experiments.

    Example ::

        >>> get_atlas_summary(["RUNX1"], "Homo sapiens")
        ({u'RUNX1': ...

    """
    query = AtlasConditionList([
        AtlasConditionGeneProperty("Gene", "Is", genes),
        AtlasConditionOrganism(organism),
    ])
    result = query_atlas(query, format="json")

    factors = ("organism_part", "disease_state", "cell_type")
    return tuple([collect_ef_summary(result, factor) for factor in factors])
1417   
def collect_ef_summary(info, ef):
    """ Summarize a `query_atlas` result `info` for the experimental
    factor `ef`.

    Return a dictionary mapping gene names to dictionaries that map the
    factor values to ``(upExperiments, downExperiments)`` tuples. Factor
    values for which both counts are zero are left out.

    """
    per_gene = defaultdict(dict)
    for result in info["results"]:
        gene = result["gene"]
        for expression in result["expressions"]:
            if expression["ef"] != ef:
                continue
            factor_value = expression["efv"]
            counts = (expression["upExperiments"],
                      expression["downExperiments"])
            if not any(counts):
                # Nothing regulated for this factor value.
                continue
            per_gene[gene["name"]][factor_value] = counts

    return dict(per_gene)
1438   
1439   
def test():
    """ Print the atlas summaries of two example queries, then run the
    doctests of this module (with an `ArrayExpressConnection` instance
    available to them as `conn`).
    """
    import doctest
    from pprint import pprint

    example_queries = [
        (['Pou5f1', 'Dppa3'], 'Mus musculus'),
        (['PDLIM5', 'FGFR2' ], 'Homo sapiens'),
    ]
    for genes, organism in example_queries:
        pprint(get_atlas_summary(genes, organism))

    conn = ArrayExpressConnection()
    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs={"conn": conn})


if __name__ == "__main__":
    test()
1453   
Note: See TracBrowser for help on using the repository browser.