| 1 | """\ |
|---|
| 2 | Protein-protein interactions |
|---|
| 3 | ============================ |
|---|
| 4 | |
|---|
| 5 | This is a Python module for accessing protein-protein interaction (PPI) data. |
|---|
| 6 | """ |
|---|
| 7 | |
|---|
| 8 | import os, sys |
|---|
| 9 | import xml.dom.minidom as minidom |
|---|
| 10 | import warnings |
|---|
| 11 | import collections |
|---|
| 12 | |
|---|
| 13 | import orngServerFiles |
|---|
| 14 | |
|---|
| 15 | from obiKEGG import downloader |
|---|
| 16 | |
|---|
| 17 | from collections import defaultdict |
|---|
| 18 | |
|---|
| 19 | import obiTaxonomy |
|---|
| 20 | from obiTaxonomy import pickled_cache |
|---|
| 21 | |
|---|
| 22 | from Orange.misc import lru_cache |
|---|
| 23 | |
|---|
| 24 | import sqlite3 |
|---|
| 25 | import urllib2 |
|---|
| 26 | import posixpath |
|---|
| 27 | import shutil |
|---|
| 28 | import gzip |
|---|
| 29 | |
|---|
class PPIDatabase(object):
    """ A general interface for protein-protein interaction database access.

    Concrete databases subclass this and override the query methods; the
    methods that are not overridden raise `NotImplementedError`.

    An example usage::

        >>> ppidb = MySuperPPIDatabase()
        >>> ppidb.organisms() # List all organisms (taxids)
        ['...

        >>> ppidb.ids() # List all protein ids
        ['...

        >>> ppidb.ids(taxid="9606") # List all human protein ids.
        ['...

        >>> ppidb.all_edges() # List all edges
        [('...

    """
    def __init__(self):
        pass

    def organisms(self):
        """ Return all organism taxids contained in this database.
        """
        raise NotImplementedError

    def ids(self, taxid=None):
        """ Return a list of all protein ids. If `taxid` is not None limit
        the results to ids from this organism only.

        """
        raise NotImplementedError

    def synonyms(self, id):
        """ Return a list of synonyms for primary `id`.
        """
        raise NotImplementedError

    def all_edges(self, taxid=None):
        """ Return a list of all edges. If `taxid` is not None return the
        edges for this organism only.

        """
        raise NotImplementedError

    def edges(self, id1, id2=None):
        """ Return a list of all edges (a list of 3-tuples (id1, id2, score)).
        """
        raise NotImplementedError

    def all_edges_annotated(self, taxid=None):
        """ Return a list of all edges annotated. If `taxid` is not None
        return the edges for this organism only.

        This default implementation concatenates the results of
        `edges_annotated` for every id returned by `ids(taxid)`.

        """
        res = []
        for protein_id in self.ids(taxid):
            res.extend(self.edges_annotated(protein_id))
        return res

    def edges_annotated(self, id=None):
        """ Return a list of all edges annotated.
        """
        raise NotImplementedError

    def search_id(self, name, taxid=None):
        """ Search the database for protein name. Return a list of matching
        primary ids. Use `taxid` to limit the results to a single organism.

        """
        raise NotImplementedError

    @classmethod
    def download_data(cls):
        """ Download the latest PPI data for local work.
        """
        raise NotImplementedError
|---|
| 106 | |
|---|
class BioGRID(PPIDatabase):
    """ Access `BioGRID <http://thebiogrid.org>`_ PPI data.

    The data is read from a local sqlite file (`SERVER_FILE`) holding two
    tables, `links` and `proteins`, whose columns are listed in `SCHEMA`.

    Example ::

        >>> biogrid = BioGRID()
        >>> print biogrid.organisms() # Print a list of all organism ncbi taxids in BioGRID
        [u'10090',...

        >>> print biogrid.ids(taxid="9606") # Print a list of all human protein ids
        [u'110004'

        >>> print biogrid.synonyms("110004") # Print a list of all synonyms for protein id '110004' as reported by BioGRID
        [u'3803', u'CU464060.2', u'CD158b', u'p58.2', u'CD158B1', u'NKAT6']

    """

    # (table name, column definitions) pairs; `init_db` creates the tables
    # directly from these definitions.
    SCHEMA = [("links", """\
        biogrid_interaction_id text,
        biogrid_id_interactor_a text,
        biogrid_id_interactor_b text,
        experimental_system text,
        experimental_system_type text,
        author text,
        pubmed_id text,
        throughput text,
        score real,
        modification text,
        phenotypes text,
        qualifications text,
        tags text,
        source_database text
        """),
            ("proteins", """\
        biogrid_id_interactor text,
        entrez_gene_interactor text,
        systematic_name_interactor text,
        official_symbol_interactor text,
        synonyms_interactor text,
        organism_interactor text
        """)]
    VERSION = "2.0"

    # All column names in the tab2 table.
    FIELDS = ['biogrid_interaction_id',
              'entrez_gene_interactor_a',
              'entrez_gene_interactor_b',
              'biogrid_id_interactor_a',
              'biogrid_id_interactor_b',
              'systematic_name_interactor_a',
              'systematic_name_interactor_b',
              'official_symbol_interactor_a',
              'official_symbol_interactor_b',
              'synonyms_interactor_a',
              'synonyms_interactor_b',
              'experimental_system',
              'experimental_system_type',
              'author',
              'pubmed_id',
              'organism_interactor_a',
              'organism_interactor_b',
              'throughput',
              'score',
              'modification',
              'phenotypes',
              'qualifications',
              'tags',
              'source_database'
              ]

    DOMAIN = "PPI"
    SERVER_FILE = "BIOGRID-ALL.sqlite"

    # `where` clause selecting the links with at least one interactor from a
    # given organism (takes the taxid twice). Sub-selects are used instead of
    # a join so that a link whose two interactors both belong to the organism
    # is returned once, not twice.
    _LINKS_OF_ORGANISM = """\
        where biogrid_id_interactor_a in
                  (select biogrid_id_interactor from proteins
                       where organism_interactor=?)
           or biogrid_id_interactor_b in
                  (select biogrid_id_interactor from proteins
                       where organism_interactor=?)
        """

    def __init__(self):
        self.filename = orngServerFiles.localpath_download(self.DOMAIN, self.SERVER_FILE)
        # TODO: check that the server file's version info matches VERSION.
        self.db = sqlite3.connect(self.filename)
        self.init_db_index()

    @lru_cache(1)
    def organisms(self):
        """ Return a list of all organism taxids contained in this database.
        """
        cur = self.db.execute("select distinct organism_interactor from proteins")
        # Unpack the single-column rows so that plain taxids are returned.
        return [row[0] for row in cur.fetchall()]

    @lru_cache(3)
    def ids(self, taxid=None):
        """ Return a list of all protein ids (biogrid_id_interactors).
        If `taxid` is not None limit the results to ids from this organism
        only.

        """
        if taxid is None:
            cur = self.db.execute("""\
                select biogrid_id_interactor
                from proteins""")
        else:
            cur = self.db.execute("""\
                select biogrid_id_interactor
                from proteins
                where organism_interactor=?""", (taxid,))

        return [row[0] for row in cur.fetchall()]

    def synonyms(self, id):
        """ Return a list of synonyms for primary `id`: the entrez gene id,
        the systematic name, the official symbol and every entry of the
        '|' separated synonyms column (missing values are left out).
        An empty list is returned if `id` is not in the database.

        """
        cur = self.db.execute("""\
            select entrez_gene_interactor,
                   systematic_name_interactor,
                   official_symbol_interactor,
                   synonyms_interactor
            from proteins
            where biogrid_id_interactor=?
            """, (id,))
        rec = cur.fetchone()
        if rec is None:
            return []
        names = list(rec[:-1])
        if rec[-1] is not None:
            names.extend(rec[-1].split("|"))
        return [name for name in names if name is not None]

    def all_edges(self, taxid=None):
        """ Return a list of all edges (3-tuples (id_a, id_b, score)).
        If `taxid` is not None return only the edges where at least one
        participant belongs to this organism.

        """
        query = """\
            select biogrid_id_interactor_a, biogrid_id_interactor_b, score
            from links
            """
        if taxid is not None:
            cur = self.db.execute(query + self._LINKS_OF_ORGANISM, (taxid, taxid))
        else:
            cur = self.db.execute(query)
        return cur.fetchall()

    def edges(self, id):
        """ Return a list of all interactions where id is a participant
        (a list of 3-tuples (id_a, id_b, score)).

        """
        cur = self.db.execute("""\
            select biogrid_id_interactor_a, biogrid_id_interactor_b, score
            from links
            where biogrid_id_interactor_a=? or biogrid_id_interactor_b=?
            """, (id, id))
        return cur.fetchall()

    def all_edges_annotated(self, taxid=None):
        """ Return a list of all edges annotated (complete rows of the
        `links` table). If `taxid` is not None return only the edges where
        at least one participant belongs to this organism.

        """
        query = """\
            select *
            from links
            """
        if taxid is not None:
            cur = self.db.execute(query + self._LINKS_OF_ORGANISM, (taxid, taxid))
        else:
            cur = self.db.execute(query)
        return cur.fetchall()

    def edges_annotated(self, id):
        """ Return a list of all annotated edges (complete rows of the
        `links` table) where `id` is a participant.

        """
        cur = self.db.execute("""\
            select *
            from links
            where biogrid_id_interactor_a=? or biogrid_id_interactor_b=?
            """, (id, id))
        return cur.fetchall()

    def search_id(self, name, taxid=None):
        """ Search the database for protein name. Return a list of matching
        primary ids. Use `taxid` to limit the results to a single organism.

        `name` is compared with the primary id, the entrez gene id, the
        systematic name, the official symbol and the individual entries of
        the '|' separated synonyms column.

        """
        # Wrapping both the column and the name in '|' makes `instr` match
        # whole synonym entries only (a NULL synonyms column never matches).
        query = """\
            select biogrid_id_interactor
            from proteins
            where (biogrid_id_interactor=? or
                   entrez_gene_interactor=? or
                   systematic_name_interactor=? or
                   official_symbol_interactor=? or
                   instr('|' || synonyms_interactor || '|', ?) > 0)
            """
        params = [name] * 4 + ["|%s|" % name]
        if taxid is not None:
            query += " and organism_interactor=?"
            params.append(taxid)
        cur = self.db.execute(query, params)
        return [row[0] for row in cur.fetchall()]

    @classmethod
    def download_data(cls, address):
        """ Download a BioGRID release and build the local sqlite database
        from it. Pass the address of the latest release (the tab2 format,
        zip compressed).

        """
        import urllib2
        import shutil
        import zipfile
        from StringIO import StringIO

        stream = urllib2.urlopen(address)
        try:
            # ZipFile needs a seekable file, so buffer the download in memory.
            archive = zipfile.ZipFile(StringIO(stream.read()))
        finally:
            stream.close()

        member = archive.namelist()[0]
        ppi_dir = orngServerFiles.localpath("PPI")
        archive.extract(member, ppi_dir)

        filepath = orngServerFiles.localpath("PPI", "BIOGRID-ALL.tab2")
        shutil.move(orngServerFiles.localpath("PPI", member), filepath)
        cls.init_db(filepath)
        # The tab2 file is only needed for building the database.
        os.remove(filepath)

    @classmethod
    def init_db(cls, filepath):
        """ Initialize the sqlite data base from a BIOGRID-ALL.*tab2.txt file
        format. The database file (`SERVER_FILE`) is written into the
        directory of `filepath`; existing `links` and `proteins` tables are
        replaced.

        """
        dirname = os.path.dirname(filepath)

        # Positions (in a tab2 row, see FIELDS) of the values that go in the
        # links table and in the proteins table (for interactor a and b).
        link_indices = [0, 3, 4, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 23]
        interactor_a_indices = [3, 1, 5, 7, 9, 15]
        interactor_b_indices = [4, 2, 6, 8, 10, 16]

        proteins = {}

        def nulls(values):
            # "-" marks a missing value in the tab2 format.
            return [val if val != "-" else None for val in values]

        def processlinks(lines):
            # Yield the rows for the links table and collect the interactors
            # (keyed by their biogrid id) on the way.
            for line in lines:
                line = line.rstrip("\r\n")
                if not line:
                    continue
                fields = nulls(line.split("\t"))
                for indices in (interactor_a_indices, interactor_b_indices):
                    interactor = [fields[i] for i in indices]
                    proteins[interactor[0]] = interactor
                yield [fields[i] for i in link_indices]

        con = sqlite3.connect(os.path.join(dirname, cls.SERVER_FILE))
        try:
            for table, columns in cls.SCHEMA:
                con.execute("drop table if exists %s" % table)  # Drop old table
                con.execute("create table %s (\n%s)" % (table, columns))

            tabfile = open(filepath, "rb")
            try:
                next(tabfile, None)  # skip the header line
                con.executemany("""\
                    insert into links values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, processlinks(tabfile))
            finally:
                tabfile.close()

            # `proteins` is complete only after all the links were processed.
            con.executemany("""\
                insert into proteins values (?, ?, ?, ?, ?, ?)
                """, proteins.values())
            con.commit()
        finally:
            con.close()

    def init_db_index(self):
        """ Will create indexes (if not already present) in the database
        for faster searching by primary ids.

        """
        self.db.execute("""\
            create index if not exists index_on_biogrid_id_interactor_a
                on links (biogrid_id_interactor_a)
            """)
        self.db.execute("""\
            create index if not exists index_on_biogrid_id_interactor_b
                on links (biogrid_id_interactor_b)
            """)
        self.db.execute("""\
            create index if not exists index_on_biogrid_id_interactor
                on proteins (biogrid_id_interactor)
            """)
|---|
| 417 | |
|---|
| 418 | from collections import namedtuple |
|---|
| 419 | from functools import partial |
|---|
| 420 | |
|---|
# An annotated STRING edge. The typename must match the module-level name so
# that repr() is accurate and instances can be pickled (pickle looks the class
# up by its name).
STRINGInteraction = namedtuple("STRINGInteraction",
                               ["protein_id1", "protein_id2", "combined_score",
                                "mode", "action", "score"])
|---|
| 424 | |
|---|
class STRING(PPIDatabase):
    """ Access `STRING <http://www.string-db.org/>`_ PPI database.

    Database schema
    ---------------
    table `links`:
        - `protein_id1`: id (text)
        - `protein_id2`: id (text)
        - `score`: combined score (int)

    table `actions`:
        - `protein_id1`: id (text)
        - `protein_id2`: id (text)
        - `mode`: mode (text)
        - `action`: action type (text)
        - `score`: action score (int)

    table `proteins`:
        - `protein_id`: protein id in STRING (text) (in the form of {taxid}.{name})
        - `taxid`: organism taxid (text)

    table `aliases`:
        - `protein_id`: id (text)
        - `alias`: protein alias (text)

    """
    DOMAIN = "PPI"
    FILENAME = "string-protein.sqlite"
    VERSION = "1.0"

    # Mapping from obiTaxonomy.common_taxids() to taxids in STRING.
    # A value of None means the taxid is left out when the database is built.
    TAXID_MAP = {"352472": "44689",  # Dictyostelium discoideum
                 "562": None,
                 "2104": "272634",  # Mycoplasma pneumoniae M129
                 "4530": "39947",  # Oryza sativa Japonica Group
                 "4754": None,
                 "8355": None,
                 "4577": None
                 }

    # (index name, indexed table and column) pairs created for faster lookup.
    _INDEXES = [("index_link_protein_id1", "links (protein_id1)"),
                ("index_action_protein_id1", "actions (protein_id1)"),
                ("index_proteins_id", "proteins (protein_id)"),
                ("index_taxids", "proteins (taxid)"),
                ("index_aliases_id", "aliases (protein_id)"),
                ("index_aliases_alias", "aliases (alias)")]

    def __init__(self):
        self.filename = orngServerFiles.localpath_download(self.DOMAIN, self.FILENAME)
        self.db = sqlite3.connect(self.filename)

    def organisms(self):
        """ Return all organism taxids contained in this database.
        """
        cur = self.db.execute("select distinct taxid from proteins")
        return [row[0] for row in cur.fetchall()]

    def ids(self, taxid=None):
        """ Return a list of all protein ids. If `taxid` is not None limit
        the results to ids from this organism only.

        """
        if taxid is not None:
            cur = self.db.execute("""\
                select protein_id
                from proteins
                where taxid=?
                """, (taxid,))
        else:
            cur = self.db.execute("""\
                select protein_id
                from proteins
                """)
        return [row[0] for row in cur.fetchall()]

    def synonyms(self, id):
        """ Return a list of synonyms for primary `id` as reported by STRING
        (proteins.aliases.{version}.txt file)

        """
        cur = self.db.execute("""\
            select alias
            from aliases
            where protein_id=?
            """, (id,))
        return [row[0] for row in cur.fetchall()]

    def all_edges(self, taxid=None):
        """ Return a list of all edges (3-tuples
        (protein_id1, protein_id2, score)). If `taxid` is not None return the
        edges for this organism only.

        .. note:: This may take some time (and memory).

        """
        if taxid is not None:
            cur = self.db.execute("""\
                select links.protein_id1, links.protein_id2, score
                from links join proteins on
                    links.protein_id1=proteins.protein_id
                where taxid=?
                """, (taxid,))
        else:
            cur = self.db.execute("""\
                select protein_id1, protein_id2, score
                from links
                """)
        return cur.fetchall()

    def edges(self, id):
        """ Return a list of all edges (a list of 3-tuples (id1, id2, score))
        that have `id` as their first protein.

        """
        cur = self.db.execute("""\
            select protein_id1, protein_id2, score
            from links
            where protein_id1=?
            """, (id,))
        return cur.fetchall()

    def all_edges_annotated(self, taxid=None):
        """ Return a list of all edges annotated. If `taxid` is not None
        return the edges for this organism only.

        """
        res = []
        for protein_id in self.ids(taxid):
            res.extend(self.edges_annotated(protein_id))
        return res

    def edges_annotated(self, id):
        """ Return a list of all edges annotated (a list of
        `STRINGInteraction` tuples) that have `id` as their first protein.
        The `mode`, `action` and `score` fields are None for links without
        a matching row in the `actions` table.

        """
        # The column order must follow the STRINGInteraction field order
        # (protein_id1, protein_id2, combined_score, mode, action, score).
        cur = self.db.execute("""\
            select links.protein_id1, links.protein_id2, links.score,
                   actions.mode, actions.action, actions.score
            from links left join actions on
                links.protein_id1=actions.protein_id1 and
                links.protein_id2=actions.protein_id2
            where links.protein_id1=?
            """, (id,))
        return [STRINGInteraction(*row) for row in cur.fetchall()]

    @classmethod
    def download_data(cls, version, taxids=None):
        """ Download the PPI data for local work (this may take some time).
        Pass the version of the STRING release e.g. v8.3.
        The resulting sqlite database will only contain the protein
        interactions for `taxids` (if None obiTaxonomy.common_taxids() will
        be used).

        """
        from orngMisc import ConsoleProgressBar

        ppi_dir = orngServerFiles.localpath("PPI")

        def download(address, target_dir):
            # Download to a '.part' file first so that an interrupted
            # transfer never leaves a truncated file under the final name.
            basename = posixpath.basename(address)
            tmpfilename = os.path.join(target_dir, basename + ".part")
            stream = urllib2.urlopen(address)
            try:
                with open(tmpfilename, "wb") as tmpfile:
                    shutil.copyfileobj(stream, tmpfile)
            finally:
                stream.close()
            os.rename(tmpfilename, os.path.join(target_dir, basename))

        def extract(filename):
            # Decompress `filename`.gz into `filename`.
            compressed = gzip.GzipFile(filename + ".gz", "rb")
            try:
                with open(filename, "wb") as extracted:
                    shutil.copyfileobj(compressed, extracted)
            finally:
                compressed.close()

        base_url = "http://www.string-db.org/newstring_download/"  # protein.links.v9.0.txt.gz
        basenames = ["protein.{kind}.{version}.txt".format(kind=kind, version=version)
                     for kind in ("links", "actions", "aliases")]

        for basename in basenames:
            download(base_url + basename + ".gz", ppi_dir)

        links_filename, actions_filename, aliases_filename = \
            [os.path.join(ppi_dir, basename) for basename in basenames]

        progress = ConsoleProgressBar("Extracting files:")
        progress(1.0)
        extract(links_filename)
        progress(60.0)
        extract(actions_filename)
        progress(90.0)
        extract(aliases_filename)
        progress.finish()

        cls.init_db(version, taxids)

    @classmethod
    def init_db(cls, version, taxids=None):
        """ Initialize the sqlite3 data base. `version` must contain a
        STRING release version e.g 'v8.3'. If `taxids` is not `None` it
        must contain a list of tax-ids in the STRING database for which
        to extract the interactions for.

        The extracted protein.{links,actions,aliases}.{version}.txt files
        are read from the local `DOMAIN` directory.

        """
        import csv
        from orngMisc import ConsoleProgressBar

        data_dir = orngServerFiles.localpath(cls.DOMAIN)
        links_filename, actions_filename, aliases_filename = \
            [os.path.join(data_dir, "protein.{kind}.{version}.txt".format(kind=kind, version=version))
             for kind in ("links", "actions", "aliases")]

        if taxids:
            taxids = set(taxids)
        else:
            taxids = [cls.TAXID_MAP.get(id, id) for id in obiTaxonomy.common_taxids()]
            taxids = set(filter(None, taxids))

        def selected(protein_id1, protein_id2):
            # Protein ids have the form {taxid}.{name}; keep a row only if
            # both proteins belong to the selected organisms.
            return (protein_id1.split(".", 1)[0] in taxids and
                    protein_id2.split(".", 1)[0] in taxids)

        def read_links(links_file, progress, chunk_size=1000000):
            # Yield the selected links in chunks to bound the memory use.
            filesize = os.stat(links_filename).st_size
            links_file.readline()  # skip the header
            chunk = []
            rows = csv.reader(links_file, delimiter=" ")
            for i, (p1, p2, score) in enumerate(rows, 1):
                if selected(p1, p2):
                    chunk.append((intern(p1), intern(p2), int(score)))
                    if len(chunk) == chunk_size:
                        yield chunk
                        chunk = []
                if i % 1000 == 0:  # Update the progress every 1000 lines
                    progress(100.0 * links_file.tell() / filesize)
            if chunk:
                yield chunk

        def read_actions(actions_file, progress):
            # Return a sorted list of the selected action rows.
            filesize = os.stat(actions_filename).st_size
            actions_file.readline()  # skip the header
            actions = []
            rows = csv.reader(actions_file, delimiter="\t")
            for i, (p1, p2, mode, action, a_is_acting, score) in enumerate(rows, 1):
                if selected(p1, p2):
                    actions.append((intern(p1), intern(p2), mode, action, int(score)))
                if i % 1000 == 0:
                    progress(100.0 * actions_file.tell() / filesize)
            actions.sort()
            return actions

        def read_aliases(aliases_file, progress):
            # Yield (protein_id, alias) pairs of the selected organisms.
            filesize = os.stat(aliases_filename).st_size
            aliases_file.readline()  # skip the header
            rows = csv.reader(aliases_file, delimiter="\t")
            for i, (taxid, name, alias, source) in enumerate(rows, 1):
                if taxid in taxids:
                    yield ".".join([taxid, name]), alias.decode("utf-8", errors="ignore")
                if i % 1000 == 0:
                    progress(100.0 * aliases_file.tell() / filesize)

        opened = []
        try:
            # Open all the inputs first so that a missing file is reported
            # before the existing tables are dropped.
            for filename in (links_filename, actions_filename, aliases_filename):
                opened.append(open(filename, "rb"))
            links_file, actions_file, aliases_file = opened

            con = sqlite3.connect(orngServerFiles.localpath(cls.DOMAIN, cls.FILENAME))
            try:
                for table in ("links", "proteins", "actions", "aliases"):
                    con.execute("drop table if exists " + table)

                con.execute("create table links (protein_id1 text, protein_id2 text, score int)")
                con.execute("create table proteins (protein_id text, taxid text)")
                con.execute("create table actions (protein_id1 text, protein_id2 text, mode text, action text, score int)")
                con.execute("create table aliases (protein_id text, alias text)")

                progress = ConsoleProgressBar("Processing links file:")
                progress(0.0)
                for chunk in read_links(links_file, progress):
                    con.executemany("insert into links values (?, ?, ?)", chunk)
                con.commit()
                progress.finish()

                # The proteins table is derived from the inserted links.
                proteins = [row[0] for row in
                            con.execute("select distinct protein_id1 from links")]
                progress = ConsoleProgressBar("Processing proteins:")
                protein_taxids = []
                for i, prot in enumerate(proteins):
                    protein_taxids.append((prot, prot.split(".", 1)[0]))
                    if i % 1000 == 0:
                        progress(100.0 * i / len(proteins))
                protein_taxids.sort()
                con.executemany("insert into proteins values (?, ?)", protein_taxids)
                con.commit()
                progress.finish()

                progress = ConsoleProgressBar("Processing actions:")
                con.executemany("insert into actions values (?, ?, ?, ?, ?)",
                                read_actions(actions_file, progress))
                con.commit()
                progress.finish()

                progress = ConsoleProgressBar("Processing aliases:")
                con.executemany("insert into aliases values (?, ?)",
                                read_aliases(aliases_file, progress))

                print("Indexing the database")
                cls._create_indexes(con)
                con.commit()
                progress.finish()
            finally:
                con.close()
        finally:
            for fileobj in opened:
                fileobj.close()

    @classmethod
    def _create_indexes(cls, con):
        """ Create the indexes listed in `_INDEXES` (if not already present)
        using the sqlite3 connection `con`.

        """
        for name, target in cls._INDEXES:
            con.execute("create index if not exists %s on %s" % (name, target))

    def init_db_index(self):
        """ Will create indexes (if not already present) in the database
        for faster searching by primary ids.

        """
        self._create_indexes(self.db)
|---|
| 784 | |
|---|
| 785 | |
|---|
class Interaction(object):
    """ A record describing an interaction between two proteins.

    Attributes:
        - *protein1*, *protein2* - the two interacting proteins
        - *ref1*, *ref2* - references for each protein (default None)
        - *conf1*, *conf2* - confidence data for each protein (default None)
        - *org1*, *org2* - organisms of each protein; always initialized
          to None, to be filled in by the code that builds the record

    """
    def __init__(self, protein1, protein2, ref1=None, ref2=None, conf1=None, conf2=None):
        self.protein1 = protein1
        self.protein2 = protein2
        self.ref1 = ref1
        self.ref2 = ref2
        self.conf1 = conf1
        self.conf2 = conf2
        # Organisms are not constructor arguments.
        self.org1 = None
        self.org2 = None
|---|
| 792 | |
|---|
class MIPS(object):
    """ In-memory view of the MIPS protein-protein interaction data.

    The data is parsed from the "allppis.xml" file of the "PPI" server
    files domain (located with `orngServerFiles.localpath_download`).

    Attributes set by :obj:`load`:
        - *interactions* - a list of :obj:`Interaction` objects
        - *protein_interactions* - maps a protein name to the set of
          :obj:`Interaction` objects the protein takes part in
        - *protein_names* - maps a protein name to the "id" attribute
          of the first ``primaryRef`` element of that protein
        - *refs*, *confidance* - empty dicts (never filled in here)

    """
    VERSION = 1

    def __init__(self):
        self.load()

    def load(self):
        """ Parse "allppis.xml" and (re)build all lookup tables.
        """
        self.protein_names = defaultdict(set)
        self.refs = {}
        self.confidance = {}

        def process(element):
            """ Build an :obj:`Interaction` from one ``interaction``
            DOM element. Expects at least two ``proteinParticipant``
            children; only the first two are used.

            """
            proteins = []
            for protein in element.getElementsByTagName("proteinParticipant"):
                interactor = protein.getElementsByTagName("proteinInteractor")[0]

                # All "shortLabel" nodes first, then all "fullName" nodes.
                name_nodes = interactor.getElementsByTagName("shortLabel") + \
                             interactor.getElementsByTagName("fullName")
                names = []
                for name in name_nodes:
                    names.append((name.tagName, name.childNodes[0].data))

                refs = []
                for ref in interactor.getElementsByTagName("primaryRef"):
                    refs += [(ref.tagName, ref.attributes.items())]

                organism = interactor.getElementsByTagName("organism")[0]
                org = dict(organism.attributes.items()).get("ncbiTaxId")
                conf = protein.getElementsByTagName("confidence")[0].attributes.items()
                proteins.append((names, refs, conf, org))

            # The interaction is keyed by the text of the second collected
            # name of each participant (presumably its full name - verify
            # against the data file).
            interaction = Interaction(proteins[0][0][1][1], proteins[1][0][1][1])
            interaction.ref1, interaction.ref2 = proteins[0][1], proteins[1][1]
            interaction.conf1, interaction.conf2 = proteins[0][2], proteins[1][2]
            interaction.org1, interaction.org2 = proteins[0][3], proteins[1][3]

            # Remember the first collected name (a short label) as well.
            # NOTE(review): these sets are replaced by the primaryRef ids
            # at the end of `load`; only the dictionary keys survive.
            self.protein_names[interaction.protein1].add(proteins[0][0][0][1])
            self.protein_names[interaction.protein2].add(proteins[1][0][0][1])

            return interaction

        document = minidom.parse(orngServerFiles.localpath_download("PPI", "allppis.xml"))
        interactions = document.getElementsByTagName("interaction")
        self.interactions = [process(interaction) for interaction in interactions]

        self.protein_interactions = defaultdict(set)

        for inter in self.interactions:
            self.protein_names[inter.protein1] = dict(inter.ref1[0][1]).get("id")
            self.protein_names[inter.protein2] = dict(inter.ref2[0][1]).get("id")
            self.protein_interactions[inter.protein1].add(inter)
            self.protein_interactions[inter.protein2].add(inter)

    def __iter__(self):
        """ Iterate over all :obj:`Interaction` objects.
        """
        return iter(self.interactions)

    @classmethod
    def download(cls):
        """ Download the gzipped MIPS data file from the MIPS web site
        and store it as "mppi.gz" in the "PPI" server files domain.

        """
        import urllib2
        import shutil
        from contextlib import closing

        # NOTE(review): this stores "mppi.gz" while `load` reads
        # "allppis.xml"; nothing in this class converts one into the other.
        src = urllib2.urlopen("http://mips.helmholtz-muenchen.de/proj/ppi/data/mppi.gz")
        dest = orngServerFiles.localpath("PPI", "mppi.gz")
        # Close both the response and the output file even if the copy fails.
        with closing(src):
            with open(dest, "wb") as dest_file:
                shutil.copyfileobj(src, dest_file)

    @classmethod
    @pickled_cache(None, [("PPI", "allppis.xml")], version=1)
    def _get_instance(cls):
        return MIPS()

    @classmethod
    def get_instance(cls):
        """ Return the shared instance, creating it on first use.
        """
        if not hasattr(cls, "_instance"):
            cls._instance = cls._get_instance()
        return cls._instance
|---|
| 861 | |
|---|
def mips_interactions(protein = None):
    """ Return interactions from the shared :obj:`MIPS` instance.

    If `protein` is None return a list of all interactions. Otherwise
    return the set of interactions stored for `protein`, or None if
    there is no entry for it.

    """
    database = MIPS.get_instance()
    if protein is not None:
        return database.protein_interactions.get(protein)
    return list(database)
|---|
| 868 | |
|---|
def mips_proteins():
    """ Return the set of all protein names known to the shared
    :obj:`MIPS` instance (the keys of its `protein_names` mapping).

    """
    names = MIPS.get_instance().protein_names
    return set(names)
|---|
| 871 | |
|---|
class BioGRIDInteraction(object):
    """ An object representing a BioGRID interaction. Each member of this object
    holds the data from a single column of the BIOGRID-ALL.tab file.
    Attributes:
        - *interactor_a* - BioGRID identifier
        - *interactor_b* - BioGRID identifier
        - *official_symbol_a* - An official symbol for *interactor_a*
        - *official_symbol_b* - An official symbol for *interactor_b*
        - *aliases_for_a* - Aliases separated by '|'
        - *aliases_for_b* - Aliases separated by '|'
        - *experimental_system* - Experimental system (see BioGRID documentation on www.thebiogrid.org for a list of valid entries)
        - *source* - The eighth column of the line
        - *pubmed_id* - The ninth column of the line
        - *organism_a_id* - NCBI Taxonomy identifier for *interactor_a*'s organism
        - *organism_b_id* - NCBI Taxonomy identifier for *interactor_b*'s organism

    The values are stored as the strings found in the line. Columns
    missing from a short line leave the matching attributes unset and
    columns past the last attribute are ignored.

    """
    __slots__ = ["interactor_a", "interactor_b", "official_symbol_a","official_symbol_b", "aliases_for_a", "aliases_for_b", "experimental_system", "source", "pubmed_id", "organism_a_id", "organism_b_id"]
    def __init__(self, line):
        columns = line.split("\t")
        # Pair the columns with the slot names positionally.
        for position, value in enumerate(columns[:len(self.__slots__)]):
            setattr(self, self.__slots__[position], value)
|---|
| 891 | |
|---|
| 892 | class _BioGRID_Old(object): |
|---|
| 893 | """ A BioGRID database interface |
|---|
| 894 | Example:: |
|---|
| 895 | >>> ## finding all interactions for Homo sapiens sapiens |
|---|
| 896 | >>> grid = BioGRID(case_insensitive=True) |
|---|
| 897 | >>> proteins = proteins = biogrid.proteins() ## All proteins |
|---|
| 898 | >>> proteins = [p for p in proteins if any(["9606" in [int.organism_a_id, int.organism_b_id] for int in grid.get(p)])] |
|---|
| 899 | """ |
|---|
| 900 | VERSION = 1 |
|---|
| 901 | def __init__(self, case_insensitive=True): |
|---|
| 902 | # warnings.warn("obiPPi._BioGRID_Old class is deprecated. Use obiPPI.BioGRID") |
|---|
| 903 | self.case_insensitive = case_insensitive |
|---|
| 904 | self._case = (lambda name: name.lower()) if self.case_insensitive else (lambda name: name) |
|---|
| 905 | self.load() |
|---|
| 906 | |
|---|
| 907 | def load(self): |
|---|
| 908 | text = open(orngServerFiles.localpath_download("PPI", "BIOGRID-ALL.tab"), "rb").read() |
|---|
| 909 | text = text.split("SOURCE\tPUBMED_ID\tORGANISM_A_ID\tORGANISM_B_ID\n", 1)[-1] |
|---|
| 910 | self.interactions = [BioGRIDInteraction(line) for line in text.split("\n") if line.strip()] |
|---|
| 911 | |
|---|
| 912 | self.protein_interactions = defaultdict(set) |
|---|
| 913 | self.protein_names = {} |
|---|
| 914 | |
|---|
| 915 | case = self._case |
|---|
| 916 | |
|---|
| 917 | def update(keys, value, collection): |
|---|
| 918 | for k in keys: |
|---|
| 919 | collection.setdefault(k, set()).add(value) |
|---|
| 920 | |
|---|
| 921 | for inter in self.interactions: |
|---|
| 922 | update(map(case, [inter.official_symbol_a] + inter.aliases_for_a.split("|")), case(inter.interactor_a), self.protein_names) |
|---|
| 923 | update(map(case, [inter.official_symbol_b] + inter.aliases_for_b.split("|")), case(inter.interactor_b), self.protein_names) |
|---|
| 924 | |
|---|
| 925 | self.protein_interactions[case(inter.interactor_a)].add(inter) |
|---|
| 926 | self.protein_interactions[case(inter.interactor_b)].add(inter) |
|---|
| 927 | |
|---|
| 928 | self.protein_interactions = dict(self.protein_interactions) |
|---|
| 929 | |
|---|
| 930 | if case("N/A") in self.protein_names: |
|---|
| 931 | del self.protein_names[case("N/A")] |
|---|
| 932 | |
|---|
| 933 | def proteins(self): |
|---|
| 934 | """ Return all protein names in BioGRID (from INTERACTOR_A, and INTERACTOR_B columns) |
|---|
| 935 | """ |
|---|
| 936 | return self.protein_interactions.keys() |
|---|
| 937 | |
|---|
| 938 | def __iter__(self): |
|---|
| 939 | """ Iterate over all BioGRIDInteraction objects |
|---|
| 940 | """ |
|---|
| 941 | return iter(self.interactions) |
|---|
| 942 | |
|---|
| 943 | def __getitem__(self, key): |
|---|
| 944 | """ Return a list of protein interactions that a protein is a part of |
|---|
| 945 | """ |
|---|
| 946 | key = self._case(key) |
|---|
| 947 | # keys = self.protein_alias_matcher.match(key) |
|---|
| 948 | if key not in self.protein_interactions: |
|---|
| 949 | keys = self.protein_names.get(key, []) |
|---|
| 950 | else: |
|---|
| 951 | keys = [key] |
|---|
| 952 | if keys: |
|---|
| 953 | return list(reduce(set.union, [self.protein_interactions.get(k, []) for k in keys], set())) |
|---|
| 954 | else: |
|---|
| 955 | raise KeyError(key) |
|---|
| 956 | |
|---|
| 957 | def get(self, key, default=None): |
|---|
| 958 | """ Return a list of protein interactions that a protein is a part of |
|---|
| 959 | """ |
|---|
| 960 | key = self._case(key) |
|---|
| 961 | # keys = self.protein_alias_matcher.match(key) |
|---|
| 962 | if key not in self.protein_interactions: |
|---|
| 963 | keys = self.protein_names.get(keys, []) |
|---|
| 964 | else: |
|---|
| 965 | keys = [key] |
|---|
| 966 | if keys: |
|---|
| 967 | return list(reduce(set.union, [self.protein_interactions.get(k, []) for k in keys], set())) |
|---|
| 968 | else: |
|---|
| 969 | return default |
|---|
| 970 | |
|---|
| 971 | @classmethod |
|---|
| 972 | def get_instance(cls): |
|---|
| 973 | if getattr(cls, "_instance", None) is None: |
|---|
| 974 | cls._instance = _BioGRID_Old() |
|---|
| 975 | return cls._instance |
|---|
| 976 | |
|---|
def biogrid_interactions(name=None):
    """ Return a list of protein interactions (BioGRIDInteraction objects).

    If `name` is given (and not empty) return the interactions that the
    protein is a part of (an empty list if there are none); otherwise
    return the list of all interactions.

    """
    grid = _BioGRID_Old.get_instance()
    if not name:
        return grid.interactions
    return list(grid.get(name, set()))
|---|
| 984 | |
|---|
def biogrid_proteins():
    """ Return all protein names in BioGRID (taken from the INTERACTOR_A
    and INTERACTOR_B columns), as given by the shared `_BioGRID_Old`
    instance.

    """
    grid = _BioGRID_Old.get_instance()
    return grid.proteins()
|---|
| 989 | |
|---|
| 990 | |
|---|
| 991 | if __name__ == "__main__": |
|---|
| 992 | for protein in mips_proteins(): |
|---|
| 993 | print "Protein", protein, "interacts with", |
|---|
| 994 | print ",".join(set(reduce(list.__add__, [[inter.protein1, inter.protein2] for inter in mips_interactions(protein)], [])) -set([protein])) |
|---|
| 995 | |
|---|