Source code for cobra.topology.reporter_metabolites

# Based on Patil et al 2005 PNAS 102:2685-9
# TODO: Validate cobra.core compliance
from __future__ import print_function
from numpy import array, mean, std, where
from scipy.stats import norm, randint
from six import iteritems


[docs]def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False,
                                  background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Ignore reactions that are solely spontaneous or orphan. Allow the scores to
    have multiple columns / experiments.   This will change the way the output
    is represented.

    cobra_model: A cobra.Model object

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict:  A dictionary where the keys are reactions in
    cobra_model.reactions and the values are the scores.  Currently, only
    supports a single numeric value as the value; however, this will be updated
    to allow for lists

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores to assess which are significant.

    scoring_metric: default means divide by k**0.5

    score_type: 'p' Is the only option at the moment and indicates p-value.

    entire_network: Boolean. Currently, only compares scores calculated from
    the_reactions

    background_correction: Boolean.  If True apply background correction to the
    aggreagate Z-score

    ignore_external_boundary_reactions: Not yet implemented. Boolean.  If True
    do not count exchange reactions when calculating the score.
    """

    # Add in a function to calculate based on correlation coefficients and to
    # deal with other multidimensional data.
    the_reactions = reaction_scores_dict.keys()
    the_scores = reaction_scores_dict.values()
    if score_type == 'p' and not hasattr(the_scores[0], '__iter__'):
        # minimum and maximum p-values are used to prevent numerical problems.
        # haven't decided whether an arbitrary min / max 1e-15 is preferred to
        # blunting the ends based on the values closest to 0 or 1.
        the_reactions = reaction_scores_dict.keys()
        the_scores = array(reaction_scores_dict.values())
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        # update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    elif hasattr(the_scores[0], '__iter__'):
        # In the case that the_scores is a list of lists, assume that each list
        # is the score for each reaction in the_reactions across all reactions.
        # Then for each metabolite, calculate the invnorm(|Pearson Correlation
        # Coefficient| for each reaction pair that it links.
        raise Exception("This isn't implemented yet")

    # Get the connectivity for each metabolite
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    # Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [x for x in the_metabolite._reaction
                                      if x.gene_reaction_rule.lower() not in
                                      ['s0001', '']]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                if not entire_network:
                    number_of_connections -= 1
                continue
            else:
                tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    # NOTE: Doing the corrections based only on the significantly perturbed
    # scores is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        for i in set(metabolite_connections.values()):
            # if entire_network # add in a section to deal with the situation
            # where the entire network structure is considered by only have
            # p-values for a limited subset.
            #
            # Basically, what we're doing here is that for each i we select i
            # scores number_of_randomizations times
            the_random_indices = randint.rvs(
                0, len(the_scores), size=(number_of_randomizations, i))
            random_score_distribution = array(
                [sum(the_scores[x])
                 for x in list(the_random_indices)]) / i**0.5
            correction_dict[i] = [mean(random_score_distribution),
                                  std(random_score_distribution, ddof=1)]

    for the_metabolite, the_score in iteritems(metabolite_scores):
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            # Correct based on background distribution
            if background_correction:
                # if the list of scores is only for significant perturbations
                # then the background correction shouldn't be applied because
                # the current sampling method only takes into account
                # the_scores not the entire network.  It'd be more accurate to
                # assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                    correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            # Update the score
            metabolite_scores[the_metabolite] = the_score

    return_dictionary = {'scores': metabolite_scores,
                         'connections': metabolite_connections}
    if background_correction:
        return_dictionary['corrections'] = correction_dict

    return return_dictionary