Source code for cobra.topology.reporter_metabolites

# Based on Patil et al 2005 PNAS 102:2685-9
# TODO: Validate cobra.core compliance
from __future__ import print_function
from numpy import array, mean, std, where
from scipy.stats import norm, randint
from six import iteritems


def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False,
                                  background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.

    Ignore reactions that are solely spontaneous or orphan.  Allow the scores
    to have multiple columns / experiments; this will change the way the
    output is represented.

    cobra_model: A cobra.Model object

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict: A dictionary where the keys are reactions in
    cobra_model.reactions and the values are the scores.  Currently only a
    single numeric value is supported per reaction; this will be updated to
    allow for lists.

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores used to assess which metabolites are significant.

    scoring_metric: 'default' means divide by k**0.5, where k is the number
    of scored reactions connected to a metabolite.

    score_type: 'p' is the only option at the moment and indicates p-value.

    entire_network: Boolean.  Currently, scores are only compared against
    those calculated from the_reactions.

    background_correction: Boolean.  If True, apply background correction to
    the aggregate Z-score.

    ignore_external_boundary_reactions: Not yet implemented.  Boolean.  If
    True, do not count exchange reactions when calculating the score.

    """
    # TODO: Add a function to calculate scores based on correlation
    # coefficients and to deal with other multidimensional data.
    the_reactions = list(reaction_scores_dict.keys())
    the_scores = list(reaction_scores_dict.values())
    if score_type == 'p' and not hasattr(the_scores[0], '__iter__'):
        # Minimum and maximum p-values are used to prevent numerical
        # problems.  It is undecided whether an arbitrary min / max of 1e-15
        # is preferable to blunting the ends based on the values closest to
        # 0 or 1.
        the_scores = array(the_scores)
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        # Convert p-values to Z-scores; small p-values map to large positive
        # scores, e.g. -norm.ppf(0.05) is approximately 1.645.
        the_scores = -norm.ppf(the_scores)
        # Update the dictionary with the new scores.
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    elif hasattr(the_scores[0], '__iter__'):
        # In the case that the_scores is a list of lists, assume that each
        # list holds the scores for one reaction in the_reactions across all
        # experiments.  Then, for each metabolite, calculate the
        # invnorm(|Pearson correlation coefficient|) for each reaction pair
        # that it links.
        raise NotImplementedError("Lists of scores are not supported yet")

    # Get the connectivity for each metabolite.
    the_metabolites = set()
    for x in reaction_scores_dict:
        the_metabolites.update(x._metabolites)

    metabolite_scores = {}
    metabolite_connections = {}
    # Calculate the score for each metabolite.
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [x for x in the_metabolite._reaction
                                      if x.gene_reaction_rule.lower()
                                      not in ['s0001', '']]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                # Unscored reactions only count as connections when the
                # entire network is considered.
                if not entire_network:
                    number_of_connections -= 1
                continue
            tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections

    # NOTE: Doing the corrections based only on the significantly perturbed
    # scores is probably going to underestimate the significance.
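    # Background correction (below): for a metabolite linked to k scored
    # reactions, the raw aggregate is Z_raw = sum(Z_i) / k ** 0.5.  The
    # corrected score standardizes Z_raw against a null distribution built
    # by drawing k scores at random number_of_randomizations times:
    #     Z_corrected = (Z_raw - mu_k) / sigma_k
    # where mu_k and sigma_k are the mean and sample standard deviation of
    # the randomized aggregates (Patil et al 2005).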
    if background_correction:
        correction_dict = {}
        # Ensure array indexing works even when the p-value branch above did
        # not run.
        the_scores = array(the_scores)
        for i in set(metabolite_connections.values()):
            if i == 0:
                # Metabolites without scored connections are skipped below,
                # so no correction is needed (and i ** 0.5 would be zero).
                continue
            # TODO: If entire_network, add a section to deal with the
            # situation where the entire network structure is considered but
            # p-values are only available for a limited subset.
            #
            # For each connectivity i, select i scores
            # number_of_randomizations times and aggregate them to estimate
            # the null distribution of the aggregate score.
            the_random_indices = randint.rvs(
                0, len(the_scores), size=(number_of_randomizations, i))
            random_score_distribution = array(
                [sum(the_scores[x])
                 for x in list(the_random_indices)]) / i ** 0.5
            correction_dict[i] = [mean(random_score_distribution),
                                  std(random_score_distribution, ddof=1)]

    for the_metabolite, the_score in iteritems(metabolite_scores):
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            # Correct based on the background distribution.
            if background_correction:
                # NOTE: If the list of scores only covers significant
                # perturbations, then the background correction should not be
                # applied, because the current sampling method only draws
                # from the_scores, not the entire network.  It would be more
                # accurate to assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections ** 0.5) -
                             correction_dict[number_of_connections][0]) / \
                    correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections ** 0.5
            # Update the score.
            metabolite_scores[the_metabolite] = the_score

    return_dictionary = {'scores': metabolite_scores,
                         'connections': metabolite_connections}
    if background_correction:
        return_dictionary['corrections'] = correction_dict
    return return_dictionary
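

# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module).  It assumes a
# cobra.Model is already available as ``model`` and that per-reaction
# p-values have been computed elsewhere; the uniform 0.05 placeholder below
# is purely a stand-in for real experimental scores.
#
#     reaction_scores = {reaction: 0.05 for reaction in model.reactions}
#     result = identify_reporter_metabolites(
#         model, reaction_scores, number_of_randomizations=1000)
#     # Metabolites with the highest corrected Z-scores are the candidate
#     # reporter metabolites.
#     top_hits = sorted(result['scores'].items(),
#                       key=lambda item: item[1], reverse=True)[:10]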