Skip to content

Commit

Permalink
Human 1.16
Browse files Browse the repository at this point in the history
  • Loading branch information
haowang-bioinfo committed Jun 19, 2023
2 parents faa7e2f + fdd7b5c commit 753f178
Show file tree
Hide file tree
Showing 22 changed files with 21,820 additions and 1,601 deletions.
19 changes: 19 additions & 0 deletions .github/workflows/check-metabolictasks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
name: Test metabolic tasks

on: [push, pull_request]

jobs:
check-metabolictasks:
runs-on: self-hosted
timeout-minutes: 60
strategy:
matrix:
task-type: [essential, verification]
steps:

- name: Checkout
uses: actions/checkout@v3

- name: Check ${{ matrix.task-type }} metabolic tasks
run: |
/usr/local/bin/matlab -nodisplay -nosplash -nodesktop -r "addpath(genpath('.')); testMetabolicTasks('${{ matrix.task-type }}'); exit;"
2 changes: 1 addition & 1 deletion .github/workflows/yaml-conversion.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
steps:

- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3

- name: Run conversion script
run: |
Expand Down
107 changes: 107 additions & 0 deletions code/estimateEssentialGenes.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
function [eGenes, INIT_output] = estimateEssentialGenes(model, dataFile, taskStruct, useGeneSymbol)
% generate tINIT models and estimate essential genes
%
% Input:
%
% model reference human or animal model
%
% dataFile (opt, default Hart2015_RNAseq.txt)
%
% taskStruct metabolic task structure (opt, default is Essential tasks)
%
% useGeneSymbol use gene symbols as ids and in grRules (opt, default TRUE)
%
% Output:
%
% eGenes results structure with the following fields:
% taskList list of metabolic tasks that were tested
% tissues list of tissues (model IDs) corresponding to each model
% geneList cell array of the list of genes from each model
% essentialGenes cell array with one entry per model, where each
% entry is a logical matrix with rows corresponding
% to genes (in geneList) and columns to tasks (in
% taskList). Entries in the matrix are true when a
% gene is essential for a task, and false otherwise.
%
% INIT_output structure containing tINIT models (or information
% necessary to regenerate tINIT models) for which gene
% essentiality will be evaluated.
%
% Note: This function may take long computation time.
%


if nargin < 2
% load Hart et al. RNA-Seq cell line data
dataFile = 'Hart2015_RNAseq.txt';
end

if nargin < 3
taskStruct = parseTaskList('metabolicTasks_Essential.txt');
end

if nargin < 4
useGeneSymbol = true;
end

% replace gene IDs with gene symbols
if useGeneSymbol
idMapping = [model.genes, model.geneShortNames];
[grRules,genes,rxnGeneMat] = replaceGrRules(model.grRules,idMapping);
model.grRules = grRules;
model.genes = genes;
model.rxnGeneMat = rxnGeneMat;
end

% pre-process RNA-Seq data
disp('Step 1: preprocess and preliminary step')
tmp = readtable(dataFile);
arrayData.genes = tmp.gene;
arrayData.tissues = tmp.Properties.VariableNames(2:end)';
arrayData.levels = table2array(tmp(:,2:end));


% Run some preliminary steps
[~,deletedDeadEndRxns] = simplifyModel(model,true,false,true,true,true);
cModel = removeReactions(model,deletedDeadEndRxns,false,true);
[taskReport, essentialRxnMat] = checkTasks(cModel,[],true,false,true,taskStruct);


% add pre-processing results to arrayData structure
arrayData.deletedDeadEndRxns = deletedDeadEndRxns;
arrayData.taskReport = taskReport;
arrayData.essentialRxnMat = essentialRxnMat;
arrayData.threshold = 1;


% run tINIT
disp('Step 2: get tissue models')
model = addBoundaryMets(model);
params = {};
INIT_output = {};

for i = 1:length(arrayData.tissues)
disp(['Tissue ', num2str(i), ' out of ', num2str(length(arrayData.tissues)),': ', arrayData.tissues{i}])

% First try to run tINIT with shorter time limit. If it fails, then
% try again with a longer time limit.
try
params.TimeLimit = 1000;
init_model = getINITModel2(model,arrayData.tissues{i},[],[],arrayData,[],true,[],true,true,taskStruct,params);
catch
params.TimeLimit = 5000;
init_model = getINITModel2(model,arrayData.tissues{i},[],[],arrayData,[],true,[],true,true,taskStruct,params);
end

init_model.id = arrayData.tissues{i};
INIT_output.id{i,1} = init_model.id;
INIT_output.model{i,1} = init_model;

end

disp('Step 3: get essential genes')
% get essential genes for each model and task
eGenes = getTaskEssentialGenes(INIT_output, model, taskStruct);
eGenes.refModel = model;

end
153 changes: 153 additions & 0 deletions code/evaluateHart2015Essentiality.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
function results = evaluateHart2015Essentiality(eGenes)
% Evaluate and compare experimental fitness genes with essentiality results
% predicted from 5 cell-line specific GEMs from Hart 2015 datasets
%
% Input:
%
% eGenes results structure with the following fields:
% taskList list of metabolic tasks that were tested
% tissues list of tissues (model IDs) corresponding to each model
% geneList cell array of the list of genes from each model
% essentialGenes cell array with one entry per model, where each
% entry is a logical matrix with rows corresponding
% to genes (in geneList) and columns to tasks (in
% taskList). Entries in the matrix are true when a
% gene is essential for a task, and false otherwise.
%
% Output:
%
%
% results evaluation with values of TP, TN, FP, FN, accuracy,
% sensitivity, specificity, F1, MCC
%
% Example usage:
%
% results = evaluateEssentialityHart2015(eGenes)
%


%% load Hart2015 essentiality data

% Table S2 from the Hart2015 datasets:
% In this study, essential genes are, by definition, a subset of fitness genes
x = readtable('Hart2015_TableS2.xlsx');

% remove duplicated rows (MARCH1 and MARCH2 genes)
ind1 = find(ismember(x.Gene,'MARCH1'),1,'last');
ind2 = find(ismember(x.Gene,'MARCH2'),1,'last');
x([ind1;ind2],:) = [];

% extract gene list and cell types
genes = x.Gene;
celltypes = regexprep(upper(x.Properties.VariableNames(3:7)'),'BF_','');

% for each cell type, determine the "fitness" genes, defined as those with
% a 5% FDRs were chosen as thresholds (the values are extracted from the
% supporting information of the Hart 2015 paper. Genes observed in 3 or more
% of the 5 TKO screens (n=1,580) were considered as core fitness genes
bf_data = table2array(x(:,3:7));
bf_thresh = {'HCT116', 1.57
'HELA', 15.47
'GBM', 3.20
'RPE1', 6.84
'DLD1', 3.57};

% construct fitness matrix according to threshold values
fitness_mat = zeros(numel(genes),numel(celltypes));
for i = 1:numel(celltypes)
[~,ind] = ismember(celltypes{i},bf_thresh(:,1));
fitness_mat(:,i) = bf_data(:,i) > bf_thresh{ind,2};
end
fitness_mat(isnan(bf_data)) = NaN; % mark missing values differently than non-hits

% also get the genes that were essential in all 5 cell lines ("all")
celltypes(end+1) = {'all'};
fitness_mat(:,end+1) = all(fitness_mat == 1,2);

% put Hart2015 essentiality data into data structure for comparison
expdata = {};
expdata.genes = genes;
expdata.tissues = celltypes;
expdata.essential = fitness_mat;


%% Load and organize model-predicted gene essentiality results

% re-organize essentialGenes logical matrices into a uniform 3D matrix,
% where rows represent genes, columns are tasks, and the 3rd dimension is
% different cell lines.
eGenes.essentialMat = false(numel(eGenes.refModel.genes), numel(eGenes.taskList), numel(eGenes.tissues));
for i = 1:numel(eGenes.tissues)
[hasMatch,ind] = ismember(eGenes.refModel.genes,eGenes.geneList{i});
eGenes.essentialMat(hasMatch,:,i) = eGenes.essentialGenes{i}(ind(hasMatch),:);
end


%% compare model predictions to experimental essentiality data

% specify model-determined essential genes
modelPred = squeeze(any(eGenes.essentialMat,2)); % essential for any task

tissues = eGenes.tissues;
tissues(end+1) = {'all'}; % also include "all" category for Hart2015 dataset

% calculate true and false positives and negatives
[TP,TN,FP,FN,Penr] = deal(nan(numel(tissues),1)); % initialize variables
for i = 1:numel(tissues)

[~,tissue_ind] = ismember(tissues(i), expdata.tissues);
if tissue_ind == 0
continue % a few tissues are missing from the DepMap dataset
end

modelGenes = eGenes.refModel.genes;
if strcmpi(tissues{i},'all')
modelEssential = modelGenes(sum(modelPred,2) == size(modelPred,2));
else
modelEssential = modelGenes(modelPred(:,i));
end
modelNonEssential = setdiff(modelGenes,modelEssential);

expGenes = expdata.genes(~isnan(expdata.essential(:,tissue_ind)));
expEssential = expdata.genes(expdata.essential(:,tissue_ind) == 1);
expNonEssential = setdiff(expGenes,expEssential);

TP(i) = sum(ismember(modelEssential, expEssential)); % true positives
TN(i) = sum(ismember(modelNonEssential, expNonEssential)); % true negatives
FP(i) = sum(ismember(modelEssential, expNonEssential)); % false positives
FN(i) = sum(ismember(modelNonEssential, expEssential)); % false negatives

Penr(i) = EnrichmentTest(intersect(modelGenes,expGenes), intersect(modelEssential,expGenes), intersect(expEssential,modelGenes));
end

% calculate some metrics
sensitivity = TP./(TP + FN);
specificity = TN./(TN + FP);
accuracy = (TP + TN)./(TP + TN + FP + FN);
F1 = 2*TP./(2*TP + FP + FN);
MCC = ((TP.*TN) - (FP.*FN))./sqrt((TP+FP).*(TP+FN).*(TN+FP).*(TN+FN)); % Matthews correlation coefficient
PenrAdj = adjust_pvalues(Penr,'Benjamini');

% get results for cell types
results = [{'cellLine','TP','TN','FP','FN','accuracy','sensitivity','specificity','F1','MCC','Penr','logPenr','PenrAdj','logPenrAdj'};
[tissues, num2cell([TP, TN, FP, FN, accuracy, sensitivity, specificity, F1, MCC, Penr, -log10(Penr), PenrAdj, -log10(PenrAdj)])]];


end

function [penr,pdep] = EnrichmentTest(pop,sample,successes)
% Caculates p-value of enrichment of successes in a sample.
%
% Evaluates the significance of enrichment (and depletion) of SUCCESSES in
% a SAMPLE drawn from population POP using the hypergeometric test.

x = numel(intersect(successes,sample)); % calc # of successes in sample
m = numel(pop); % calc size of population
k = numel(intersect(successes,pop));
n = numel(sample);

penr = hygecdf(x-1,m,k,n,'upper');
pdep = hygecdf(x,m,k,n);

end

89 changes: 89 additions & 0 deletions code/getTaskEssentialGenes.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
function eGenes = getTaskEssentialGenes(INIT_output, refModel, taskStruct)
% Identify genes essential for different tasks in different tINIT models.
%
% Input:
%
% INIT_output structure containing tINIT models (or information
% necessary to regenerate tINIT models) for which gene
% essentiality will be evaluated.
%
% refModel reference model from which the tINIT models were
% generated.
%
% taskStruct metabolic task structure
%
%
% Output:
%
% eGenes results structure with the following fields:
% taskList list of metabolic tasks that were tested
% tissues list of tissues (model IDs) corresponding to each model
% geneList cell array of the list of genes from each model
% essentialGenes cell array with one entry per model, where each
% entry is a logical matrix with rows corresponding
% to genes (in geneList) and columns to tasks (in
% taskList). Entries in the matrix are true when a
% gene is essential for a task, and false otherwise.
%
%
% Usage:
%
% eGenes = getTaskEssentialGenes(INIT_output, refModel, taskStruct);
%
%
% Jonathan Robinson, 2019-10-26



if ~isfield(INIT_output, 'model')
% if INIT_output does not contain complete models, they first need to
% be regenerated
models = regen_tINIT_model(refModel, INIT_output);
else
models = INIT_output.model(:);
end

% ensure that model ID is set to the tissue names
for i = 1:numel(models)
if ~isfield(INIT_output,'tissues')
models{i}.id = INIT_output.id{i};
else
models{i}.id = INIT_output.tissues{i};
end
end

% replace Ensembl IDs with gene symbols
for i = 1:length(models)
if any(startsWith(models{i}.genes,'ENSG000'))
idMapping = [models{i}.genes, models{i}.geneShortNames];
[grRules,genes,rxnGeneMat] = replaceGrRules(models{i}.grRules,idMapping);
models{i}.grRules = grRules;
models{i}.genes = genes;
models{i}.rxnGeneMat = rxnGeneMat;
end
end

% iterate through the different models
parfor i = 1:length(models)

m = models{i};

% determine essential genes for each task
[~,essentialGenes] = checkTasksGenes(m,[],false,false,true,taskStruct);

% collect results
tissues{i,1} = m.id;
geneList{i,1} = m.genes;
allEssentials{i,1} = essentialGenes;

end

% gather results into eGenes structure
eGenes = {};
eGenes.taskList = {taskStruct(:).description}';
eGenes.tissues = tissues;
eGenes.geneList = geneList;
eGenes.essentialGenes = allEssentials;



Loading

0 comments on commit 753f178

Please sign in to comment.