From 2d0eeba648aac6c2305c3a7e8fa01e9d7453a445 Mon Sep 17 00:00:00 2001 From: shuang Date: Wed, 2 Oct 2024 15:45:22 +0000 Subject: [PATCH] update query prompt and func --- gptbioinsightor/celltype.py | 13 ++-- gptbioinsightor/prompt.py | 148 ++++++++++-------------------------- 2 files changed, 49 insertions(+), 112 deletions(-) diff --git a/gptbioinsightor/celltype.py b/gptbioinsightor/celltype.py index 6080dc9..d609f8a 100644 --- a/gptbioinsightor/celltype.py +++ b/gptbioinsightor/celltype.py @@ -12,8 +12,8 @@ from .prompt import * -def _query_celltype(genes, queryid, background, provider, model, base_url, sys_prompt): - text = CELLTYPE_PROMPT.format(setid=queryid, gene=",".join(genes), background=background) +def _query_celltype(queryid, gene_txt, cluster_num, background, provider, model, base_url, sys_prompt): + text = CELLTYPE_PROMPT.format(setid=queryid, gene=gene_txt, setnum=cluster_num,background=background) msg = [{"role": "user", "content": text}] response = query_model(msg, provider=provider, model=model, base_url=base_url, sys_prompt=sys_prompt) return response @@ -79,13 +79,14 @@ def get_celltype( if n_jobs is None: n_jobs = min(os.cpu_count()//2, len(gene_dic)) - def _aux_func(item): - return _query_celltype(item[1][:topnumber], item[0], background, provider, model, base_url, sys_prompt) + def _aux_func(args): + gene_txt = "\n".join([f"cluster {k}: {','.join(genes[:topnumber])}" for k,genes in args[1].items()]) + return _query_celltype(args[0], gene_txt, len(args[1]), background, provider, model, base_url, sys_prompt) celltype_ls = [] with ThreadPoolExecutor(max_workers=n_jobs) as executor: - results = executor.map(_aux_func, gene_dic.items()) - for (gsid, genes), res in zip(gene_dic.items(), results): + results = executor.map(_aux_func, [(k, gene_dic) for k in gene_dic.keys()]) + for gsid, res in zip(gene_dic.values(), results): res = res.strip("```").strip("'''").strip() print(res, file=handle) ctn = ul.get_celltype_name(res) diff --git 
a/gptbioinsightor/prompt.py b/gptbioinsightor/prompt.py index c37a764..46394fc 100644 --- a/gptbioinsightor/prompt.py +++ b/gptbioinsightor/prompt.py @@ -42,127 +42,63 @@ """ -LIKELY_CELLTYPE_PROMPT = """ -Geneset {setid}: -```gene list +CELLTYPE_PROMPT = """ +Input: +''' +Geneset: {gene} -``` - -Hi, GPTBioInsightor! Please analyze the above geneset and determine three most likely celltypes based on the following INSTRUCTIONS. - -INSTRUCTIONS: -0. Evaluate each gene individually, providing evidence and rationale for every potential cell type. -1. Prioritize cell-specific gene markers -2. Consider context-specific gene markers and samples/cells source within context (BACKGROUND) -3. Analyze the celltype context (BACKGROUND) to speculate on cell states, such as stress responses, invasiveness, proliferation rates, developmental stages, or other transient/dynamic properties. -4. Focus on positive evidence; avoid using the absence of markers as primary reasoning. -5. Evaluate the possibility of mixed cell populations or transitional states. -6. If applicable, note any unexpected gene combinations that might suggest novel cell states or types. +Context: +{background}. Here are {setnum} genesets for {setnum} different cell clusters up-regulated DEGs. +''' -BACKGROUND: -{background} - -Please format your output as follows, without any additional content: +Hi, GPTBioInsightor! Please analyze Input and predict the celltypes of geneset cluster {setid} based on the following INSTRUCTIONS. +INSTRUCTIONS: +0. Analyze each gene in cluster {setid}, check cell-specific and context-specific markers +1. prioritize single Gold Standard marker or gene marker combinations for celltype prediction +2. Consider the context of Input for celltype prediction; e.g. tissue, disease, etc. +3. Integrate Context of Input to speculate on some novel insights +4. Focus on positive evidence, avoid using marker absence as primary reasoning. +5. 
Exclude celltypes with clear negative markers in the geneset. +6. Consider that each cluster usually has a different celltype prediction; exclude celltypes represented by other clusters' gene markers. +7. Consider one Optimal celltype and two alternative celltypes. + +Output Format, without any additional prompt or string: ''' -## Geneset {setid}: +## cluster geneset {setid} ### Gene List ``` -[gene list] +[cluster {setid} gene list] ``` -### Potential Cell Types - -#### [CELLTYPE1] -**Gene Markers**: -- cell-specific: [CELL-SPECIFIC GENE MARKERS] -- context-specific: [CONTEXT-SPECIFIC GENE MARKERS] - -**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE] - -**Rationale**: [COMPREHENSIVE REASONING] - -**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND] - -#### [CELLTYPE2] -**Gene Markers**: -- cell-specific: [CELL-SPECIFIC GENE MARKERS] -- context-specific: [CONTEXT-SPECIFIC GENE MARKERS] - -**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE] - -**Rationale**: [COMPREHENSIVE REASONING] - -**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND] - -#### [CELLTYPE3] -**Gene Markers**: -- cell-specific: [CELL-SPECIFIC GENE MARKERS] -- context-specific: [CONTEXT-SPECIFIC GENE MARKERS] - -**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE] - -**Rationale**: [COMPREHENSIVE REASONING] - -**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND] - -### Additional Observations -[ANY NOTEWORTHY PATTERNS, UNUSUAL GENE COMBINATIONS, OR POTENTIAL NOVEL INSIGHTS] -''' -""" - - -FINAL_CELLTYPE_PROMPT = """ -Hi, GPTBioInsightor! Please determine the most likely cell type for each gene set from the provided potential cell types. Your analysis should be based on the following INSTRUCTIONS and context(BACKGROUND) information. - -INSTRUCTIONS: -1. Provide comprehensive evidence and reasoning for the most likely cell type of each gene set wtih context(BACKGROUND). -2. 
Prioritize cell-specific and context-specific gene markers in your analysis. -3. Fully integrate the BACKGROUND information into your analysis to determine the most logical cell type. -4. Speculate on the cell state, considering factors such as stress response, invasiveness, proliferation rate, developmental stage, or other transient/dynamic properties. -5. Do not use the absence of markers as primary evidence; focus on positive evidence. -6. Exclude cell types with clear negative markers present in the gene set. -7. Evaluate the possibility of mixed cell populations or transitional states if the gene set suggests this. -8. Note any unusual gene combinations or expression patterns that might indicate novel cell states or types. - -BACKGROUND: -{background}. Above are {geneset_num} genesets and their potential celltypes, geneseach geneset is highly expressed relative to other gene sets.. - -For the output you should follow this format: -''' -### [geneset id] : [ CELLTYPE NAME] - -**Gene Markers**: -- cell-specific: [CELL-SPECIFIC GENE MARKERS] -- context-specific: [CONTEXT-SPECIFIC GENE MARKERS] - -**Evidence and Reasoning**: -1. [MAIN EVIDENCE POINT, like cell-specific marker] -2. [SECONDARY EVIDENCE POINT, like context-specific marker ] -3. [ADDITIONAL EVIDENCE POINTS AS NEEDED] - -**Cell State/Subtype**: [SPECULATED CELL STATE OR SUBTYPE BASED ON BACKGROUND] -**Alternative Considerations**: [BRIEFLY MENTION OTHER CELL TYPES CONSIDERED AND WHY THEY WERE RULED OUT] - - -### [geneset id] : [ CELLTYPE NAME ] +### Celltype Prediction +#### Optimal Celltype: [OPTIMAL CELLTYPE NAME] +**Key Markers**: +- Cell-specific: [CELL-SPECIFIC MARKERS] +- Context-specific: [CONTEXT-SPECIFIC MARKERS] -**Gene Markers**: -- cell-specific: [CELL-SPECIFIC GENE MARKERS] -- context-specific: [CONTEXT-SPECIFIC GENE MARKERS] +**Evidence and Reasoning** +- [PRIMARY EVIDENCE] +- [SECONDARY EVIDENCE] +- [ADDITIONAL EVIDENCE AS NEEDED] -**Evidence and Reasoning**: -1. 
[MAIN EVIDENCE POINT, like cell-specific marker] -2. [SECONDARY EVIDENCE POINT, like context-specific marker ] -3. [ADDITIONAL EVIDENCE POINTS AS NEEDED] +**Validation**: [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE OPTIMAL CELLTYPE] -**Cell State/Subtype**: [SPECULATED CELL STATE OR SUBTYPE BASED ON BACKGROUND] -**Alternative Considerations**: [BRIEFLY MENTION OTHER CELL TYPES CONSIDERED AND WHY THEY WERE RULED OUT] +#### Alternative Considerations +- Alternative celltype1 + - [WHY Alternative? Key MARKERS, Evidence and Reasoning] + - [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE Alternative celltype1] -[REPEAT FOR EACH GENE SET] +- Alternative celltype2 + - [WHY Alternative? Key MARKERS, Evidence and Reasoning] + - [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE Alternative celltype2] +### Novel Insights +- [NOTEWORTHY PATTERNS] +- [CELL STATE] +- [POTENTIAL NEW FINDINGS] ''' """