Skip to content

Commit

Permalink
update query prompt and func
Browse files Browse the repository at this point in the history
  • Loading branch information
shuang committed Oct 2, 2024
1 parent d36c290 commit 2d0eeba
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 112 deletions.
13 changes: 7 additions & 6 deletions gptbioinsightor/celltype.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from .prompt import *


def _query_celltype(genes, queryid, background, provider, model, base_url, sys_prompt):
text = CELLTYPE_PROMPT.format(setid=queryid, gene=",".join(genes), background=background)
def _query_celltype(queryid, gene_txt, cluster_num, background, provider, model, base_url, sys_prompt):
    """Send the cell-type prompt for one cluster and return the model reply.

    ``CELLTYPE_PROMPT`` is filled with the cluster id (``setid``), the
    pre-rendered gene text (``gene``), the cluster count (``setnum``) and the
    background description (``background``). The rendered text is sent as a
    single user message through ``query_model``, and whatever that call
    returns is handed back unchanged.
    """
    prompt_fields = {
        "setid": queryid,
        "gene": gene_txt,
        "setnum": cluster_num,
        "background": background,
    }
    user_message = {"role": "user", "content": CELLTYPE_PROMPT.format(**prompt_fields)}
    return query_model(
        [user_message],
        provider=provider,
        model=model,
        base_url=base_url,
        sys_prompt=sys_prompt,
    )
Expand Down Expand Up @@ -79,13 +79,14 @@ def get_celltype(
if n_jobs is None:
n_jobs = min(os.cpu_count()//2, len(gene_dic))

def _aux_func(item):
return _query_celltype(item[1][:topnumber], item[0], background, provider, model, base_url, sys_prompt)
def _aux_func(args):
    # args is a pair: (id of the cluster to annotate, mapping of cluster -> gene list).
    target_id = args[0]
    clusters = args[1]
    # One line per cluster, each limited to the first `topnumber` genes, so the
    # prompt carries every cluster's genes alongside the one being queried.
    cluster_lines = []
    for name, genes in clusters.items():
        cluster_lines.append(f"cluster {name}: {','.join(genes[:topnumber])}")
    gene_txt = "\n".join(cluster_lines)
    return _query_celltype(
        target_id, gene_txt, len(clusters), background, provider, model, base_url, sys_prompt
    )

celltype_ls = []
with ThreadPoolExecutor(max_workers=n_jobs) as executor:
results = executor.map(_aux_func, gene_dic.items())
for (gsid, genes), res in zip(gene_dic.items(), results):
results = executor.map(_aux_func, [(k, gene_dic) for k in gene_dic.keys()])
for gsid, res in zip(gene_dic.values(), results):
res = res.strip("```").strip("'''").strip()
print(res, file=handle)
ctn = ul.get_celltype_name(res)
Expand Down
148 changes: 42 additions & 106 deletions gptbioinsightor/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,127 +42,63 @@
"""


LIKELY_CELLTYPE_PROMPT = """
Geneset {setid}:
```gene list
CELLTYPE_PROMPT = """
Input:
'''
Geneset:
{gene}
```
Hi, GPTBioInsightor! Please analyze the above geneset and determine three most likely celltypes based on the following INSTRUCTIONS.
INSTRUCTIONS:
0. Evaluate each gene individually, providing evidence and rationale for every potential cell type.
1. Prioritize cell-specific gene markers
2. Consider context-specific gene markers and samples/cells source within context (BACKGROUND)
3. Analyze the celltype context (BACKGROUND) to speculate on cell states, such as stress responses, invasiveness, proliferation rates, developmental stages, or other transient/dynamic properties.
4. Focus on positive evidence; avoid using the absence of markers as primary reasoning.
5. Evaluate the possibility of mixed cell populations or transitional states.
6. If applicable, note any unexpected gene combinations that might suggest novel cell states or types.
Context:
{background}. Here are {setnum} genesets for {setnum} different cell clusters up-regulated DEGs.
'''
BACKGROUND:
{background}
Please format your output as follows, without any additional content:
Hi, GPTBioInsightor! Please analyze Input and predict the celltypes of geneset cluster {setid} based on the following INSTRUCTIONS.
INSTRUCTIONS:
0. Analyze each gene in cluster {setid}, check cell-specific and context-specific markers
1. prioritize single Gold Standard marker or gene marker combinations for celltype prediction
2. Consider the context of Input for celltype prediction; e.g. tissue, disease, etc.
3. Integrate Context of Input to speculate on some novel insights
4. Focus on positive evidence, avoid using marker absence as primary reasoning.
5. Exclude celltypes with clear negative markers in the geneset.
6. Consider each cluster has different celltype prediction in most time, exclude celltypes represented by other cluster gene markers.
7. Consider one Optimal celltypes and two alternative celltypes.
Output Format (without any additional prompt or string):
'''
## Geneset {setid}:
## cluster geneset {setid}
### Gene List
```
[gene list]
[cluster {setid} gene list]
```
### Potential Cell Types
#### [CELLTYPE1]
**Gene Markers**:
- cell-specific: [CELL-SPECIFIC GENE MARKERS]
- context-specific: [CONTEXT-SPECIFIC GENE MARKERS]
**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE]
**Rationale**: [COMPREHENSIVE REASONING]
**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND]
#### [CELLTYPE2]
**Gene Markers**:
- cell-specific: [CELL-SPECIFIC GENE MARKERS]
- context-specific: [CONTEXT-SPECIFIC GENE MARKERS]
**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE]
**Rationale**: [COMPREHENSIVE REASONING]
**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND]
#### [CELLTYPE3]
**Gene Markers**:
- cell-specific: [CELL-SPECIFIC GENE MARKERS]
- context-specific: [CONTEXT-SPECIFIC GENE MARKERS]
**Evidence**: [DETAILED EVIDENCE SUPPORTING THIS CELL TYPE]
**Rationale**: [COMPREHENSIVE REASONING]
**Potential Cell State**: [SPECULATED STATE BASED ON BACKGROUND]
### Additional Observations
[ANY NOTEWORTHY PATTERNS, UNUSUAL GENE COMBINATIONS, OR POTENTIAL NOVEL INSIGHTS]
'''
"""


FINAL_CELLTYPE_PROMPT = """
Hi, GPTBioInsightor! Please determine the most likely cell type for each gene set from the provided potential cell types. Your analysis should be based on the following INSTRUCTIONS and context(BACKGROUND) information.
INSTRUCTIONS:
1. Provide comprehensive evidence and reasoning for the most likely cell type of each gene set with context(BACKGROUND).
2. Prioritize cell-specific and context-specific gene markers in your analysis.
3. Fully integrate the BACKGROUND information into your analysis to determine the most logical cell type.
4. Speculate on the cell state, considering factors such as stress response, invasiveness, proliferation rate, developmental stage, or other transient/dynamic properties.
5. Do not use the absence of markers as primary evidence; focus on positive evidence.
6. Exclude cell types with clear negative markers present in the gene set.
7. Evaluate the possibility of mixed cell populations or transitional states if the gene set suggests this.
8. Note any unusual gene combinations or expression patterns that might indicate novel cell states or types.
BACKGROUND:
{background}. Above are {geneset_num} genesets and their potential celltypes; the genes of each geneset are highly expressed relative to other gene sets.
For the output you should follow this format:
'''
### [geneset id] : [ CELLTYPE NAME]
**Gene Markers**:
- cell-specific: [CELL-SPECIFIC GENE MARKERS]
- context-specific: [CONTEXT-SPECIFIC GENE MARKERS]
**Evidence and Reasoning**:
1. [MAIN EVIDENCE POINT, like cell-specific marker]
2. [SECONDARY EVIDENCE POINT, like context-specific marker ]
3. [ADDITIONAL EVIDENCE POINTS AS NEEDED]
**Cell State/Subtype**: [SPECULATED CELL STATE OR SUBTYPE BASED ON BACKGROUND]
**Alternative Considerations**: [BRIEFLY MENTION OTHER CELL TYPES CONSIDERED AND WHY THEY WERE RULED OUT]
### [geneset id] : [ CELLTYPE NAME ]
### Celltype Prediction
#### Optimal Celltype: [OPTIMAL CELLTYPE NAME]
**Key Markers**:
- Cell-specific: [CELL-SPECIFIC MARKERS]
- Context-specific: [CONTEXT-SPECIFIC MARKERS]
**Gene Markers**:
- cell-specific: [CELL-SPECIFIC GENE MARKERS]
- context-specific: [CONTEXT-SPECIFIC GENE MARKERS]
**Evidence and Reasoning**
- [PRIMARY EVIDENCE]
- [SECONDARY EVIDENCE]
- [ADDITIONAL EVIDENCE AS NEEDED]
**Evidence and Reasoning**:
1. [MAIN EVIDENCE POINT, like cell-specific marker]
2. [SECONDARY EVIDENCE POINT, like context-specific marker ]
3. [ADDITIONAL EVIDENCE POINTS AS NEEDED]
**Validation**: [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE OPTIMAL CELLTYPE]
**Cell State/Subtype**: [SPECULATED CELL STATE OR SUBTYPE BASED ON BACKGROUND]
**Alternative Considerations**: [BRIEFLY MENTION OTHER CELL TYPES CONSIDERED AND WHY THEY WERE RULED OUT]
#### Alternative Considerations
- Alternative celltype1
- [WHY Alternative? Key MARKERS, Evidence and Reasoning]
- [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE Alternative celltype1]
[REPEAT FOR EACH GENE SET]
- Alternative celltype2
- [WHY Alternative? Key MARKERS, Evidence and Reasoning]
- [OTHER Gold Standard MARKERS(NOT IN Geneset {setid}) TO VALIDATE THE Alternative celltype2]
### Novel Insights
- [NOTEWORTHY PATTERNS]
- [CELL STATE]
- [POTENTIAL NEW FINDINGS]
'''
"""

Expand Down

0 comments on commit 2d0eeba

Please sign in to comment.