Spaces:
Running
Running
Hong Ong
committed on
Commit
·
c7b1d79
1
Parent(s):
02392d7
Implement One shot ACMG classification
Browse files
- .flake8 +3 -0
- .gitignore +181 -0
- app.py +161 -2
- prompt_engineering/__init__.py +0 -0
- prompt_engineering/acgs_prompt.py +75 -0
- prompt_engineering/acmg_prompt.py +88 -0
- prompt_engineering/knowlege_prompt.py +51 -0
- prompt_engineering/one_shot_prompt.py +50 -0
- prompt_engineering/quality_control_prompt.py +36 -0
- requirements.txt +33 -0
- tools/__init__.py +0 -0
- tools/acgs_points.py +210 -0
- tools/json_utils.py +26 -0
.flake8
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[flake8]
|
2 |
+
max-line-length = 200
|
3 |
+
extend-ignore = W291,E501
|
.gitignore
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# UV
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
#uv.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
116 |
+
.pdm.toml
|
117 |
+
.pdm-python
|
118 |
+
.pdm-build/
|
119 |
+
|
120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
121 |
+
__pypackages__/
|
122 |
+
|
123 |
+
# Celery stuff
|
124 |
+
celerybeat-schedule
|
125 |
+
celerybeat.pid
|
126 |
+
|
127 |
+
# SageMath parsed files
|
128 |
+
*.sage.py
|
129 |
+
|
130 |
+
# Environments
|
131 |
+
.env
|
132 |
+
.venv
|
133 |
+
env/
|
134 |
+
venv/
|
135 |
+
ENV/
|
136 |
+
env.bak/
|
137 |
+
venv.bak/
|
138 |
+
|
139 |
+
# Spyder project settings
|
140 |
+
.spyderproject
|
141 |
+
.spyproject
|
142 |
+
|
143 |
+
# Rope project settings
|
144 |
+
.ropeproject
|
145 |
+
|
146 |
+
# mkdocs documentation
|
147 |
+
/site
|
148 |
+
|
149 |
+
# mypy
|
150 |
+
.mypy_cache/
|
151 |
+
.dmypy.json
|
152 |
+
dmypy.json
|
153 |
+
|
154 |
+
# Pyre type checker
|
155 |
+
.pyre/
|
156 |
+
|
157 |
+
# pytype static type analyzer
|
158 |
+
.pytype/
|
159 |
+
|
160 |
+
# Cython debug symbols
|
161 |
+
cython_debug/
|
162 |
+
|
163 |
+
# PyCharm
|
164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
168 |
+
#.idea/
|
169 |
+
|
170 |
+
# Ruff stuff:
|
171 |
+
.ruff_cache/
|
172 |
+
|
173 |
+
# PyPI configuration file
|
174 |
+
.pypirc
|
175 |
+
.DS_Store
|
176 |
+
*.html
|
177 |
+
langfuse/
|
178 |
+
*.gz
|
179 |
+
*.pkl
|
180 |
+
*.csv
|
181 |
+
.idea/
|
app.py
CHANGED
@@ -1,4 +1,163 @@
|
|
|
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
st.write(x, 'squared is', x * x)
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
import streamlit as st
|
4 |
+
from langchain.schema import SystemMessage, HumanMessage
|
5 |
+
from langchain_openai import ChatOpenAI
|
6 |
+
from langfuse import Langfuse
|
7 |
+
from langfuse.callback import CallbackHandler
|
8 |
+
|
9 |
+
from prompt_engineering.one_shot_prompt import SYS_ONE_SHOT_PROMPT
|
10 |
+
from tools.json_utils import extract_json_from_response
|
11 |
+
|
12 |
+
# --- CONFIGURATION ---
|
13 |
+
langfuse = Langfuse()
|
14 |
+
langfuse_handler = CallbackHandler()
|
15 |
+
llm = ChatOpenAI(
|
16 |
+
temperature=0,
|
17 |
+
model_name="deepseek-chat",
|
18 |
+
openai_api_base=os.environ["OPENAI_API_BASE"],
|
19 |
+
openai_api_key=os.environ["OPENAI_API_KEY"],
|
20 |
+
callbacks=[langfuse_handler],
|
21 |
+
)
|
22 |
+
|
23 |
+
# --- SYSTEM PROMPT ---
|
24 |
+
system_prompt = SystemMessage(content=SYS_ONE_SHOT_PROMPT)
|
25 |
+
|
26 |
+
# --- STREAMLIT UI SETUP ---
|
27 |
+
st.set_page_config(page_title="ACMG/ACGS Variant Interpreter")
|
28 |
+
st.title("🔬 Germline Variant Classifier")
|
29 |
+
st.write("Enter a variant name to get ACMG and ACGS-based classification.")
|
30 |
+
|
31 |
+
variant = st.text_input("Variant (e.g. BRCA1 c.68_69delAG)")
|
32 |
+
|
33 |
+
if st.button("Interpret Variant"):
|
34 |
+
if not variant:
|
35 |
+
st.warning("Please enter a variant name.")
|
36 |
+
else:
|
37 |
+
with st.spinner("Analyzing with DeepSeek..."):
|
38 |
+
user_prompt = HumanMessage(content=f"Variant to interpret: {variant}")
|
39 |
+
response = llm([system_prompt, user_prompt])
|
40 |
+
try:
|
41 |
+
parsed = extract_json_from_response(response.content)
|
42 |
+
# Store the parsed response in session_state for later reference
|
43 |
+
st.session_state["llm_response"] = parsed
|
44 |
+
st.session_state["variant"] = variant
|
45 |
+
|
46 |
+
st.subheader("📄 ACMG Classification")
|
47 |
+
st.json(parsed.get("acmg", {}))
|
48 |
+
st.subheader("📄 ACGS Scoring")
|
49 |
+
st.json(parsed.get("acgs", {}))
|
50 |
+
st.subheader("✅ Final Consensus")
|
51 |
+
st.json(parsed.get("final_consensus", {}))
|
52 |
+
except Exception:
|
53 |
+
st.error(
|
54 |
+
"Failed to parse response. Please check the variant format or try again."
|
55 |
+
)
|
56 |
+
st.text(response.content)
|
57 |
+
|
58 |
+
# --- FEEDBACK SECTION ---
|
59 |
+
if "llm_response" in st.session_state:
|
60 |
+
st.markdown("## Provide Your Feedback")
|
61 |
+
st.write(
|
62 |
+
"If you see any inaccuracies or have corrections to the predictions, "
|
63 |
+
"please select your corrections below:"
|
64 |
+
)
|
65 |
+
|
66 |
+
# Corrected ACMG Classification (multi-select from 28 criteria)
|
67 |
+
acmg_criteria_options = [
|
68 |
+
"PVS1",
|
69 |
+
"PS1",
|
70 |
+
"PS2",
|
71 |
+
"PS3",
|
72 |
+
"PS4",
|
73 |
+
"PM1",
|
74 |
+
"PM2",
|
75 |
+
"PM3",
|
76 |
+
"PM4",
|
77 |
+
"PM5",
|
78 |
+
"PM6",
|
79 |
+
"PP1",
|
80 |
+
"PP2",
|
81 |
+
"PP3",
|
82 |
+
"PP4",
|
83 |
+
"PP5",
|
84 |
+
"BA1",
|
85 |
+
"BS1",
|
86 |
+
"BS2",
|
87 |
+
"BS3",
|
88 |
+
"BS4",
|
89 |
+
"BP1",
|
90 |
+
"BP2",
|
91 |
+
"BP3",
|
92 |
+
"BP4",
|
93 |
+
"BP5",
|
94 |
+
"BP6",
|
95 |
+
"BP7",
|
96 |
+
]
|
97 |
+
corrected_acmg = st.multiselect(
|
98 |
+
"Select Corrected ACMG Classification Criteria",
|
99 |
+
options=acmg_criteria_options,
|
100 |
+
help="Search and select the criteria you think apply.",
|
101 |
+
)
|
102 |
+
|
103 |
+
# Corrected ACGS Scoring (number input with step increment/decrement)
|
104 |
+
corrected_acgs_score = st.number_input(
|
105 |
+
"Enter Corrected ACGS Score",
|
106 |
+
min_value=-50,
|
107 |
+
max_value=50,
|
108 |
+
value=0,
|
109 |
+
step=1,
|
110 |
+
help="Use the arrow buttons to adjust the score.",
|
111 |
+
)
|
112 |
+
|
113 |
+
# Corrected Final Consensus (dropdown with 5 classifications)
|
114 |
+
consensus_options = [
|
115 |
+
"Pathogenic",
|
116 |
+
"Likely Pathogenic",
|
117 |
+
"Uncertain Significance",
|
118 |
+
"Likely Benign",
|
119 |
+
"Benign",
|
120 |
+
]
|
121 |
+
corrected_final_consensus = st.selectbox(
|
122 |
+
"Select Corrected Final Consensus Classification",
|
123 |
+
options=consensus_options,
|
124 |
+
help="Select the classification that you think is most appropriate.",
|
125 |
+
)
|
126 |
+
corrected_explanation = st.text_area(
|
127 |
+
"Provide Explanation for Corrections",
|
128 |
+
help="Explain why you made these corrections.",
|
129 |
+
placeholder="Your explanation here...",
|
130 |
+
height=100,
|
131 |
+
)
|
132 |
+
|
133 |
+
if st.button("Submit Feedback"):
|
134 |
+
# Create a dictionary with the feedback data
|
135 |
+
feedback_data = {
|
136 |
+
"variant": st.session_state.get("variant", variant),
|
137 |
+
"original_response": st.session_state["llm_response"],
|
138 |
+
"feedback": {
|
139 |
+
"acmg_criteria": corrected_acmg,
|
140 |
+
"acgs_score": corrected_acgs_score,
|
141 |
+
"final_consensus": corrected_final_consensus,
|
142 |
+
},
|
143 |
+
}
|
144 |
+
|
145 |
+
# Store the feedback as an item in a Langfuse dataset
|
146 |
+
langfuse.create_dataset_item(
|
147 |
+
dataset_name="deepva-dev-feedback-v1",
|
148 |
+
input={
|
149 |
+
"variant": feedback_data["variant"],
|
150 |
+
"original_response": feedback_data["original_response"],
|
151 |
+
},
|
152 |
+
expected_output={
|
153 |
+
"acmg_criteria": feedback_data["feedback"]["acmg_criteria"],
|
154 |
+
"acgs_score": feedback_data["feedback"]["acgs_score"],
|
155 |
+
"final_consensus": feedback_data["feedback"]["final_consensus"],
|
156 |
+
"explanation": corrected_explanation,
|
157 |
+
},
|
158 |
+
)
|
159 |
+
|
160 |
+
# Flush any queued events so they are sent to the Langfuse API
|
161 |
+
langfuse.flush()
|
162 |
|
163 |
+
st.success("Thank you for your feedback!")
|
|
prompt_engineering/__init__.py
ADDED
File without changes
|
prompt_engineering/acgs_prompt.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SYS_ACGS_PROMPT = """
|
2 |
+
You are the "ACGS Analysis Agent." Your task is to evaluate genetic variants according to the ACGS 2020 Best Practice Guidelines used in the UK. The ACGS guidelines are largely an extension of the ACMG 2015 rules but include specific modifications in how evidence is weighed and combined. In particular, note the following differences:
|
3 |
+
- A variant with one very strong evidence (e.g., PVS1) plus one moderate evidence is classified as Pathogenic (whereas ACMG might call it Likely Pathogenic).
|
4 |
+
- Two strong pieces of evidence will yield a Likely Pathogenic classification (instead of full Pathogenic).
|
5 |
+
|
6 |
+
You have access to the calculate_acgs_points tool, which accepts a list of criterion codes and returns a JSON object with a breakdown of ACGS points and the total score. The following table defines the point values and strengths for each criterion you must consider:
|
7 |
+
|
8 |
+
────────────────────────────────────────────────────────
|
9 |
+
Criterion | Classification Type | Strength | ACGS Points | Short Description
|
10 |
+
-----------------------------------------------------------------------------------
|
11 |
+
PVS1 | Pathogenic | Very Strong | 8 | Predicted null variant in a gene where LOF is a known mechanism
|
12 |
+
PS1 | Pathogenic | Strong | 4 | Same amino acid change as a known pathogenic variant
|
13 |
+
PS2 | Pathogenic | Strong | 4 | De novo (confirmed parentage) in a patient with the disease
|
14 |
+
PS3 | Pathogenic | Strong | 4 | Well-established functional studies show damaging effect
|
15 |
+
PS4 | Pathogenic | Strong | 4 | Significantly increased prevalence in affected individuals vs. controls
|
16 |
+
PM1 | Pathogenic | Moderate | 2 | Located in a critical functional domain/hot spot
|
17 |
+
PM2 | Pathogenic | Moderate | 2 | Absent (or extremely low frequency) in population databases
|
18 |
+
PM3 | Pathogenic | Moderate | 2 | For recessive disorders: in trans with a known pathogenic variant
|
19 |
+
PM4 | Pathogenic | Moderate | 2 | Protein length changes due to in-frame indels/stop-loss
|
20 |
+
PM5 | Pathogenic | Moderate | 2 | Novel missense change at a residue where a different pathogenic missense has been seen
|
21 |
+
PM6 | Pathogenic | Moderate | 2 | Assumed de novo without confirmation of parentage
|
22 |
+
PP1 | Pathogenic | Supporting | 1 | Cosegregation with disease in multiple affected family members
|
23 |
+
PP2 | Pathogenic | Supporting | 1 | Missense variant in a gene with a low benign missense rate and known disease mechanism
|
24 |
+
PP3 | Pathogenic | Supporting | 1 | Multiple computational predictions support a deleterious effect
|
25 |
+
PP4 | Pathogenic | Supporting | 1 | Patient phenotype or family history highly specific for this disease
|
26 |
+
PP5 | Pathogenic | Supporting | 1 | Reputable source reports variant as pathogenic without accessible data
|
27 |
+
BA1 | Benign | Stand-alone | -8 | Allele frequency >5% in general population databases
|
28 |
+
BS1 | Benign | Strong | -4 | Allele frequency higher than expected for the disorder
|
29 |
+
BS2 | Benign | Strong | -4 | Observed in healthy adult (homozygous or hemizygous) for fully penetrant disease
|
30 |
+
BS3 | Benign | Strong | -4 | Well-established functional studies show no damaging effect
|
31 |
+
BS4 | Benign | Strong | -4 | Lack of segregation in affected family members
|
32 |
+
BP1 | Benign | Supporting | -1 | Missense variant in a gene where truncating variants cause disease
|
33 |
+
BP2 | Benign | Supporting | -1 | Observed in trans with a pathogenic variant for a dominant disorder or in cis with a pathogenic variant
|
34 |
+
BP3 | Benign | Supporting | -1 | In-frame indel/dup in a repetitive region without known function
|
35 |
+
BP4 | Benign | Supporting | -1 | Multiple computational predictions support no impact
|
36 |
+
BP5 | Benign | Supporting | -1 | Variant found in a case with an alternate molecular explanation
|
37 |
+
BP6 | Benign | Supporting | -1 | Reputable source reports variant as benign without accessible data
|
38 |
+
BP7 | Benign | Supporting | -1 | Silent or non-coding variant with no predicted splice impact and not highly conserved
|
39 |
+
────────────────────────────────────────────────────────
|
40 |
+
|
41 |
+
Your workflow is as follows:
|
42 |
+
|
43 |
+
1. Analyze the provided variant evidence and determine which ACGS criteria (with corresponding codes) are met. For each piece of evidence, assign the appropriate criterion (e.g., "PVS1" for a null variant in a gene where LOF is known to cause disease, "PM2" for extremely low population frequency, etc.).
|
44 |
+
|
45 |
+
2. Call the calculate_acgs_points tool with the list of criteria codes you have identified. This tool will return a JSON object containing:
|
46 |
+
- "points_breakdown": A mapping of each criterion to its ACGS point value.
|
47 |
+
- "total_points": The cumulative score from all criteria.
|
48 |
+
|
49 |
+
3. Based on the total points, classify the variant using the following thresholds:
|
50 |
+
- Total points ≥ 8: "Pathogenic"
|
51 |
+
- Total points between 4 and 7: "Likely Pathogenic"
|
52 |
+
- Total points between -3 and 3: "Uncertain Significance"
|
53 |
+
- Total points between -7 and -4: "Likely Benign"
|
54 |
+
- Total points ≤ -8: "Benign"
|
55 |
+
|
56 |
+
4. Produce a final JSON output that includes:
|
57 |
+
- "criteria_met": An array of all criterion codes assigned.
|
58 |
+
- "points_breakdown": The detailed points received for each criterion.
|
59 |
+
- "total_points": The total score calculated.
|
60 |
+
- "classification": The final classification ("Pathogenic", "Likely Pathogenic", "Uncertain Significance", "Likely Benign", or "Benign").
|
61 |
+
- "justification": A detailed explanation of how each criterion contributed to the final score and classification.
|
62 |
+
|
63 |
+
Follow the ACGS 2020 guidelines strictly. Use only the evidence provided, and do not over-interpret ambiguous or inconclusive data.
|
64 |
+
|
65 |
+
When ready, call the tool calculate_acgs_points with the identified list of criteria codes to obtain the cumulative points and then provide your final classification with the required detailed justification.
|
66 |
+
|
67 |
+
Your final answer must be strictly in valid JSON format with the keys:
|
68 |
+
{
|
69 |
+
"criteria_met": [...],
|
70 |
+
"points_breakdown": {...},
|
71 |
+
"total_points": <integer>,
|
72 |
+
"classification": "<final_classification>",
|
73 |
+
"justification": "<detailed explanation>"
|
74 |
+
}
|
75 |
+
"""
|
prompt_engineering/acmg_prompt.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SYS_ACMG_PROMPT = """
|
2 |
+
You are the “ACMG Analysis Agent.” You apply the 2015 ACMG-AMP guidelines for genetic variant interpretation.
|
3 |
+
For each variant, you must carefully assess all provided evidence and classify the variant accordingly.
|
4 |
+
The evidence types are organized from population data to other data, and for each type, you should consider the strength of the evidence—ranging from strong benign indicators to very strong pathogenic signals.
|
5 |
+
Below are all 28 ACMG criteria you must consider:
|
6 |
+
|
7 |
+
──────────────────────────────────────────────────────────
|
8 |
+
1. Population Data
|
9 |
+
──────────────────────────────────────────────────────────
|
10 |
+
• BA1 (Stand-Alone Benign): Allele frequency is so high in population databases that the variant is incompatible with disease.
|
11 |
+
• BS1 (Strong Benign): Allele frequency is higher than expected for the disorder.
|
12 |
+
• BS2 (Strong Benign): Variant is observed in healthy individuals, counter to expectation.
|
13 |
+
• PM2 (Moderate Pathogenic): Variant is absent or at an extremely low frequency in population datasets.
|
14 |
+
• PS4 (Strong Pathogenic): The prevalence of the variant is significantly increased in affected individuals compared to controls.
|
15 |
+
|
16 |
+
──────────────────────────────────────────────────────────
|
17 |
+
2. Computational and Predictive Data
|
18 |
+
──────────────────────────────────────────────────────────
|
19 |
+
• BP1 (Supporting Benign): Missense variant in a gene for which only loss-of-function variants (not missense) cause disease.
|
20 |
+
• BP3 (Supporting Benign): In-frame deletions/insertions in a repetitive region without known functional impact.
|
21 |
+
• BP4 (Supporting Benign): Multiple computational tools predict no damaging impact.
|
22 |
+
• BP7 (Supporting Benign): For synonymous variants, no splicing impact predicted.
|
23 |
+
• PP3 (Supporting Pathogenic): Multiple in silico algorithms support a deleterious effect on the gene or protein.
|
24 |
+
• PM4 (Moderate Pathogenic): Protein length changes (in-frame deletions/insertions) in a non-repetitive region.
|
25 |
+
• PM5 (Moderate Pathogenic): A novel missense change at an amino acid residue where a different missense change is known to be pathogenic.
|
26 |
+
• PS1 (Strong Pathogenic): The amino acid change is identical to that of a previously established pathogenic variant (despite a different nucleotide change).
|
27 |
+
• PVS1 (Very Strong Pathogenic): Null variant (e.g., nonsense, frameshift, canonical ±1/2 splice sites, initiation codon loss, or single exon deletion) in a gene where loss-of-function is a known mechanism.
|
28 |
+
|
29 |
+
──────────────────────────────────────────────────────────
|
30 |
+
3. Functional Data
|
31 |
+
──────────────────────────────────────────────────────────
|
32 |
+
• BS3 (Strong Benign): Functional studies show no damaging effect.
|
33 |
+
• PP2 (Supporting Pathogenic): Missense variant in a gene that has a low rate of benign missense variation and where missense is a common mechanism of disease.
|
34 |
+
• PM1 (Moderate Pathogenic): Variant is located in a mutational hot spot or a critical functional domain with no benign variation.
|
35 |
+
• PS3 (Strong Pathogenic): Well-established functional studies support a damaging effect on the gene or protein.
|
36 |
+
|
37 |
+
──────────────────────────────────────────────────────────
|
38 |
+
4. Segregation Data
|
39 |
+
──────────────────────────────────────────────────────────
|
40 |
+
• BS4 (Strong Benign): Lack of segregation with the disease in affected family members.
|
41 |
+
• PP1 (Supporting Pathogenic): Co-segregation of the variant with disease in multiple affected family members.
|
42 |
+
|
43 |
+
──────────────────────────────────────────────────────────
|
44 |
+
5. De Novo data
|
45 |
+
──────────────────────────────────────────────────────────
|
46 |
+
• PM6 (Moderate Pathogenic): Assumed de novo, but without confirmation of both maternity and paternity.
|
47 |
+
• PS2 (Strong Pathogenic): Confirmed de novo occurrence (with maternity and paternity verified) in a patient with the disease.
|
48 |
+
|
49 |
+
──────────────────────────────────────────────────────────
|
50 |
+
6. Allelic Data
|
51 |
+
──────────────────────────────────────────────────────────
|
52 |
+
• BP2 (Supporting Benign): Observation of the variant in trans with a pathogenic variant for a fully penetrant dominant disorder.
|
53 |
+
• PM3 (Moderate Pathogenic): For recessive disorders, the variant is detected in trans with a pathogenic variant.
|
54 |
+
|
55 |
+
──────────────────────────────────────────────────────────
|
56 |
+
7. Other Database
|
57 |
+
──────────────────────────────────────────────────────────
|
58 |
+
• BP6 (Supporting Benign): A reputable source has reported the variant as benign with minimal provided evidence.
|
59 |
+
• PP5 (Supporting Pathogenic): A reputable source has reported the variant as pathogenic with minimal provided evidence.
|
60 |
+
|
61 |
+
|
62 |
+
──────────────────────────────────────────────────────────
|
63 |
+
8. Other data
|
64 |
+
──────────────────────────────────────────────────────────
|
65 |
+
• BP5 (Supporting Benign): Variant is found in a patient with another established molecular cause for the disease.
|
66 |
+
• PP4 (Supporting Pathogenic): Patient’s phenotype or family history is highly specific for a disease with a single genetic etiology.
|
67 |
+
|
68 |
+
──────────────────────────────────────────────────────────
|
69 |
+
Your job is to:
|
70 |
+
1. Identify which ACMG criteria are met based on the input evidence.
|
71 |
+
2. For each piece of evidence provided, assign the appropriate criterion code (e.g., “PVS1” for a null variant in a gene where loss-of-function is known to cause disease or “PM2” for extremely low population frequency), noting the strength of evidence.
|
72 |
+
3. Combine the criteria according to ACMG-AMP rules to arrive at one of the five final classifications:
|
73 |
+
- Pathogenic
|
74 |
+
- Likely Pathogenic
|
75 |
+
- Uncertain Significance (VUS)
|
76 |
+
- Likely Benign
|
77 |
+
- Benign
|
78 |
+
4. Output a structured explanation detailing which criteria were met and your final classification.
|
79 |
+
|
80 |
+
Return your answer strictly in JSON format with the following keys:
|
81 |
+
{
|
82 |
+
"criteria_met": ["PVS1", "PS1", "PM2", ...],
|
83 |
+
"classification": "Pathogenic",
|
84 |
+
"justification": "Detailed explanation describing the evaluation of each evidence type and how they combine to yield the final classification."
|
85 |
+
}
|
86 |
+
|
87 |
+
Follow the ACMG guidelines strictly. If certain pieces of evidence are not conclusive, do not over-interpret them.
|
88 |
+
"""
|
prompt_engineering/knowlege_prompt.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SYS_KNOWLEDGE_PROMPT = """
|
2 |
+
You are the "Knowledge Integration Agent." Your role is to gather and collate all relevant evidence for a given variant that the downstream ACMG and ACGS Analysis Agents require for their classification tasks. You should act as an information retrieval and preprocessing pipeline, integrating data from multiple sources. Please retrieve and structure the evidence as follows:
|
3 |
+
|
4 |
+
1. ClinVar Data:
|
5 |
+
- Retrieve any ClinVar information associated with the variant, such as clinical significance, review status (e.g., Expert Panel, number of stars), and any summary of supporting evidence.
|
6 |
+
- Output this information under the key "clinvar".
|
7 |
+
|
8 |
+
2. Population Frequencies:
|
9 |
+
- Determine the allele frequency of the variant from population databases like gnomAD or 1000 Genomes.
|
10 |
+
- Specify if the variant is absent or present, and include the percentage or count as applicable.
|
11 |
+
- Output this information under the key "frequency".
|
12 |
+
|
13 |
+
3. In Silico Predictive Scores:
|
14 |
+
- Gather predictive annotations from computational tools (e.g., PolyPhen, SIFT, CADD) that evaluate the potential effect of the variant.
|
15 |
+
- Output this information under the key "insilico".
|
16 |
+
|
17 |
+
4. Gene/Disease Relevance:
|
18 |
+
- Provide context about the gene in which the variant is located including its known disease associations, whether loss-of-function is a known disease mechanism, or any gene-level intolerance information.
|
19 |
+
- Output this data under the key "gene_disease_info".
|
20 |
+
|
21 |
+
5. Literature Evidence:
|
22 |
+
- Retrieve any relevant literature references or summaries (e.g., PubMed IDs and brief descriptions) where the variant has been described or evaluated.
|
23 |
+
- Output this information under the key "literature".
|
24 |
+
|
25 |
+
6. Variant Type:
|
26 |
+
- Summarize the variant type (e.g., missense, nonsense, frameshift) and its predicted functional consequence (e.g., loss-of-function, gain-of-function).
|
27 |
+
- Output this information under the key "variant_type".
|
28 |
+
|
29 |
+
Your final output must be strictly in valid JSON format with exactly the following keys:
|
30 |
+
{
|
31 |
+
"clinvar": "<ClinVar evidence summary>",
|
32 |
+
"frequency": "<Population frequency details>",
|
33 |
+
"insilico": "<In silico prediction results>",
|
34 |
+
"gene_disease_info": "<Gene and disease association details>",
|
35 |
+
"literature": "<Summary of relevant literature>",
|
36 |
+
"variant_type": "<Type and functional effect of the variant>"
|
37 |
+
}
|
38 |
+
|
39 |
+
For example, if you are processing a variant in the BRCA1 gene, an ideal output could be:
|
40 |
+
|
41 |
+
{
|
42 |
+
"clinvar": "Pathogenic (Expert Panel reviewed, 2019) for BRCA1-associated hereditary cancer",
|
43 |
+
"frequency": "gnomAD: 0.0% (not found in 141,000 alleles)",
|
44 |
+
"insilico": "PolyPhen: Probably Damaging; SIFT: Deleterious",
|
45 |
+
"gene_disease_info": "BRCA1 is linked to hereditary breast and ovarian cancer. LOF is a known disease mechanism.",
|
46 |
+
"literature": "PMID 30112345: Reported in two sisters with early-onset cancer",
|
47 |
+
"variant_type": "Frameshift leading to stop codon at position 23"
|
48 |
+
}
|
49 |
+
|
50 |
+
Do not include additional keys or commentary. Only use the reliable, retrievable evidence from the available sources. Ensure that your response is strictly formatted as valid JSON.
|
51 |
+
"""
|
prompt_engineering/one_shot_prompt.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SYS_ONE_SHOT_PROMPT = """
|
2 |
+
You are a clinical variant interpretation expert trained to annotate and classify germline genetic variants in humans according to both ACMG-AMP 2015 guidelines and ACGS 2020 best practice guidelines. Your role is to simulate an expert panel’s reasoning and deliver structured, explainable, and reproducible interpretations.
|
3 |
+
|
4 |
+
You will be given a variant name (e.g., \"BRCA1 c.68_69delAG\") as input.
|
5 |
+
|
6 |
+
Your tasks are:
|
7 |
+
|
8 |
+
1. Interpret Evidence:
|
9 |
+
- Infer or simulate relevant clinical evidence (if none is provided) based on known patterns from population data (e.g., gnomAD), ClinVar, in silico tools (e.g., SIFT, PolyPhen), and variant type (e.g., frameshift, missense, nonsense).
|
10 |
+
|
11 |
+
2. Apply ACMG-AMP 2015 Guidelines:
|
12 |
+
- List all applicable ACMG criteria (e.g., PVS1, PM2, PP3).
|
13 |
+
- Apply the ACMG classification logic to assign a final 5-tier classification.
|
14 |
+
- Provide an explanation of why each criterion was used.
|
15 |
+
|
16 |
+
3. Apply ACGS 2020 Guidelines:
|
17 |
+
- Assign ACGS points using the ACGS scoring system.
|
18 |
+
- Calculate the total score and assign a classification:
|
19 |
+
- ≥ 8: Pathogenic
|
20 |
+
- 4–7: Likely Pathogenic
|
21 |
+
- -3 to +3: VUS
|
22 |
+
- -4 to -7: Likely Benign
|
23 |
+
- ≤ -8: Benign
|
24 |
+
|
25 |
+
4. Generate Final Consensus:
|
26 |
+
- Reconcile ACMG and ACGS classifications.
|
27 |
+
- Provide a clear justification for the final call.
|
28 |
+
|
29 |
+
Return your answer strictly in the following JSON format:
|
30 |
+
{
|
31 |
+
\"variant\": \"<input_variant>\",
|
32 |
+
\"acmg\": {
|
33 |
+
\"criteria_met\": [...],
|
34 |
+
\"classification\": \"...\",
|
35 |
+
\"explanation\": \"...\"
|
36 |
+
},
|
37 |
+
\"acgs\": {
|
38 |
+
\"criteria_met\": [...],
|
39 |
+
\"points_breakdown\": {...},
|
40 |
+
\"total_points\": <int>,
|
41 |
+
\"classification\": \"...\",
|
42 |
+
\"explanation\": \"...\"
|
43 |
+
},
|
44 |
+
\"final_consensus\": {
|
45 |
+
\"classification\": \"...\",
|
46 |
+
\"justification\": \"...\"
|
47 |
+
}
|
48 |
+
}
|
49 |
+
Only use criteria supported by plausible or known evidence. If evidence is insufficient, respond accordingly with VUS.
|
50 |
+
"""
|
prompt_engineering/quality_control_prompt.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SYS_QUALITY_CONTROL_PROMPT = """
|
2 |
+
You are the "Quality Control & Consensus Agent." Your role is to act as a final arbiter for variant classification by reviewing and reconciling the independent outputs from the ACMG Analysis Agent, the ACGS Analysis Agent, and the Knowledge Integration Agent. Your responsibilities are twofold:
|
3 |
+
|
4 |
+
1. Discrepancy Resolution:
|
5 |
+
- If the ACMG and ACGS agents provide different classifications (e.g., one says "Likely Pathogenic" while the other says "Pathogenic"), analyze their rationales, the evidence each considered (e.g., which ACMG criteria or ACGS points were met), and any additional context provided by the Knowledge agent.
|
6 |
+
- Determine whether the evidence favors one classification over the other or if an intermediate outcome is justified. For instance, if ACMG suggests "Likely Pathogenic" (due to conservative criteria) but ACGS, using updated thresholds (e.g., one PVS1 plus one moderate evidence), supports "Pathogenic" and the Knowledge Agent confirms corroborative evidence (e.g., robust ClinVar pathogenic calls, absence in population databases), you should lean toward "Pathogenic."
|
7 |
+
- In cases where one agent assigns "Benign" and the other "Uncertain Significance," evaluate the benign or risk evidence (such as allele frequency, multiple benign criteria, or conflicting computational predictions) to decide if a conservative approach (such as "VUS" or "Likely Benign") is warranted.
|
8 |
+
|
9 |
+
2. Output Consolidation:
|
10 |
+
- Whether or not there is a discrepancy, synthesize a single, coherent final report by integrating the reasoning of both agents and the supporting evidence from the Knowledge Agent.
|
11 |
+
- Ensure that the final output reflects a balanced, evidence-based consensus.
|
12 |
+
- The final report must include:
|
13 |
+
- The final classification (one of: "Pathogenic", "Likely Pathogenic", "Uncertain Significance", "Likely Benign", or "Benign").
|
14 |
+
- A comprehensive explanation detailing which key criteria and pieces of evidence influenced the decision, how discrepancies (if any) were resolved, and why the final classification is justified.
|
15 |
+
|
16 |
+
Your final output must be strictly in valid JSON format with the following keys:
|
17 |
+
{
|
18 |
+
"final_classification": "<final classification>",
|
19 |
+
"explanation": "<detailed explanation of the decision, integrating outputs from ACMG, ACGS, and Knowledge agents>"
|
20 |
+
}
|
21 |
+
|
22 |
+
For example, if the ACMG agent outputs:
|
23 |
+
"Likely Pathogenic. Rationale: PVS1 (null variant in a disease gene) and PP1_Moderate (segregation in 3 affected individuals) met; no contradicting benign evidence,"
|
24 |
+
and the ACGS agent outputs:
|
25 |
+
"Pathogenic. Rationale: PVS1 and one moderate evidence suffice under updated ACGS rules,"
|
26 |
+
and the Knowledge agent corroborates with:
|
27 |
+
"ClinVar: 3-star Pathogenic; Population: absent in gnomAD; In silico: damaging; Variant Type: frameshift."
|
28 |
+
Then a correct consolidated output might be:
|
29 |
+
|
30 |
+
{
|
31 |
+
"final_classification": "Pathogenic",
|
32 |
+
"explanation": "Both agents indicate strong evidence for pathogenicity. Although ACMG guidelines suggest a Likely Pathogenic call due to conservative thresholds, the ACGS guidelines upgrade this combination to Pathogenic. The Knowledge Agent further reinforces this by confirming expert-rated pathogenicity and supportive population and in silico data. Therefore, based on integrated evidence, the final classification is Pathogenic."
|
33 |
+
}
|
34 |
+
|
35 |
+
Follow these instructions strictly and ensure your final output is concise, accurate, and evidence-based.
|
36 |
+
"""
|
requirements.txt
CHANGED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python 3.11
|
2 |
+
black
|
3 |
+
black[jupyter]
|
4 |
+
flake8
|
5 |
+
markdownify
|
6 |
+
smolagents
|
7 |
+
requests
|
8 |
+
duckduckgo_search
|
9 |
+
pandas
|
10 |
+
openai
|
11 |
+
opentelemetry-sdk
|
12 |
+
opentelemetry-exporter-otlp
|
13 |
+
openinference-instrumentation-smolagents
|
14 |
+
geopandas
|
15 |
+
plotly
|
16 |
+
shapely
|
17 |
+
numpy
|
18 |
+
scikit-learn
|
19 |
+
|
20 |
+
#
|
21 |
+
llama-index
|
22 |
+
llama-index-llms-openai
|
23 |
+
llama-index-utils-workflow
|
24 |
+
langfuse
|
25 |
+
matplotlib
|
26 |
+
|
27 |
+
#
|
28 |
+
langgraph
|
29 |
+
langchain
|
30 |
+
langchain-community
|
31 |
+
langchain-openai
|
32 |
+
streamlit
|
33 |
+
seaborn
|
tools/__init__.py
ADDED
File without changes
|
tools/acgs_points.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module provides a function to calculate ACGS points for a given variant based on
|
3 |
+
the ACGS 2020 Best Practice Guidelines. Each criterion (e.g., PVS1, PS1, BS1, BP1, etc.)
|
4 |
+
is mapped to a point score reflecting its weight in the ACGS system.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from typing import List, Dict
|
8 |
+
|
9 |
+
# Mapping of criteria to their ACGS points, strengths, and short descriptions.
|
10 |
+
ACGS_CRITERIA = {
    # --- Pathogenic evidence (positive points) ---
    "PVS1": {
        "score": 8,
        "classification": "Pathogenic",
        "strength": "Very Strong",
        "description": "Predicted null variant in a gene where LOF is a known mechanism",
    },
    "PS1": {
        "score": 4,
        "classification": "Pathogenic",
        "strength": "Strong",
        "description": "Same amino acid change as a known pathogenic variant",
    },
    "PS2": {
        "score": 4,
        "classification": "Pathogenic",
        "strength": "Strong",
        "description": "De novo (confirmed parentage) in a patient with the disease",
    },
    "PS3": {
        "score": 4,
        "classification": "Pathogenic",
        "strength": "Strong",
        "description": "Well-established functional studies show damaging effect",
    },
    "PS4": {
        "score": 4,
        "classification": "Pathogenic",
        "strength": "Strong",
        "description": "Significantly increased prevalence in affected individuals vs. controls",
    },
    "PM1": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "Located in a critical functional domain/hot spot",
    },
    "PM2": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "Absent (or extremely low frequency) in population databases",
    },
    "PM3": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "For recessive disorders: in trans with a known pathogenic variant",
    },
    "PM4": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "Protein length changes due to in-frame indels/stop-loss",
    },
    "PM5": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "Novel missense change at a residue where a different pathogenic missense has been seen",
    },
    "PM6": {
        "score": 2,
        "classification": "Pathogenic",
        "strength": "Moderate",
        "description": "Assumed de novo without confirmation of parentage",
    },
    "PP1": {
        "score": 1,
        "classification": "Pathogenic",
        "strength": "Supporting",
        "description": "Cosegregation with disease in multiple affected family members",
    },
    "PP2": {
        "score": 1,
        "classification": "Pathogenic",
        "strength": "Supporting",
        "description": "Missense variant in a gene with low benign missense rate and known disease mechanism",
    },
    "PP3": {
        "score": 1,
        "classification": "Pathogenic",
        "strength": "Supporting",
        "description": "Multiple computational predictions support a deleterious effect",
    },
    "PP4": {
        "score": 1,
        "classification": "Pathogenic",
        "strength": "Supporting",
        "description": "Patient phenotype or family history highly specific for this disease",
    },
    "PP5": {
        "score": 1,
        "classification": "Pathogenic",
        "strength": "Supporting",
        "description": "Reputable source reports variant as pathogenic without accessible data",
    },
    # --- Benign evidence (negative points) ---
    "BA1": {
        "score": -8,
        "classification": "Benign",
        "strength": "Stand-alone",
        "description": "Allele frequency >5% in general population databases",
    },
    "BS1": {
        "score": -4,
        "classification": "Benign",
        "strength": "Strong",
        "description": "Allele frequency higher than expected for disorder",
    },
    "BS2": {
        "score": -4,
        "classification": "Benign",
        "strength": "Strong",
        "description": "Observed in healthy adult (homozygous or hemizygous) for fully penetrant disease",
    },
    "BS3": {
        "score": -4,
        "classification": "Benign",
        "strength": "Strong",
        "description": "Well-established functional studies show no damaging effect",
    },
    "BS4": {
        "score": -4,
        "classification": "Benign",
        "strength": "Strong",
        "description": "Lack of segregation in affected family members",
    },
    "BP1": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Missense variant in a gene where truncating variants cause disease",
    },
    "BP2": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Observed in trans with a pathogenic variant for a dominant disorder or in cis with a pathogenic variant",
    },
    "BP3": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "In-frame indel/dup in a repetitive region without known function",
    },
    "BP4": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Multiple computational predictions support no impact",
    },
    "BP5": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Variant found in a case with an alternate molecular explanation",
    },
    "BP6": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Reputable source reports variant as benign without accessible data",
    },
    "BP7": {
        "score": -1,
        "classification": "Benign",
        "strength": "Supporting",
        "description": "Silent or non-coding variant with no predicted splice impact and not highly conserved",
    },
}


def calculate_acgs_points(criteria_list: List[str]) -> Dict[str, object]:
    """
    Calculate the cumulative ACGS points for a variant from its criterion codes.

    Each distinct code contributes its ``ACGS_CRITERIA`` score exactly once, so
    ``total_points`` always equals the sum of ``points_breakdown``. Codes that
    are not in ``ACGS_CRITERIA`` are kept in the breakdown with 0 points so the
    caller can see that they were not recognised.

    Parameters:
        criteria_list (List[str]): Criterion codes (e.g., ["PVS1", "PM2", "BS1"]).
            ``None`` is treated as an empty list.

    Returns:
        Dict[str, object]: A dictionary with the per-criterion points and the total.
        Example:
            {
                "points_breakdown": {"PVS1": 8, "PM2": 2},
                "total_points": 10
            }
    """
    points_breakdown: Dict[str, int] = {}

    for criterion in criteria_list or ():
        # A repeated code must not be scored twice: the breakdown can only hold
        # one entry per code, and the total has to stay consistent with it.
        if criterion in points_breakdown:
            continue
        entry = ACGS_CRITERIA.get(criterion)
        # Unknown codes are flagged with 0 points rather than dropped.
        points_breakdown[criterion] = entry["score"] if entry is not None else 0

    return {
        "points_breakdown": points_breakdown,
        "total_points": sum(points_breakdown.values()),
    }
|
tools/json_utils.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
|
4 |
+
def _strip_code_fence(content: str) -> str:
    """Return *content* without a surrounding Markdown code fence.

    Surrounding whitespace is ignored. An opening fence is removed together
    with an optional ``json`` language tag; a closing fence is removed
    independently of whether an opening one was present. Only the outermost
    markers are touched, so fence-like text inside the payload is preserved.
    """
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that directly follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def extract_json_from_response(content: str):
    """Parse the JSON payload of a model response.

    The response may be bare JSON or JSON wrapped in a Markdown code fence
    (with or without a ``json`` tag, with or without surrounding whitespace).

    Parameters:
        content (str): Raw response text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the text left after removing the fence is not valid JSON.
    """
    return json.loads(_strip_code_fence(content))


def extract_classification(response_content: str) -> str:
    """Return the final consensus classification from a model response.

    Parameters:
        response_content (str): Raw response text, expected to contain a JSON
            object with a ``final_consensus.classification`` entry.

    Returns:
        str: The classification string, or "ParsingError" when the response
        cannot be parsed or does not contain the expected keys.
    """
    try:
        parsed = extract_json_from_response(response_content)
        return parsed["final_consensus"]["classification"]
    except Exception as e:
        # Deliberately broad: this is a best-effort helper that reports any
        # failure through the "ParsingError" sentinel instead of raising.
        print(f"Error parsing classification: {e}")
        return "ParsingError"
|