# Hugging Face Space: michellemli / PINNACLE (status: Running)
# File size: 9,203 Bytes

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from collections import Counter


# Palette of CSS named colors handed out, in this order, to the selected
# categories in the scatter plots below.
# Names left out of the palette: "darkgray", "darkgrey", "slategray",
# "slategrey", "lightslategray", "lightslategrey", "lightgray", "lightgrey",
# "gray", "grey", "dimgray", "dimgrey", "darkslategray", "darkslategrey",
# "aliceblue", "black", "beige", "antiquewhite", "bisque", "blanchedalmond".
# NOTE(review): presumably excluded so highlighted points never resemble the
# "lightgrey" used for unselected points -- confirm with the author.
css_colors = [
    "darkmagenta", "darkolivegreen", "darkorange", "darkorchid",
    "darkred", "darksalmon", "darkseagreen", "darkslateblue",
    "darkturquoise", "darkviolet", "deeppink", "deepskyblue",
    "dodgerblue", "firebrick", "coral", "magenta",
    "maroon", "mediumaquamarine", "mediumblue", "mediumorchid",
    "mediumpurple", "mediumseagreen", "mediumslateblue", "mediumspringgreen",
    "mediumturquoise", "mediumvioletred", "midnightblue", "mintcream",
    "mistyrose", "moccasin", "navajowhite", "navy",
    "oldlace", "olive", "olivedrab", "orange",
    "orangered", "orchid", "aqua", "aquamarine",
    "azure", "blue", "blueviolet", "brown",
    "burlywood", "cadetblue", "chartreuse", "chocolate",
    "cornflowerblue", "cornsilk", "crimson", "cyan",
    "darkblue", "darkcyan", "darkgoldenrod", "darkgreen",
    "darkkhaki", "floralwhite", "forestgreen", "fuchsia",
    "gainsboro", "ghostwhite", "gold", "goldenrod",
    "green", "greenyellow", "honeydew", "hotpink",
    "indianred", "indigo", "ivory", "khaki",
    "lavender", "lavenderblush", "lawngreen", "lemonchiffon",
    "lightblue", "lightcoral", "lightcyan", "lightgoldenrodyellow",
    "lightgreen", "lightpink", "lightsalmon", "lightseagreen",
    "lightskyblue", "lightsteelblue", "lightyellow", "lime",
    "limegreen", "linen", "palegoldenrod", "palegreen",
    "paleturquoise", "palevioletred", "papayawhip", "peachpuff",
    "peru", "pink", "plum", "powderblue",
    "purple", "red", "rosybrown", "royalblue",
    "rebeccapurple", "saddlebrown", "salmon", "sandybrown",
    "seagreen", "seashell", "sienna", "silver",
    "skyblue", "slateblue", "snow", "springgreen",
    "steelblue", "tan", "teal", "thistle",
    "tomato", "turquoise", "violet", "wheat",
    "white", "whitesmoke", "yellow", "yellowgreen",
]

# Read data
# "data/inventory.txt" lists one tab-separated table per line; every listed
# table is loaded and stacked into the single `data` frame that the plotting
# helpers below read (they use its "Name", "Celltype", "x" and "y" columns).
frames = []
with open("data/inventory.txt", "r") as fin:
    for line in fin:
        table_path = line.strip()
        if not table_path:
            # A blank (e.g. trailing) line would otherwise become
            # pd.read_csv("") and abort start-up.
            continue
        frames.append(pd.read_csv(table_path, sep = "\t"))
# ignore_index gives the combined frame unique row labels instead of repeating
# each file's own 0..n-1 labels.
data = pd.concat(frames, ignore_index = True)

# Cell types offered in the dropdown: labels containing "CCI" or "BTO" are
# filtered out.
unique_celltypes = sorted([c for c in data["Celltype"].unique() if "CCI" not in c and "BTO" not in c])

# Per-cell-type SAFE statistics. The three files are tab-separated despite the
# ".csv" extension; their "Label" column is renamed to "Celltype" so they can
# be joined with each other and with the protein counts.
max_safe_scores = pd.read_csv("data/max_safe_scores.csv", sep = "\t").rename(columns = {"Score": "Max SAFE Score", "Label": "Celltype"})
mean_safe_scores = pd.read_csv("data/mean_safe_scores.csv", sep = "\t").rename(columns = {"Score": "Mean SAFE Score", "Label": "Celltype"})
neighborhood_enrichment = pd.read_csv("data/safe_neighborhoods_enriched.csv", sep = "\t").rename(columns = {"Label": "Celltype"})
safe_scores = max_safe_scores.merge(mean_safe_scores, on = "Celltype")
safe_scores = safe_scores.merge(neighborhood_enrichment, on = "Celltype")
print(safe_scores)

# Helper functions
def plot_protein_emb(protein):
    """Highlight every context in which one protein appears on the embedding.

    Args:
        protein: Protein name typed by the user. Matching against the "Name"
            column is case-insensitive and ignores surrounding whitespace.
            An empty or missing value matches nothing.

    Returns:
        A ``(fig, protein_context_df)`` tuple: the scatter figure in which
        matching rows are drawn as large stars coloured by cell type and all
        other rows as small light-grey circles, and a DataFrame with the
        "Name", "Celltype", "x" and "y" columns of the matching rows.
    """
    not_selected = "Not Selected"
    hover_keys = {"Name": True, "Celltype": True, "x": False, "y": False, "Selected": False}

    # Normalise the query once instead of once per row; stripping keeps a
    # stray space typed in the textbox from silently matching nothing.
    query = protein.strip().lower() if protein else ""

    p_data = data.copy()
    is_match = (p_data["Name"].str.lower() == query).to_numpy()
    p_data["Selected"] = np.where(is_match, p_data["Celltype"].to_numpy(), not_selected)
    p_data["Size"] = np.where(is_match, 10, 1)
    symbol_map = {1: "circle", 10: "star"}

    # Pair the distinct "Selected" values with palette entries in order; the
    # placeholder is skipped here and pinned to light grey afterwards.
    color_map = {c: i for c, i in zip(p_data["Selected"].unique(), css_colors) if c != not_selected}
    color_map[not_selected] = "lightgrey"

    fig = px.scatter(
        p_data,
        x = "x",
        y = "y",
        color = "Selected",
        color_discrete_map = color_map,
        symbol = "Size",
        symbol_map = symbol_map,
        size = "Size",
        opacity = 0.8,
        hover_data = hover_keys,
    )
    # Keyword arguments on purpose: update_layout's second positional
    # parameter is `overwrite`, so a second positional dict is not applied
    # as layout and paper_bgcolor would never be set.
    fig.update_layout(
        plot_bgcolor = "rgba(0, 0, 0, 0)",
        paper_bgcolor = "rgba(0, 0, 0, 0)",
        showlegend = False,
    )
    fig.update_xaxes(title_text = "", showticklabels = False)
    fig.update_yaxes(title_text = "", showticklabels = False)
    fig.update_traces(marker=dict(line=dict(width=0)))

    protein_context_df = p_data[p_data["Selected"] != not_selected][["Name", "Celltype", "x", "y"]]

    return fig, protein_context_df


def get_protein_counts(df):
    """Count rows per cell type and attach the SAFE statistics.

    Args:
        df: DataFrame with a "Celltype" column.

    Returns:
        DataFrame with one row per cell type, ordered by "Celltype", holding
        the row count in "Activated Proteins" followed by the columns of the
        module-level ``safe_scores`` table. The join is pandas' default inner
        merge, so cell types missing from ``safe_scores`` are dropped.
    """
    tally = Counter(df["Celltype"].tolist())
    per_celltype = pd.DataFrame(
        {
            "Celltype": list(tally.keys()),
            "Activated Proteins": list(tally.values()),
        }
    )
    summary = per_celltype.sort_values(by = "Celltype").merge(safe_scores, on = "Celltype")
    print(summary)
    return summary


def plot_celltype_emb(celltype):
    """Plot the embedding coloured by cell type, optionally highlighting some.

    Args:
        celltype: Cell-type names chosen in the dropdown (a list of strings;
            a single string or ``None`` is also accepted). If "All" is among
            them every row is coloured by its cell type; otherwise only rows
            whose "Celltype" matches a chosen name (case-insensitively) are
            coloured and the rest are drawn in light grey.

    Returns:
        A ``(fig, activated_proteins_df)`` tuple: the scatter figure and the
        per-cell-type summary produced by ``get_protein_counts`` for the
        displayed selection.
    """
    not_selected = "Not Selected"
    hover_keys = {"Name": True, "Celltype": True, "x": False, "y": False}

    # Normalise the selection: nothing chosen -> empty list; a bare string is
    # one name, not a sequence of characters.
    if celltype is None:
        chosen = []
    elif isinstance(celltype, str):
        chosen = [celltype]
    else:
        chosen = list(celltype)

    if "All" in chosen:
        fig = px.scatter(data, x = "x", y = "y", color = "Celltype", opacity = 0.4, hover_data = hover_keys)
        activated_proteins_df = get_protein_counts(data)
    else:
        hover_keys["Selected"] = False
        c_data = data.copy()

        # Palette entries are handed out in selection order, keyed by the
        # lower-cased name.
        color_by_key = dict(zip((c.lower() for c in chosen), css_colors))

        # Compare both sides lower-cased. Lower-casing only the selection
        # would never match a cell type stored with capital letters.
        labels = c_data["Celltype"].tolist()
        is_selected = [isinstance(c, str) and c.lower() in color_by_key for c in labels]
        c_data["Selected"] = [c if hit else not_selected for c, hit in zip(labels, is_selected)]
        selected_rows = c_data[c_data["Selected"] != not_selected]

        # Key the colours by the labels as they appear in the data, since
        # those are the values plotly looks up in the "Selected" column.
        color_map = {label: color_by_key[label.lower()] for label in selected_rows["Celltype"].unique()}
        color_map[not_selected] = "lightgrey"

        fig = px.scatter(c_data, x = "x", y = "y", color = "Selected", color_discrete_map = color_map, opacity = 0.8, hover_data = hover_keys)

        activated_proteins_df = get_protein_counts(selected_rows)

    # Keyword arguments on purpose: update_layout's second positional
    # parameter is `overwrite`, so a second positional dict is not applied
    # as layout and paper_bgcolor would never be set.
    fig.update_layout(
        plot_bgcolor = "rgba(0, 0, 0, 0)",
        paper_bgcolor = "rgba(0, 0, 0, 0)",
        showlegend = False,
    )
    fig.update_xaxes(title_text = "", showticklabels = False)
    fig.update_yaxes(title_text = "", showticklabels = False)
    return fig, activated_proteins_df


# Create gradio interface
# Layout: a title and introduction, two tabs ("Protein" and "Cell Type") that
# each hold an input, a Submit button, a plot and a collapsed table, and a
# footer with project links. The button callbacks are wired at the end of the
# block, once all the components they reference exist.
with gr.Blocks() as demo:
    gr.Markdown('<center><h1>Contextualizing Protein Representations with PINNACLE</h1></center>')
    # The literal below is continued with backslashes inside the quotes, so
    # the indentation of each continuation line is part of the string.
    gr.Markdown('Protein interaction networks are a critical component to study the function and therapeutic potential of proteins. \
                 However, accurately modeling protein interactions across diverse biological contexts, such as tissues and cell types, \
                 remains a significant challenge for existing algorithms. Here, we introduce <b>PINNACLE</b>, a flexible geometric deep learning approach \
                 that trains on contextualized protein interaction networks to generate context-aware protein representations. Leveraging a \
                 multi-organ single cell transcriptomic atlas of humans, <b>PINNACLE provides 394,760 protein representations split across 156 cell-type \
                 contexts from 24 tissues and organs</b>. Our contextualized protein representations, infused with cellular and tissue organization, \
                 can easily be adapted for diverse downstream tasks.')
    gr.Markdown(' For more information, please check out our manuscript and documentation (links provided at the bottom of the page)!')

    with gr.Tabs():

        # Tab 1: free-text protein lookup (pre-filled with "TNF").
        with gr.TabItem("Protein"):
            with gr.Column():
                gr.Markdown('<center><h3>Select protein of interest to examine across biological contexts</h3></center>')
                protein = gr.Textbox(info = "Enter a protein name (in HGNC symbol)", lines = 1, value = "TNF", label = "Protein")
                protein_submit_btn = gr.Button("Submit")

                gr.Markdown('<center><h3>Contextualized protein representations</h3></center>')
                protein_plot = gr.Plot()

                # Table of the matching rows, collapsed by default.
                with gr.Accordion(label = "Protein Contexts", open = False):
                    protein_context_df = gr.Dataframe(headers = ["Protein", "Celltype", "x", "y"], overflow_row_behaviour = "paginate")

        # Tab 2: multi-select over the cell types found in the data, plus an
        # "All" choice that is selected initially.
        with gr.TabItem("Cell Type"):
            with gr.Column():
                gr.Markdown('<center><h3>Select biological context by specifying cell type of interest</h3></center>')
                celltype = gr.Dropdown(["All"] + unique_celltypes, info = "Please select from the following cell types.", value = ["All"], multiselect = True, label="Cell Type")
                celltype_submit_btn = gr.Button("Submit")

                gr.Markdown('<center><h3>Contextualized protein representations</h3></center>')
                celltype_plot = gr.Plot()

                # Per-cell-type summary table, collapsed by default.
                with gr.Accordion(label = "Cell Type Context", open = False):
                    activated_proteins_df = gr.Dataframe(headers = ["Celltype", "Activated Proteins"], overflow_row_behaviour = "paginate")

    # Footer links (adjacent string literals are concatenated into one string).
    gr.Markdown("<p style='text-align: center'><a href='https://github.com/mims-harvard/PINNACLE'>Github Repo</a>" \
                "| <a href='https://zitniklab.hms.harvard.edu/projects/PINNACLE/'>Documentation</a> " \
                "| <a href='https://www.nature.com/articles/s41592-024-02341-3/'>Publication</a></p>")

    # Each Submit button calls its plotting helper with the tab's input and
    # sends the returned (figure, DataFrame) pair to the plot and the table.
    protein_submit_btn.click(plot_protein_emb, inputs = [protein], outputs = [protein_plot, protein_context_df])
    celltype_submit_btn.click(plot_celltype_emb, inputs = [celltype], outputs = [celltype_plot, activated_proteins_df])


# Launch
# Start the Gradio app only when this file is run as a script; importing the
# module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()