# TransparentGPT / methods.py
import os
import chainlit as cl
from langchain.memory.buffer import ConversationBufferMemory
from langchain_openai import ChatOpenAI, OpenAI
from langchain.chains import LLMChain
from prompts import default_prompt_template, doctor_prompt_template, default_prompt_template_no_sources, doctor_prompt_template_no_sources
from dotenv import load_dotenv
from chainlit.input_widget import Select, Switch, Slider
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from math import exp
import numpy as np
from typing import Any, Dict, List, Tuple
from langchain_core.output_parsers import BaseOutputParser
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
# Load environment variables (e.g. NEBIUS_API_KEY) from a local .env file.
load_dotenv()

# Llama 3.3 70B served through Nebius's OpenAI-compatible endpoint; logprobs
# are bound so token-level confidence can be computed downstream.
llm = ChatOpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
    model="meta-llama/Llama-3.3-70B-Instruct",
    temperature=0.7,
).bind(logprobs=True)
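
# Smoke-test sketch (hypothetical prompt; requires NEBIUS_API_KEY to be set):
#
#   reply = llm.invoke("Say hello in one word.")
#   print(reply.content, reply.response_metadata.get("logprobs"))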

def get_wikipedia_page_content(page_title):
    """Fetch the raw wikitext of a Wikipedia article via the Revisions API."""
    # MediaWiki page titles use underscores in place of spaces.
    page_title = re.sub(r"\s+", "_", page_title.strip())
    params = {
        "action": "query", "format": "json", "prop": "revisions",
        "titles": page_title, "formatversion": 2,
        "rvprop": "content", "rvslots": "*",
    }
    response = requests.get("https://en.wikipedia.org/w/api.php", params=params)
    data = response.json()
    return data["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]
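
# Usage sketch ("Machine learning" is an arbitrary article title chosen for
# illustration):
#
#   wikitext = get_wikipedia_page_content("Machine learning")
#   print(wikitext[:300])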

def test_scrape_sim(link, response):
    """Score how closely `response` matches the linked Wikipedia article:
    TF-IDF cosine similarity between article text and response, scaled to 0-100."""
    tfidf_vectorizer = TfidfVectorizer()
    try:
        # The article title is the last path segment of the Wikipedia URL.
        title = link[link.rfind("/") + 1:]
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            [get_wikipedia_page_content(title), response]
        )
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return cosine_sim * 100
    except Exception:
        # Treat unreachable or unparsable pages as zero similarity.
        return 0
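
# Usage sketch (the link and response text are hypothetical examples):
#
#   score = test_scrape_sim(
#       "https://en.wikipedia.org/wiki/Machine_learning",
#       "Machine learning is a field of study in artificial intelligence...",
#   )
#   print(f"Source similarity: {score:.1f}%")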

config_file = "config.json"

def get_config():
    """Read the current configuration from config.json."""
    with open(config_file, "r") as file:
        return json.load(file)

def update_config(new_value):
    """Persist a new number of retrieval sources to config.json."""
    config = get_config()
    config["num_sources"] = new_value
    with open(config_file, "w") as file:
        json.dump(config, file, indent=4)

def load_config():
    """Alias of get_config()."""
    return get_config()
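
# Minimal sketch of the config.json shape, inferred from the keys read and
# written above (the default value of 3 is an assumption): creates the file
# if it does not exist yet.
def ensure_config(default_num_sources=3):
    if not os.path.exists(config_file):
        with open(config_file, "w") as file:
            json.dump({"num_sources": default_num_sources}, file, indent=4)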

def generate_hypothetical_answer(question: str) -> str:
    """Have the LLM generate a hypothetical (HyDE-style) answer to improve retrieval."""
    prompt = PromptTemplate(
        input_variables=["question"],
        template="""
        You are an AI assistant tasked with generating a hypothetical answer to the following question. Your answer should be detailed and comprehensive,
        as if you had access to all relevant information. This hypothetical answer will be used to improve document retrieval, so include key terms and concepts
        that might be relevant. Do not include phrases like "I think" or "It's possible that" - present the information as if it were factual.
        Question: {question}
        Hypothetical answer:
        """,
    )
    # ChatOpenAI.invoke returns an AIMessage; return its text content.
    return llm.invoke(prompt.format(question=question)).content
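
# Usage sketch (hypothetical question, for illustration): the generated answer
# is embedded in place of the raw question during retrieval.
#
#   hyde_answer = generate_hypothetical_answer("What are common causes of fatigue?")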

def highest_log_prob(vals):
    """Turn token logprobs into a 0-100 confidence score: the exponential of the
    mean log-probability, i.e. the geometric mean token probability."""
    logprobs = [token["logprob"] for token in vals]
    average_log_prob = sum(logprobs) / len(logprobs)
    return np.round(np.exp(average_log_prob) * 100, 2)
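
# Sketch of wiring this to the bound llm above, assuming the OpenAI-compatible
# logprobs layout that langchain_openai exposes (a list of
# {"token": ..., "logprob": ...} dicts under
# response.response_metadata["logprobs"]["content"]); this helper is
# illustrative, not part of the original module.
def response_confidence(response):
    tokens = (response.response_metadata.get("logprobs") or {}).get("content", [])
    return highest_log_prob(tokens) if tokens else 0.0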