Update app.py
app.py
CHANGED
@@ -18,7 +18,10 @@ from threading import Thread
 import gptcache
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
-
+import nltk
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')

 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
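Note: the new import block calls nltk.download('stopwords') unconditionally at import time, so the corpus is re-checked on every cold start. A minimal sketch of a guarded variant (the lookup-before-download pattern is a suggestion, not part of this commit):

import nltk

# Fetch the stopwords corpus only if it is not already available locally.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)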
@@ -101,118 +104,150 @@ global_data = {
     'ssm_dt_rank': {},
     'ssm_dt_b_c_rms': {},
     'vocab_type': {},
-    'model_type': {}
+    'model_type': {},
+    "general.architecture": {},
+    "general.type": {},
+    "general.name": {},
+    "general.finetune": {},
+    "general.basename": {},
+    "general.size_label": {},
+    "general.license": {},
+    "general.license.link": {},
+    "general.tags": {},
+    "general.languages": {},
+    "general.organization": {},
+    "general.base_model.count": {},
+    'general.file_type': {},
+    "phi3.context_length": {},
+    "phi3.rope.scaling.original_context_length": {},
+    "phi3.embedding_length": {},
+    "phi3.feed_forward_length": {},
+    "phi3.block_count": {},
+    "phi3.attention.head_count": {},
+    "phi3.attention.head_count_kv": {},
+    "phi3.attention.layer_norm_rms_epsilon": {},
+    "phi3.rope.dimension_count": {},
+    "phi3.rope.freq_base": {},
+    "phi3.attention.sliding_window": {},
+    "phi3.rope.scaling.attn_factor": {},
+    "llama.block_count": {},
+    "llama.context_length": {},
+    "llama.embedding_length": {},
+    "llama.feed_forward_length": {},
+    "llama.attention.head_count": {},
+    "llama.attention.head_count_kv": {},
+    "llama.rope.freq_base": {},
+    "llama.attention.layer_norm_rms_epsilon": {},
+    "llama.attention.key_length": {},
+    "llama.attention.value_length": {},
+    "llama.vocab_size": {},
+    "llama.rope.dimension_count": {},
+    "deepseek2.block_count": {},
+    "deepseek2.context_length": {},
+    "deepseek2.embedding_length": {},
+    "deepseek2.feed_forward_length": {},
+    "deepseek2.attention.head_count": {},
+    "deepseek2.attention.head_count_kv": {},
+    "deepseek2.rope.freq_base": {},
+    "deepseek2.attention.layer_norm_rms_epsilon": {},
+    "deepseek2.expert_used_count": {},
+    "deepseek2.leading_dense_block_count": {},
+    "deepseek2.vocab_size": {},
+    "deepseek2.attention.kv_lora_rank": {},
+    "deepseek2.attention.key_length": {},
+    "deepseek2.attention.value_length": {},
+    "deepseek2.expert_feed_forward_length": {},
+    "deepseek2.expert_count": {},
+    "deepseek2.expert_shared_count": {},
+    "deepseek2.expert_weights_scale": {},
+    "deepseek2.rope.dimension_count": {},
+    "deepseek2.rope.scaling.type": {},
+    "deepseek2.rope.scaling.factor": {},
+    "deepseek2.rope.scaling.yarn_log_multiplier": {},
+    "qwen2.block_count": {},
+    "qwen2.context_length": {},
+    "qwen2.embedding_length": {},
+    "qwen2.feed_forward_length": {},
+    "qwen2.attention.head_count": {},
+    "qwen2.attention.head_count_kv": {},
+    "qwen2.rope.freq_base": {},
+    "qwen2.attention.layer_norm_rms_epsilon": {},
+    "general.version": {},
+    "general.datasets": {},
+    "tokenizer.ggml.model": {},
+    "tokenizer.ggml.pre": {},
+    "tokenizer.ggml.tokens": {},
+    "tokenizer.ggml.token_type": {},
+    "tokenizer.ggml.merges": {},
+    "tokenizer.ggml.bos_token_id": {},
+    "tokenizer.ggml.eos_token_id": {},
+    "tokenizer.ggml.unknown_token_id": {},
+    "tokenizer.ggml.padding_token_id": {},
+    "tokenizer.ggml.add_bos_token": {},
+    "tokenizer.ggml.add_eos_token": {},
+    "tokenizer.ggml.add_space_prefix": {},
+    "tokenizer.chat_template": {},
+    "quantize.imatrix.file": {},
+    "quantize.imatrix.dataset": {},
+    "quantize.imatrix.entries_count": {},
+    "quantize.imatrix.chunks_count": {},
+    "general.quantization_version": {},
+    'n_lora_q': {},
+    'n_lora_kv': {},
+    'n_expert_shared': {},
+    'n_ff_exp': {},
+    "n_layer_dense_lead": {},
+    "expert_weights_scale": {},
+    "rope_yarn_log_mul": {},
+    'model_type': {},
+    'eval': {},
+    'time': {},
+    'token': {},
+    'tokens': {},
+    'pads': {},
+    'model': {},
+    'base': {},
+    'model_base': {},
+    'perhaps': {},
+    'word': {},
+    'words': {},
+    'start': {},
+    'stop': {},
+    'run': {},
+    'runs': {},
+    'ms': {},
+    'vocabulary': {},
+    'timeout': {},
+    'load': {},
+    'load_time': {},
+    'bas': {},
+    'tok': {},
+    'second': {},
+    'seconds': {},
+    'graph': {},
+    'load_model': {},
+    'end': {},
+    'llama_perf_context_print': {},
+    'llm_load_print_meta': {}
 }

 model_configs = [
     {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"},
     {"repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF", "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf", "name": "Llama-3.2-3B-Instruct"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta-Llama-3.1-70B"},
-    {"repo_id": "
-    {"repo_id": "Ffftdtd5dtft/Codestral-22B-v0.1-Q2_K-GGUF", "filename": "codestral-22b-v0.1-q2_k.gguf", "name": "Codestral-22B-v0.1"},
-    {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM-13B-Uncensored"},
-    {"repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-72b-instruct-q2_k.gguf", "name": "Qwen2-Math-72B-Instruct"},
-    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3-mini-128k"},
-    {"repo_id": "Ffftdtd5dtft/DeepSeek-Coder-V2-Lite-Instruct-Q2_K-GGUF", "filename": "deepseek-coder-v2-lite-instruct-q2_k.gguf", "name": "DeepSeek-Coder-V2-Lite"},
-    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral-Nemo-Instruct-2407"}
+    {"repo_id": "Hhhbvvkgh/Heidi-Llama-v4-Q2_K-GGUF", "filename": "heidi-llama-v4-q2_k.gguf", "name": "Heidi-Llama-V4"}
 ]

-class ModelManager:
-    def __init__(self):
-        self.models = {}
-
-    def load_model(self, model_config):
-        if model_config['name'] not in self.models:
-            try:
-                self.models[model_config['name']] = Llama.from_pretrained(
-                    repo_id=model_config['repo_id'],
-                    filename=model_config['filename'],
-                    use_auth_token=HUGGINGFACE_TOKEN,
-                    n_threads=20,
-                    use_gpu=False
-                )
-            except Exception as e:
-                print(f"Error loading model {model_config['name']}: {e}")
-
-    def load_all_models(self):
-        loop = asyncio.get_event_loop()
-        tasks = [loop.run_in_executor(None, self.load_model, config) for config in model_configs]
-        loop.run_until_complete(asyncio.gather(*tasks))
-        return self.models
-
-model_manager = ModelManager()
-global_data['models'] = model_manager.load_all_models()
-
-def release_resources():
-    try:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        gc.collect()
-    except Exception as e:
-        print(f"Failed to release resources: {e}")
-
-def resource_manager():
-    MAX_RAM_PERCENT = 90
-    MAX_CPU_PERCENT = 90
-    MAX_GPU_PERCENT = 90
-
-    while True:
-        try:
-            virtual_mem = psutil.virtual_memory()
-            current_ram_percent = virtual_mem.percent
-
-            if current_ram_percent > MAX_RAM_PERCENT:
-                release_resources()
-
-            current_cpu_percent = psutil.cpu_percent()
-            if current_cpu_percent > MAX_CPU_PERCENT:
-                psutil.Process(os.getpid()).nice()
-
-            if torch.cuda.is_available():
-                gpu = torch.cuda.current_device()
-                gpu_mem = torch.cuda.memory_allocated(gpu) / (1024 * 1024)
-                total_gpu_mem = torch.cuda.get_device_properties(gpu).total_memory / (1024 * 1024)
-                gpu_mem_percent = (gpu_mem / total_gpu_mem) * 100
-
-                if gpu_mem_percent > MAX_GPU_PERCENT:
-                    release_resources()
-
-        except Exception as e:
-            print(f"Error in resource manager: {e}")
-
-def run_resource_manager():
-    resource_manager()
-
-Thread(target=run_resource_manager, daemon=True).start()
-
 def normalize_input(input_text):
-
-
-
-
-    unique_lines = []
-    seen_lines = set()
-    for line in lines:
-        if line not in seen_lines:
-            unique_lines.append(line)
-            seen_lines.add(line)
-    return '\n'.join(unique_lines)
-
-def get_best_response(responses):
-    responses = [response for response in responses if response and not set(response.lower().split()).intersection(ENGLISH_STOP_WORDS)]
-    if not responses:
-        return "No valid content generated."
-
-    vectorizer = TfidfVectorizer().fit_transform(responses)
-    similarity_matrix = cosine_similarity(vectorizer)
-    total_similarities = similarity_matrix.sum(axis=1)
-    best_response_index = total_similarities.argmax()
-    return responses[best_response_index]
+    stop_words = set(stopwords.words('english'))
+    words = input_text.split()
+    filtered_words = [word for word in words if word.lower() not in stop_words]
+    return " ".join(filtered_words)

 async def generate_model_response(model, inputs):
     try:
-        response = await model(inputs)
-        return
+        response = await model.generate(inputs)
+        return response
     except Exception as e:
         return ""

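For reference, the removed ModelManager loaded every entry of model_configs eagerly through llama-cpp-python's Hub helper. A self-contained sketch of loading a single config that way (the n_ctx and n_threads values are illustrative, and authentication for gated repos via HUGGINGFACE_TOKEN would go through huggingface_hub login, which is not shown):

from llama_cpp import Llama

# Download the GGUF file from the Hugging Face Hub (cached locally) and load it.
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF",
    filename="Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf",
    n_ctx=2048,    # illustrative context size
    n_threads=8,   # illustrative CPU thread count
)
print(llm("Hello", max_tokens=16)["choices"][0]["text"])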
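The rewritten normalize_input drops English stop words instead of de-duplicating repeated lines. A short usage sketch (the example string is illustrative):

from nltk.corpus import stopwords

def normalize_input(input_text):
    # Remove English stop words ("the", "is", "of", ...) from the prompt.
    stop_words = set(stopwords.words('english'))
    words = input_text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

print(normalize_input("What is the capital of France"))  # -> "capital France"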
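The commit also deletes get_best_response, which selected the most mutually similar candidate via TF-IDF and cosine similarity. A self-contained sketch of that selection step (the function name pick_most_central and the sample strings are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def pick_most_central(responses):
    # Embed all candidates with TF-IDF, then keep the one whose summed cosine
    # similarity to the others is highest, i.e. the most "central" answer.
    vectors = TfidfVectorizer().fit_transform(responses)
    similarity_matrix = cosine_similarity(vectors)
    return responses[similarity_matrix.sum(axis=1).argmax()]

print(pick_most_central([
    "Paris is the capital of France.",
    "The capital of France is Paris.",
    "I like trains.",
]))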
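generate_model_response now awaits model.generate(inputs). If the entries in global_data['models'] are llama_cpp.Llama instances, that call will fail (Llama.generate is a synchronous token-level generator, not a coroutine), so the except branch would always return "". A minimal sketch of one way to keep the coroutine non-blocking under that assumption (asyncio.to_thread and the max_tokens value are suggestions, not part of this commit):

import asyncio

async def generate_model_response(model, inputs):
    try:
        # Run the blocking llama_cpp.Llama call in a worker thread.
        result = await asyncio.to_thread(model, inputs, max_tokens=256)
        return result["choices"][0]["text"]
    except Exception:
        return ""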