Hjgugugjhuhjggg committed on
Commit
4f047bc
·
verified ·
1 Parent(s): db25a46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -101
app.py CHANGED
@@ -18,7 +18,10 @@ from threading import Thread
18
  import gptcache
19
  from sklearn.metrics.pairwise import cosine_similarity
20
  from sklearn.feature_extraction.text import TfidfVectorizer
21
- from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
 
 
 
22
 
23
  load_dotenv()
24
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -101,118 +104,150 @@ global_data = {
101
  'ssm_dt_rank': {},
102
  'ssm_dt_b_c_rms': {},
103
  'vocab_type': {},
104
- 'model_type': {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  }
106
 
107
  model_configs = [
108
  {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"},
109
  {"repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF", "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf", "name": "Llama-3.2-3B-Instruct"},
110
  {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta-Llama-3.1-70B"},
111
- {"repo_id": "bartowski/QwQ-32B-Preview-GGUF", "filename": "QwQ-32B-Preview-Q2_K.gguf", "name": "QwQ-32B-Preview"},
112
- {"repo_id": "Ffftdtd5dtft/Codestral-22B-v0.1-Q2_K-GGUF", "filename": "codestral-22b-v0.1-q2_k.gguf", "name": "Codestral-22B-v0.1"},
113
- {"repo_id": "Ffftdtd5dtft/WizardLM-13B-Uncensored-Q2_K-GGUF", "filename": "wizardlm-13b-uncensored-q2_k.gguf", "name": "WizardLM-13B-Uncensored"},
114
- {"repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF", "filename": "qwen2-math-72b-instruct-q2_k.gguf", "name": "Qwen2-Math-72B-Instruct"},
115
- {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3-mini-128k"},
116
- {"repo_id": "Ffftdtd5dtft/DeepSeek-Coder-V2-Lite-Instruct-Q2_K-GGUF", "filename": "deepseek-coder-v2-lite-instruct-q2_k.gguf", "name": "DeepSeek-Coder-V2-Lite"},
117
- {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral-Nemo-Instruct-2407"}
118
  ]
119
 
120
- class ModelManager:
121
- def __init__(self):
122
- self.models = {}
123
-
124
- def load_model(self, model_config):
125
- if model_config['name'] not in self.models:
126
- try:
127
- self.models[model_config['name']] = Llama.from_pretrained(
128
- repo_id=model_config['repo_id'],
129
- filename=model_config['filename'],
130
- use_auth_token=HUGGINGFACE_TOKEN,
131
- n_threads=20,
132
- use_gpu=False
133
- )
134
- except Exception as e:
135
- print(f"Error loading model {model_config['name']}: {e}")
136
-
137
- def load_all_models(self):
138
- loop = asyncio.get_event_loop()
139
- tasks = [loop.run_in_executor(None, self.load_model, config) for config in model_configs]
140
- loop.run_until_complete(asyncio.gather(*tasks))
141
- return self.models
142
-
143
- model_manager = ModelManager()
144
- global_data['models'] = model_manager.load_all_models()
145
-
146
- def release_resources():
147
- try:
148
- if torch.cuda.is_available():
149
- torch.cuda.empty_cache()
150
- gc.collect()
151
- except Exception as e:
152
- print(f"Failed to release resources: {e}")
153
-
154
- def resource_manager():
155
- MAX_RAM_PERCENT = 90
156
- MAX_CPU_PERCENT = 90
157
- MAX_GPU_PERCENT = 90
158
-
159
- while True:
160
- try:
161
- virtual_mem = psutil.virtual_memory()
162
- current_ram_percent = virtual_mem.percent
163
-
164
- if current_ram_percent > MAX_RAM_PERCENT:
165
- release_resources()
166
-
167
- current_cpu_percent = psutil.cpu_percent()
168
- if current_cpu_percent > MAX_CPU_PERCENT:
169
- psutil.Process(os.getpid()).nice()
170
-
171
- if torch.cuda.is_available():
172
- gpu = torch.cuda.current_device()
173
- gpu_mem = torch.cuda.memory_allocated(gpu) / (1024 * 1024)
174
- total_gpu_mem = torch.cuda.get_device_properties(gpu).total_memory / (1024 * 1024)
175
- gpu_mem_percent = (gpu_mem / total_gpu_mem) * 100
176
-
177
- if gpu_mem_percent > MAX_GPU_PERCENT:
178
- release_resources()
179
-
180
- except Exception as e:
181
- print(f"Error in resource manager: {e}")
182
-
183
- def run_resource_manager():
184
- resource_manager()
185
-
186
- Thread(target=run_resource_manager, daemon=True).start()
187
-
188
  def normalize_input(input_text):
189
- return input_text.strip()
190
-
191
- def remove_duplicates(text):
192
- lines = text.split('\n')
193
- unique_lines = []
194
- seen_lines = set()
195
- for line in lines:
196
- if line not in seen_lines:
197
- unique_lines.append(line)
198
- seen_lines.add(line)
199
- return '\n'.join(unique_lines)
200
-
201
- def get_best_response(responses):
202
- responses = [response for response in responses if response and not set(response.lower().split()).intersection(ENGLISH_STOP_WORDS)]
203
- if not responses:
204
- return "No valid content generated."
205
-
206
- vectorizer = TfidfVectorizer().fit_transform(responses)
207
- similarity_matrix = cosine_similarity(vectorizer)
208
- total_similarities = similarity_matrix.sum(axis=1)
209
- best_response_index = total_similarities.argmax()
210
- return responses[best_response_index]
211
 
212
  async def generate_model_response(model, inputs):
213
  try:
214
- response = await model(inputs)
215
- return remove_duplicates(response['choices'][0]['text'])
216
  except Exception as e:
217
  return ""
218
 
 
18
  import gptcache
19
  from sklearn.metrics.pairwise import cosine_similarity
20
  from sklearn.feature_extraction.text import TfidfVectorizer
21
+ import nltk
22
+ from nltk.corpus import stopwords
23
+
24
+ nltk.download('stopwords')
25
 
26
  load_dotenv()
27
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
104
  'ssm_dt_rank': {},
105
  'ssm_dt_b_c_rms': {},
106
  'vocab_type': {},
107
+ 'model_type': {},
108
+ "general.architecture": {},
109
+ "general.type": {},
110
+ "general.name": {},
111
+ "general.finetune": {},
112
+ "general.basename": {},
113
+ "general.size_label": {},
114
+ "general.license": {},
115
+ "general.license.link": {},
116
+ "general.tags": {},
117
+ "general.languages": {},
118
+ "general.organization": {},
119
+ "general.base_model.count": {},
120
+ 'general.file_type': {},
121
+ "phi3.context_length": {},
122
+ "phi3.rope.scaling.original_context_length": {},
123
+ "phi3.embedding_length": {},
124
+ "phi3.feed_forward_length": {},
125
+ "phi3.block_count": {},
126
+ "phi3.attention.head_count": {},
127
+ "phi3.attention.head_count_kv": {},
128
+ "phi3.attention.layer_norm_rms_epsilon": {},
129
+ "phi3.rope.dimension_count": {},
130
+ "phi3.rope.freq_base": {},
131
+ "phi3.attention.sliding_window": {},
132
+ "phi3.rope.scaling.attn_factor": {},
133
+ "llama.block_count": {},
134
+ "llama.context_length": {},
135
+ "llama.embedding_length": {},
136
+ "llama.feed_forward_length": {},
137
+ "llama.attention.head_count": {},
138
+ "llama.attention.head_count_kv": {},
139
+ "llama.rope.freq_base": {},
140
+ "llama.attention.layer_norm_rms_epsilon": {},
141
+ "llama.attention.key_length": {},
142
+ "llama.attention.value_length": {},
143
+ "llama.vocab_size": {},
144
+ "llama.rope.dimension_count": {},
145
+ "deepseek2.block_count": {},
146
+ "deepseek2.context_length": {},
147
+ "deepseek2.embedding_length": {},
148
+ "deepseek2.feed_forward_length": {},
149
+ "deepseek2.attention.head_count": {},
150
+ "deepseek2.attention.head_count_kv": {},
151
+ "deepseek2.rope.freq_base": {},
152
+ "deepseek2.attention.layer_norm_rms_epsilon": {},
153
+ "deepseek2.expert_used_count": {},
154
+ "deepseek2.leading_dense_block_count": {},
155
+ "deepseek2.vocab_size": {},
156
+ "deepseek2.attention.kv_lora_rank": {},
157
+ "deepseek2.attention.key_length": {},
158
+ "deepseek2.attention.value_length": {},
159
+ "deepseek2.expert_feed_forward_length": {},
160
+ "deepseek2.expert_count": {},
161
+ "deepseek2.expert_shared_count": {},
162
+ "deepseek2.expert_weights_scale": {},
163
+ "deepseek2.rope.dimension_count": {},
164
+ "deepseek2.rope.scaling.type": {},
165
+ "deepseek2.rope.scaling.factor": {},
166
+ "deepseek2.rope.scaling.yarn_log_multiplier": {},
167
+ "qwen2.block_count": {},
168
+ "qwen2.context_length": {},
169
+ "qwen2.embedding_length": {},
170
+ "qwen2.feed_forward_length": {},
171
+ "qwen2.attention.head_count": {},
172
+ "qwen2.attention.head_count_kv": {},
173
+ "qwen2.rope.freq_base": {},
174
+ "qwen2.attention.layer_norm_rms_epsilon": {},
175
+ "general.version": {},
176
+ "general.datasets": {},
177
+ "tokenizer.ggml.model": {},
178
+ "tokenizer.ggml.pre": {},
179
+ "tokenizer.ggml.tokens": {},
180
+ "tokenizer.ggml.token_type": {},
181
+ "tokenizer.ggml.merges": {},
182
+ "tokenizer.ggml.bos_token_id": {},
183
+ "tokenizer.ggml.eos_token_id": {},
184
+ "tokenizer.ggml.unknown_token_id": {},
185
+ "tokenizer.ggml.padding_token_id": {},
186
+ "tokenizer.ggml.add_bos_token": {},
187
+ "tokenizer.ggml.add_eos_token": {},
188
+ "tokenizer.ggml.add_space_prefix": {},
189
+ "tokenizer.chat_template": {},
190
+ "quantize.imatrix.file": {},
191
+ "quantize.imatrix.dataset": {},
192
+ "quantize.imatrix.entries_count": {},
193
+ "quantize.imatrix.chunks_count": {},
194
+ "general.quantization_version": {},
195
+ 'n_lora_q': {},
196
+ 'n_lora_kv': {},
197
+ 'n_expert_shared': {},
198
+ 'n_ff_exp': {},
199
+ "n_layer_dense_lead": {},
200
+ "expert_weights_scale": {},
201
+ "rope_yarn_log_mul": {},
202
+ 'model_type': {},
203
+ 'eval': {},
204
+ 'time': {},
205
+ 'token': {},
206
+ 'tokens': {},
207
+ 'pads': {},
208
+ 'model': {},
209
+ 'base': {},
210
+ 'model_base': {},
211
+ 'perhaps': {},
212
+ 'word': {},
213
+ 'words': {},
214
+ 'start': {},
215
+ 'stop': {},
216
+ 'run': {},
217
+ 'runs': {},
218
+ 'ms': {},
219
+ 'vocabulary': {},
220
+ 'timeout': {},
221
+ 'load': {},
222
+ 'load_time': {},
223
+ 'bas': {},
224
+ 'tok': {},
225
+ 'second': {},
226
+ 'seconds': {},
227
+ 'graph': {},
228
+ 'load_model': {},
229
+ 'end': {},
230
+ 'llama_perf_context_print': {},
231
+ 'llm_load_print_meta': {}
232
  }
233
 
234
  model_configs = [
235
  {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"},
236
  {"repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF", "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf", "name": "Llama-3.2-3B-Instruct"},
237
  {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta-Llama-3.1-70B"},
238
+ {"repo_id": "Hhhbvvkgh/Heidi-Llama-v4-Q2_K-GGUF", "filename": "heidi-llama-v4-q2_k.gguf", "name": "Heidi-Llama-V4"}
 
 
 
 
 
 
239
  ]
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  def normalize_input(input_text):
242
+ stop_words = set(stopwords.words('english'))
243
+ words = input_text.split()
244
+ filtered_words = [word for word in words if word.lower() not in stop_words]
245
+ return " ".join(filtered_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  async def generate_model_response(model, inputs):
248
  try:
249
+ response = await model.generate(inputs)
250
+ return response
251
  except Exception as e:
252
  return ""
253