CoffeBank committed on
Commit
e47a060
·
1 Parent(s): c9f2bdf
binoculars/detector.py CHANGED
@@ -20,8 +20,11 @@ huggingface_config = {
20
  BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843 # optimized for f1-score
21
  BINOCULARS_FPR_THRESHOLD = 0.8536432310785527 # optimized for low-fpr [chosen at 0.01%]
22
 
23
- DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
24
- DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1
 
 
 
25
 
26
 
27
  class Binoculars(object):
@@ -35,20 +38,36 @@ class Binoculars(object):
35
  assert_tokenizer_consistency(observer_name_or_path, performer_name_or_path)
36
 
37
  self.change_mode(mode)
 
 
 
 
 
 
38
  self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
39
- device_map={"": DEVICE_1},
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  trust_remote_code=True,
41
  torch_dtype=torch.bfloat16 if use_bfloat16
42
  else torch.float32,
43
  token=huggingface_config["TOKEN"]
44
  )
45
- self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
46
- device_map={"": DEVICE_2},
47
- trust_remote_code=True,
48
- torch_dtype=torch.bfloat16 if use_bfloat16
49
- else torch.float32,
50
- token=huggingface_config["TOKEN"]
51
- )
52
  self.observer_model.eval()
53
  self.performer_model.eval()
54
 
@@ -66,8 +85,13 @@ class Binoculars(object):
66
  raise ValueError(f"Invalid mode: {mode}")
67
 
68
  def free_memory(self) -> None:
69
- self.observer_model = self.observer_model.to('cpu')
70
- self.performer_model = self.performer_model.to('cpu')
 
 
 
 
 
71
 
72
  if torch.cuda.is_available():
73
  torch.cuda.empty_cache()
@@ -91,6 +115,7 @@ class Binoculars(object):
91
 
92
  @torch.inference_mode()
93
  def _get_logits(self, encodings: transformers.BatchEncoding) -> torch.Tensor:
 
94
  observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
95
  performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
96
  if DEVICE_1 != "cpu":
@@ -102,8 +127,9 @@ class Binoculars(object):
102
  encodings = self._tokenize(batch)
103
  observer_logits, performer_logits = self._get_logits(encodings)
104
  ppl = perplexity(encodings, performer_logits)
105
- x_ppl = entropy(observer_logits.to(DEVICE_1), performer_logits.to(DEVICE_1),
106
- encodings.to(DEVICE_1), self.tokenizer.pad_token_id)
 
107
  binoculars_scores = ppl / x_ppl
108
  binoculars_scores = binoculars_scores.tolist()
109
  return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores
 
20
  BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843 # optimized for f1-score
21
  BINOCULARS_FPR_THRESHOLD = 0.8536432310785527 # optimized for low-fpr [chosen at 0.01%]
22
 
23
+ # More efficient device handling for Spaces (likely single GPU)
24
+ DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
25
+ # Use same device for both models in single-GPU environment
26
+ DEVICE_1 = DEVICE
27
+ DEVICE_2 = DEVICE
28
 
29
 
30
  class Binoculars(object):
 
38
  assert_tokenizer_consistency(observer_name_or_path, performer_name_or_path)
39
 
40
  self.change_mode(mode)
41
+
42
+ # Log memory usage before loading models
43
+ if torch.cuda.is_available():
44
+ print(f"Before loading observer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
45
+
46
+ # Load first model
47
  self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
48
+ device_map={"": DEVICE_1},
49
+ trust_remote_code=True,
50
+ torch_dtype=torch.bfloat16 if use_bfloat16
51
+ else torch.float32,
52
+ token=huggingface_config["TOKEN"]
53
+ )
54
+ # Clear cache between model loads
55
+ if torch.cuda.is_available():
56
+ torch.cuda.empty_cache()
57
+ print(f"After loading observer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
58
+
59
+ # Load second model
60
+ self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
61
+ device_map={"": DEVICE_2},
62
  trust_remote_code=True,
63
  torch_dtype=torch.bfloat16 if use_bfloat16
64
  else torch.float32,
65
  token=huggingface_config["TOKEN"]
66
  )
67
+
68
+ if torch.cuda.is_available():
69
+ print(f"After loading performer model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
70
+
 
 
 
71
  self.observer_model.eval()
72
  self.performer_model.eval()
73
 
 
85
  raise ValueError(f"Invalid mode: {mode}")
86
 
87
  def free_memory(self) -> None:
88
+ """Explicitly free GPU memory by moving models to CPU and deleting them"""
89
+ print("Freeing model memory...")
90
+ if hasattr(self, 'observer_model') and self.observer_model is not None:
91
+ self.observer_model = self.observer_model.to('cpu')
92
+
93
+ if hasattr(self, 'performer_model') and self.performer_model is not None:
94
+ self.performer_model = self.performer_model.to('cpu')
95
 
96
  if torch.cuda.is_available():
97
  torch.cuda.empty_cache()
 
115
 
116
  @torch.inference_mode()
117
  def _get_logits(self, encodings: transformers.BatchEncoding) -> torch.Tensor:
118
+ # Ensure we're using the same device for both models
119
  observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
120
  performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
121
  if DEVICE_1 != "cpu":
 
127
  encodings = self._tokenize(batch)
128
  observer_logits, performer_logits = self._get_logits(encodings)
129
  ppl = perplexity(encodings, performer_logits)
130
+ # No need to move tensors again if they're already on the same device
131
+ x_ppl = entropy(observer_logits, performer_logits,
132
+ encodings, self.tokenizer.pad_token_id)
133
  binoculars_scores = ppl / x_ppl
134
  binoculars_scores = binoculars_scores.tolist()
135
  return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores
binoculars_utils.py CHANGED
@@ -1,43 +1,105 @@
1
  from binoculars import Binoculars
 
 
2
 
3
- def initialize_binoculars():
4
- chat_model_pair = {
5
- "observer": "deepseek-ai/deepseek-llm-7b-base",
6
- "performer": "deepseek-ai/deepseek-llm-7b-chat"
7
- }
8
 
9
- coder_model_pair = {
10
- "observer": "deepseek-ai/deepseek-llm-7b-base",
11
- "performer": "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
12
- }
 
 
 
13
 
14
- print("Initializing Binoculars models...")
 
 
15
 
16
  bino_chat = Binoculars(
17
  mode="accuracy",
18
- observer_name_or_path=chat_model_pair["observer"],
19
- performer_name_or_path=chat_model_pair["performer"],
20
  max_token_observed=2048
21
  )
 
 
 
 
 
22
 
 
 
 
 
 
 
 
23
  bino_coder = Binoculars(
24
  mode="accuracy",
25
- observer_name_or_path=coder_model_pair["observer"],
26
- performer_name_or_path=coder_model_pair["performer"],
27
  max_token_observed=2048
28
  )
29
 
30
- return bino_chat, bino_coder
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- def compute_scores(text, bino_chat=None, bino_coder=None):
 
 
 
 
 
 
 
 
 
 
33
  scores = {}
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  if bino_chat:
36
- #print("Computing score_chat...")
37
- scores['score_chat'] = bino_chat.compute_score(text)
38
 
39
  if bino_coder:
40
- #print("Computing score_coder...")
41
- scores['score_coder'] = bino_coder.compute_score(text)
42
-
43
- return scores
 
1
  from binoculars import Binoculars
2
+ import torch
3
+ import gc
4
 
5
+ CHAT_MODEL_PAIR = {
6
+ "observer": "deepseek-ai/deepseek-llm-7b-base",
7
+ "performer": "deepseek-ai/deepseek-llm-7b-chat"
8
+ }
 
9
 
10
+ CODER_MODEL_PAIR = {
11
+ "observer": "deepseek-ai/deepseek-llm-7b-base",
12
+ "performer": "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
13
+ }
14
+
15
+ def initialize_chat_model():
16
+ print("Initializing chat Binoculars model...")
17
 
18
+ if torch.cuda.is_available():
19
+ torch.cuda.empty_cache()
20
+ print(f"GPU Memory before chat model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
21
 
22
  bino_chat = Binoculars(
23
  mode="accuracy",
24
+ observer_name_or_path=CHAT_MODEL_PAIR["observer"],
25
+ performer_name_or_path=CHAT_MODEL_PAIR["performer"],
26
  max_token_observed=2048
27
  )
28
+
29
+ if torch.cuda.is_available():
30
+ print(f"GPU Memory after chat model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
31
+
32
+ return bino_chat
33
 
34
+ def initialize_coder_model():
35
+ print("Initializing coder Binoculars model...")
36
+
37
+ if torch.cuda.is_available():
38
+ torch.cuda.empty_cache()
39
+ print(f"GPU Memory before coder model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
40
+
41
  bino_coder = Binoculars(
42
  mode="accuracy",
43
+ observer_name_or_path=CODER_MODEL_PAIR["observer"],
44
+ performer_name_or_path=CODER_MODEL_PAIR["performer"],
45
  max_token_observed=2048
46
  )
47
 
48
+ if torch.cuda.is_available():
49
+ print(f"GPU Memory after coder model: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
50
+
51
+ return bino_coder
52
+
53
+ def compute_chat_score(text):
54
+ print("Computing chat score...")
55
+ bino_chat = initialize_chat_model()
56
+
57
+ try:
58
+ score_chat = bino_chat.compute_score(text)
59
+ return {"score_chat": score_chat}
60
+ finally:
61
+ cleanup_model(bino_chat)
62
 
63
+ def compute_coder_score(text):
64
+ print("Computing coder score...")
65
+ bino_coder = initialize_coder_model()
66
+
67
+ try:
68
+ score_coder = bino_coder.compute_score(text)
69
+ return {"score_coder": score_coder}
70
+ finally:
71
+ cleanup_model(bino_coder)
72
+
73
+ def compute_scores(text, use_chat=True, use_coder=True):
74
  scores = {}
75
 
76
+ if use_chat:
77
+ chat_scores = compute_chat_score(text)
78
+ scores.update(chat_scores)
79
+
80
+ if use_coder:
81
+ coder_scores = compute_coder_score(text)
82
+ scores.update(coder_scores)
83
+
84
+ return scores
85
+
86
+ def cleanup_model(model):
87
+ if model:
88
+ try:
89
+ print(f"Cleaning up model resources...")
90
+ model.free_memory()
91
+
92
+ gc.collect()
93
+ if torch.cuda.is_available():
94
+ torch.cuda.empty_cache()
95
+ torch.cuda.synchronize()
96
+ print(f"After cleanup: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
97
+ except Exception as e:
98
+ print(f"Error during model cleanup: {str(e)}")
99
+
100
+ def cleanup_models(bino_chat, bino_coder):
101
  if bino_chat:
102
+ cleanup_model(bino_chat)
 
103
 
104
  if bino_coder:
105
+ cleanup_model(bino_coder)
 
 
 
demo/binary_classifier_demo.py CHANGED
@@ -4,9 +4,10 @@ import gradio as gr
4
  import torch
5
  import os
6
  import spaces
 
7
 
8
  from model_utils import load_model, classify_text
9
- from binoculars_utils import initialize_binoculars, compute_scores
10
 
11
  MINIMUM_TOKENS = 200
12
 
@@ -46,42 +47,48 @@ css = """
46
 
47
  @spaces.GPU
48
  def run_binary_classifier(text, show_analysis=False):
 
 
 
 
 
 
 
 
49
  if len(text.strip()) < MINIMUM_TOKENS:
50
  return gr.Markdown(f"Текст слишком короткий. Требуется минимум {MINIMUM_TOKENS} символов."), None, None
51
 
52
- # Initialize Binoculars models
53
- bino_chat, bino_coder = initialize_binoculars()
54
-
55
- # Load binary classifier model
56
- model, scaler, label_encoder, imputer = load_model()
57
 
58
- # Compute scores using binoculars
59
- scores = compute_scores(text, bino_chat, bino_coder)
60
-
61
- # Run classification
62
- result = classify_text(text, model, scaler, label_encoder, imputer=imputer, scores=scores)
63
-
64
- # Format results
65
- predicted_class = result['predicted_class']
66
- probabilities = result['probabilities']
67
-
68
- # Format probabilities
69
- prob_str = ""
70
- for cls, prob in probabilities.items():
71
- prob_str += f"- {cls}: {prob:.4f}\n"
72
-
73
- # Format scores
74
- scores_str = ""
75
- if scores:
76
- scores_str = "### Binoculars Scores\n"
77
- if 'score_chat' in scores:
78
- scores_str += f"- Score Chat: {scores['score_chat']:.4f}\n"
79
- if 'score_coder' in scores:
80
- scores_str += f"- Score Coder: {scores['score_coder']:.4f}\n"
81
-
82
- # Result markdown
83
- class_style = "human-text" if predicted_class == "Human" else "ai-text"
84
- result_md = f"""
85
  ## Результат классификации
86
 
87
  Предсказанный класс: <span class="{class_style}">{predicted_class}</span>
@@ -91,199 +98,217 @@ def run_binary_classifier(text, show_analysis=False):
91
 
92
  {scores_str}
93
  """
94
-
95
- # Analysis markdown
96
- analysis_md = None
97
- if show_analysis:
98
- features = result['features']
99
- text_analysis = result['text_analysis']
100
-
101
- basic_stats_dict = {
102
- 'total_tokens': 'Количество токенов',
103
- 'total_words': 'Количество слов',
104
- 'unique_words': 'Количество уникальных слов',
105
- 'stop_words': 'Количество стоп-слов',
106
- 'avg_word_length': 'Средняя длина слова (символов)'
107
- }
108
-
109
- morph_dict = {
110
- 'pos_distribution': 'Распределение частей речи',
111
- 'unique_lemmas': 'Количество уникальных лемм',
112
- 'lemma_word_ratio': 'Отношение лемм к словам'
113
- }
114
-
115
- synt_dict = {
116
- 'dependencies': 'Зависимости между словами',
117
- 'noun_chunks': 'Количество именных групп'
118
- }
119
 
120
- entities_dict = {
121
- 'total_entities': 'Общее количество именованных сущностей',
122
- 'entity_types': 'Типы именованных сущностей'
123
- }
124
-
125
- diversity_dict = {
126
- 'ttr': 'TTR (отношение типов к токенам)',
127
- 'mtld': 'MTLD (мера лексического разнообразия)'
128
- }
129
-
130
- structure_dict = {
131
- 'sentence_count': 'Количество предложений',
132
- 'avg_sentence_length': 'Средняя длина предложения (токенов)',
133
- 'question_sentences': 'Количество вопросительных предложений',
134
- 'exclamation_sentences': 'Количество восклицательных предложений'
135
- }
136
-
137
- readability_dict = {
138
- 'words_per_sentence': 'Слов на предложение',
139
- 'syllables_per_word': 'Слогов на слово',
140
- 'flesh_kincaid_score': 'Индекс читабельности Флеша-Кинкейда',
141
- 'long_words_percent': 'Процент длинных слов'
142
- }
143
-
144
- semantic_dict = {
145
- 'avg_coherence_score': 'Средняя связность между предложениями'
146
- }
147
-
148
- analysis_md = "## Анализ текста\n\n"
149
-
150
- # Basic statistics
151
- analysis_md += "### Основная статистика\n"
152
- for key, value in text_analysis.get('basic_stats', {}).items():
153
- label = basic_stats_dict.get(key, key)
154
- if isinstance(value, float):
155
- analysis_md += f"- {label}: {value:.2f}\n"
156
- else:
157
- analysis_md += f"- {label}: {value}\n"
158
- analysis_md += "\n"
159
-
160
- # Morphological analysis
161
- analysis_md += "### Морфологический анализ\n"
162
- morph_analysis = text_analysis.get('morphological_analysis', {})
163
- for key, value in morph_analysis.items():
164
- label = morph_dict.get(key, key)
165
- if key == 'pos_distribution':
166
- analysis_md += f"- {label}:\n"
167
- for pos, count in value.items():
168
- pos_name = pos
169
- if pos == 'NOUN': pos_name = 'Существительные'
170
- elif pos == 'VERB': pos_name = 'Глаголы'
171
- elif pos == 'ADJ': pos_name = 'Прилагательные'
172
- elif pos == 'ADV': pos_name = 'Наречия'
173
- elif pos == 'PROPN': pos_name = 'Имена собственные'
174
- elif pos == 'DET': pos_name = 'Определители'
175
- elif pos == 'ADP': pos_name = 'Предлоги'
176
- elif pos == 'PRON': pos_name = 'Местоимения'
177
- elif pos == 'CCONJ': pos_name = 'Сочинительные союзы'
178
- elif pos == 'SCONJ': pos_name = 'Подчинительные союзы'
179
- elif pos == 'NUM': pos_name = 'Числительные'
180
- elif pos == 'PART': pos_name = 'Частицы'
181
- elif pos == 'PUNCT': pos_name = 'Знаки препинания'
182
- elif pos == 'AUX': pos_name = 'Вспомогательные глаголы'
183
- elif pos == 'SYM': pos_name = 'Символы'
184
- elif pos == 'INTJ': pos_name = 'Междометия'
185
- elif pos == 'X': pos_name = 'Другое (X)'
186
- analysis_md += f" - {pos_name}: {count}\n"
187
- elif isinstance(value, float):
188
- analysis_md += f"- {label}: {value:.3f}\n"
189
- else:
190
- analysis_md += f"- {label}: {value}\n"
191
- analysis_md += "\n"
192
-
193
- # Syntactic analysis
194
- analysis_md += "### Синтаксический анализ\n"
195
- synt_analysis = text_analysis.get('syntactic_analysis', {})
196
- for key, value in synt_analysis.items():
197
- label = synt_dict.get(key, key)
198
- if key == 'dependencies':
199
- analysis_md += f"- {label}:\n"
200
- for dep, count in value.items():
201
- dep_name = dep
202
- if dep == 'nsubj': dep_name = 'Подлежащие'
203
- elif dep == 'obj': dep_name = 'Дополнения'
204
- elif dep == 'amod': dep_name = 'Определения'
205
- elif dep == 'nmod': dep_name = 'Именные модификаторы'
206
- elif dep == 'ROOT': dep_name = 'Корневые узлы'
207
- elif dep == 'punct': dep_name = 'Пунктуация'
208
- elif dep == 'case': dep_name = 'Падежные маркеры'
209
- elif dep == 'dep': dep_name = 'Общие зависимости'
210
- elif dep == 'appos': dep_name = 'Приложения'
211
- elif dep == 'flat:foreign': dep_name = 'Иностранные выражения'
212
- elif dep == 'conj': dep_name = 'Сочинительные конструкции'
213
- elif dep == 'obl': dep_name = 'Косвенные дополнения'
214
- analysis_md += f" - {dep_name}: {count}\n"
215
- elif key == 'noun_chunks':
216
- if isinstance(value, bool):
217
- analysis_md += f"- {label}: {0 if value is False else value}\n"
218
  else:
219
  analysis_md += f"- {label}: {value}\n"
220
- elif isinstance(value, float):
221
- analysis_md += f"- {label}: {value:.3f}\n"
222
- else:
223
- analysis_md += f"- {label}: {value}\n"
224
- analysis_md += "\n"
225
-
226
- # Named entities
227
- analysis_md += "### Именованные сущности\n"
228
- entities = text_analysis.get('named_entities', {})
229
- for key, value in entities.items():
230
- label = entities_dict.get(key, key)
231
- if key == 'entity_types':
232
- analysis_md += f"- {label}:\n"
233
- for ent, count in value.items():
234
- ent_name = ent
235
- if ent == 'PER': ent_name = 'Люди'
236
- elif ent == 'LOC': ent_name = 'Локации'
237
- elif ent == 'ORG': ent_name = 'Организации'
238
- analysis_md += f" - {ent_name}: {count}\n"
239
- elif isinstance(value, float):
240
- analysis_md += f"- {label}: {value:.3f}\n"
241
- else:
242
- analysis_md += f"- {label}: {value}\n"
243
- analysis_md += "\n"
244
-
245
- # Lexical diversity
246
- analysis_md += "### Лексическое разнообразие\n"
247
- for key, value in text_analysis.get('lexical_diversity', {}).items():
248
- label = diversity_dict.get(key, key)
249
- if isinstance(value, float):
250
- analysis_md += f"- {label}: {value:.3f}\n"
251
- else:
252
- analysis_md += f"- {label}: {value}\n"
253
- analysis_md += "\n"
254
-
255
- # Text structure
256
- analysis_md += "### Структура текста\n"
257
- for key, value in text_analysis.get('text_structure', {}).items():
258
- label = structure_dict.get(key, key)
259
- if isinstance(value, float):
260
- analysis_md += f"- {label}: {value:.2f}\n"
261
- else:
262
- analysis_md += f"- {label}: {value}\n"
263
- analysis_md += "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- # Readability
266
- analysis_md += "### Читабельность\n"
267
- for key, value in text_analysis.get('readability', {}).items():
268
- label = readability_dict.get(key, key)
269
- if isinstance(value, float):
270
- analysis_md += f"- {label}: {value:.2f}\n"
271
- else:
272
- analysis_md += f"- {label}: {value}\n"
273
- analysis_md += "\n"
274
 
275
- # Semantic coherence
276
- analysis_md += "### Семантическая связность\n"
277
- for key, value in text_analysis.get('semantic_coherence', {}).items():
278
- label = semantic_dict.get(key, key)
279
- if isinstance(value, float):
280
- analysis_md += f"- {label}: {value:.3f}\n"
281
- else:
282
- analysis_md += f"- {label}: {value}\n"
283
-
284
- return gr.Markdown(result_md), gr.Markdown(analysis_md) if analysis_md else None, text
285
 
286
  def reset_outputs():
 
 
 
 
287
  return None, None, ""
288
 
289
  with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
 
4
  import torch
5
  import os
6
  import spaces
7
+ import gc
8
 
9
  from model_utils import load_model, classify_text
10
+ from binoculars_utils import compute_scores, cleanup_model, cleanup_models
11
 
12
  MINIMUM_TOKENS = 200
13
 
 
47
 
48
  @spaces.GPU
49
  def run_binary_classifier(text, show_analysis=False):
50
+ # Check GPU status at the beginning
51
+ if torch.cuda.is_available():
52
+ print(f"Starting classification with GPU: {torch.cuda.get_device_name(0)}")
53
+ print(f"Initial GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
54
+ torch.cuda.empty_cache()
55
+ else:
56
+ print("No GPU available, running on CPU")
57
+
58
  if len(text.strip()) < MINIMUM_TOKENS:
59
  return gr.Markdown(f"Текст слишком короткий. Требуется минимум {MINIMUM_TOKENS} символов."), None, None
60
 
61
+ try:
62
+ # Load binary classifier model
63
+ model, scaler, label_encoder, imputer = load_model()
 
 
64
 
65
+ # Compute scores последовательно
66
+ scores = compute_scores(text, use_chat=True, use_coder=True)
67
+
68
+ # Run classification
69
+ result = classify_text(text, model, scaler, label_encoder, imputer=imputer, scores=scores)
70
+
71
+ # Format results
72
+ predicted_class = result['predicted_class']
73
+ probabilities = result['probabilities']
74
+
75
+ # Format probabilities
76
+ prob_str = ""
77
+ for cls, prob in probabilities.items():
78
+ prob_str += f"- {cls}: {prob:.4f}\n"
79
+
80
+ # Format scores
81
+ scores_str = ""
82
+ if scores:
83
+ scores_str = "### Binoculars Scores\n"
84
+ if 'score_chat' in scores:
85
+ scores_str += f"- Score Chat: {scores['score_chat']:.4f}\n"
86
+ if 'score_coder' in scores:
87
+ scores_str += f"- Score Coder: {scores['score_coder']:.4f}\n"
88
+
89
+ # Result markdown
90
+ class_style = "human-text" if predicted_class == "Human" else "ai-text"
91
+ result_md = f"""
92
  ## Результат классификации
93
 
94
  Предсказанный класс: <span class="{class_style}">{predicted_class}</span>
 
98
 
99
  {scores_str}
100
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ # Analysis markdown
103
+ analysis_md = None
104
+ if show_analysis:
105
+ features = result['features']
106
+ text_analysis = result['text_analysis']
107
+
108
+ basic_stats_dict = {
109
+ 'total_tokens': 'Количество токенов',
110
+ 'total_words': 'Количество слов',
111
+ 'unique_words': 'Количество уникальных слов',
112
+ 'stop_words': 'Количество стоп-слов',
113
+ 'avg_word_length': 'Средняя длина слова (символов)'
114
+ }
115
+
116
+ morph_dict = {
117
+ 'pos_distribution': 'Распределение частей речи',
118
+ 'unique_lemmas': 'Количество уникальных лемм',
119
+ 'lemma_word_ratio': 'Отношение лемм к словам'
120
+ }
121
+
122
+ synt_dict = {
123
+ 'dependencies': 'Зависимости между словами',
124
+ 'noun_chunks': 'Количество именных групп'
125
+ }
126
+
127
+ entities_dict = {
128
+ 'total_entities': 'Общее количество именованных сущностей',
129
+ 'entity_types': 'Типы именованных сущностей'
130
+ }
131
+
132
+ diversity_dict = {
133
+ 'ttr': 'TTR (отношение типов к токенам)',
134
+ 'mtld': 'MTLD (мера лексического разнообразия)'
135
+ }
136
+
137
+ structure_dict = {
138
+ 'sentence_count': 'Количество предложений',
139
+ 'avg_sentence_length': 'Средняя длина предложения (токенов)',
140
+ 'question_sentences': 'Количество вопросительных предложений',
141
+ 'exclamation_sentences': 'Количество восклицательных предложений'
142
+ }
143
+
144
+ readability_dict = {
145
+ 'words_per_sentence': 'Слов на предложение',
146
+ 'syllables_per_word': 'Слогов на слово',
147
+ 'flesh_kincaid_score': 'Индекс читабельности Флеша-Кинкейда',
148
+ 'long_words_percent': 'Процент длинных слов'
149
+ }
150
+
151
+ semantic_dict = {
152
+ 'avg_coherence_score': 'Средняя связность между предложениями'
153
+ }
154
+
155
+ analysis_md = "## Анализ текста\n\n"
156
+
157
+ # Basic statistics
158
+ analysis_md += "### Основная статистика\n"
159
+ for key, value in text_analysis.get('basic_stats', {}).items():
160
+ label = basic_stats_dict.get(key, key)
161
+ if isinstance(value, float):
162
+ analysis_md += f"- {label}: {value:.2f}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  else:
164
  analysis_md += f"- {label}: {value}\n"
165
+ analysis_md += "\n"
166
+
167
+ # Morphological analysis
168
+ analysis_md += "### Морфологический анализ\n"
169
+ morph_analysis = text_analysis.get('morphological_analysis', {})
170
+ for key, value in morph_analysis.items():
171
+ label = morph_dict.get(key, key)
172
+ if key == 'pos_distribution':
173
+ analysis_md += f"- {label}:\n"
174
+ for pos, count in value.items():
175
+ pos_name = pos
176
+ if pos == 'NOUN': pos_name = 'Существительные'
177
+ elif pos == 'VERB': pos_name = 'Глаголы'
178
+ elif pos == 'ADJ': pos_name = 'Прилагательные'
179
+ elif pos == 'ADV': pos_name = 'Наречия'
180
+ elif pos == 'PROPN': pos_name = 'Имена собственные'
181
+ elif pos == 'DET': pos_name = 'Определители'
182
+ elif pos == 'ADP': pos_name = 'Предлоги'
183
+ elif pos == 'PRON': pos_name = 'Местоимения'
184
+ elif pos == 'CCONJ': pos_name = 'Сочинительные союзы'
185
+ elif pos == 'SCONJ': pos_name = 'Подчинительные союзы'
186
+ elif pos == 'NUM': pos_name = 'Числительные'
187
+ elif pos == 'PART': pos_name = 'Частицы'
188
+ elif pos == 'PUNCT': pos_name = 'Знаки препинания'
189
+ elif pos == 'AUX': pos_name = 'Вспомогательные глаголы'
190
+ elif pos == 'SYM': pos_name = 'Символы'
191
+ elif pos == 'INTJ': pos_name = 'Междометия'
192
+ elif pos == 'X': pos_name = 'Другое (X)'
193
+ analysis_md += f" - {pos_name}: {count}\n"
194
+ elif isinstance(value, float):
195
+ analysis_md += f"- {label}: {value:.3f}\n"
196
+ else:
197
+ analysis_md += f"- {label}: {value}\n"
198
+ analysis_md += "\n"
199
+
200
+ # Syntactic analysis
201
+ analysis_md += "### Синтаксический анализ\n"
202
+ synt_analysis = text_analysis.get('syntactic_analysis', {})
203
+ for key, value in synt_analysis.items():
204
+ label = synt_dict.get(key, key)
205
+ if key == 'dependencies':
206
+ analysis_md += f"- {label}:\n"
207
+ for dep, count in value.items():
208
+ dep_name = dep
209
+ if dep == 'nsubj': dep_name = 'Подлежащие'
210
+ elif dep == 'obj': dep_name = 'Дополнения'
211
+ elif dep == 'amod': dep_name = 'Определения'
212
+ elif dep == 'nmod': dep_name = 'Именные модификаторы'
213
+ elif dep == 'ROOT': dep_name = 'Корневые узлы'
214
+ elif dep == 'punct': dep_name = 'Пунктуация'
215
+ elif dep == 'case': dep_name = 'Падежные маркеры'
216
+ elif dep == 'dep': dep_name = 'Общие зависимости'
217
+ elif dep == 'appos': dep_name = 'Приложения'
218
+ elif dep == 'flat:foreign': dep_name = 'Иностранные выражения'
219
+ elif dep == 'conj': dep_name = 'Сочинительные конструкции'
220
+ elif dep == 'obl': dep_name = 'Косвенные дополнения'
221
+ analysis_md += f" - {dep_name}: {count}\n"
222
+ elif key == 'noun_chunks':
223
+ if isinstance(value, bool):
224
+ analysis_md += f"- {label}: {0 if value is False else value}\n"
225
+ else:
226
+ analysis_md += f"- {label}: {value}\n"
227
+ elif isinstance(value, float):
228
+ analysis_md += f"- {label}: {value:.3f}\n"
229
+ else:
230
+ analysis_md += f"- {label}: {value}\n"
231
+ analysis_md += "\n"
232
+
233
+ # Named entities
234
+ analysis_md += "### Именованные сущности\n"
235
+ entities = text_analysis.get('named_entities', {})
236
+ for key, value in entities.items():
237
+ label = entities_dict.get(key, key)
238
+ if key == 'entity_types':
239
+ analysis_md += f"- {label}:\n"
240
+ for ent, count in value.items():
241
+ ent_name = ent
242
+ if ent == 'PER': ent_name = 'Люди'
243
+ elif ent == 'LOC': ent_name = 'Локации'
244
+ elif ent == 'ORG': ent_name = 'Организации'
245
+ analysis_md += f" - {ent_name}: {count}\n"
246
+ elif isinstance(value, float):
247
+ analysis_md += f"- {label}: {value:.3f}\n"
248
+ else:
249
+ analysis_md += f"- {label}: {value}\n"
250
+ analysis_md += "\n"
251
+
252
+ # Lexical diversity
253
+ analysis_md += "### Лексическое разнообразие\n"
254
+ for key, value in text_analysis.get('lexical_diversity', {}).items():
255
+ label = diversity_dict.get(key, key)
256
+ if isinstance(value, float):
257
+ analysis_md += f"- {label}: {value:.3f}\n"
258
+ else:
259
+ analysis_md += f"- {label}: {value}\n"
260
+ analysis_md += "\n"
261
+
262
+ # Text structure
263
+ analysis_md += "### Структура текста\n"
264
+ for key, value in text_analysis.get('text_structure', {}).items():
265
+ label = structure_dict.get(key, key)
266
+ if isinstance(value, float):
267
+ analysis_md += f"- {label}: {value:.2f}\n"
268
+ else:
269
+ analysis_md += f"- {label}: {value}\n"
270
+ analysis_md += "\n"
271
+
272
+ # Readability
273
+ analysis_md += "### Читабельность\n"
274
+ for key, value in text_analysis.get('readability', {}).items():
275
+ label = readability_dict.get(key, key)
276
+ if isinstance(value, float):
277
+ analysis_md += f"- {label}: {value:.2f}\n"
278
+ else:
279
+ analysis_md += f"- {label}: {value}\n"
280
+ analysis_md += "\n"
281
+
282
+ # Semantic coherence
283
+ analysis_md += "### Семантическая связность\n"
284
+ for key, value in text_analysis.get('semantic_coherence', {}).items():
285
+ label = semantic_dict.get(key, key)
286
+ if isinstance(value, float):
287
+ analysis_md += f"- {label}: {value:.3f}\n"
288
+ else:
289
+ analysis_md += f"- {label}: {value}\n"
290
+
291
+ # Return results
292
+ result_output = gr.Markdown(result_md)
293
+ analysis_output = gr.Markdown(analysis_md) if analysis_md else None
294
 
295
+ # Report final GPU memory status
296
+ if torch.cuda.is_available():
297
+ print(f"Final GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
298
+
299
+ return result_output, analysis_output, text
 
 
 
 
300
 
301
+ except Exception as e:
302
+ # Выводим ошибку в случае проблем
303
+ error_msg = f"Ошибка при классификации: {str(e)}"
304
+ print(error_msg)
305
+ return gr.Markdown(error_msg), None, text
 
 
 
 
 
306
 
307
  def reset_outputs():
308
+ # Force memory cleanup when resetting
309
+ if torch.cuda.is_available():
310
+ torch.cuda.empty_cache()
311
+
312
  return None, None, ""
313
 
314
  with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app: