Spaces:

CoffeBank
/

RU_AI_Detector

Running on Zero

App Files Files Community

CoffeBank committed on Apr 4

Commit

8db7949

1 Parent(s): f9979ab

update

Browse files

Files changed (24) hide show

app.py +6 -7
binoculars/__init__.py +3 -0
binoculars/__pycache__/__init__.cpython-310.pyc +0 -0
binoculars/__pycache__/deepseek_detector.cpython-310.pyc +0 -0
binoculars/__pycache__/detector.cpython-310.pyc +0 -0
binoculars/__pycache__/llama_detector.cpython-310.pyc +0 -0
binoculars/__pycache__/metrics.cpython-310.pyc +0 -0
binoculars/__pycache__/utils.cpython-310.pyc +0 -0
binoculars/detector.py +117 -0
binoculars/metrics.py +57 -0
binoculars/utils.py +16 -0
binoculars_utils.py +43 -0
demo/binary_classifier_demo.py +191 -0
ex_text.txt +3 -0
feature_extraction.py +76 -0
main.py +66 -0
model_utils.py +96 -0
models/medium_binary_classifier/cv_results.json +71 -0
models/medium_binary_classifier/imputer.joblib +3 -0
models/medium_binary_classifier/label_encoder.joblib +3 -0
models/medium_binary_classifier/nn_model.pt +3 -0
models/medium_binary_classifier/scaler.joblib +3 -0
requirements.txt +10 -0
setup.py +15 -0

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+from demo.binary_classifier_demo import binary_app
+if __name__ == "__main__":
+    # Launch only the binary classifier demo
+    print("Starting Binary Classifier demo...")
+    binary_app.launch(show_api=False, debug=True, share=True)

binoculars/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .detector import Binoculars
2	+
3	+ __all__ = ["Binoculars"]

binoculars/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (202 Bytes). View file

binoculars/__pycache__/deepseek_detector.cpython-310.pyc ADDED Viewed

Binary file (3.83 kB). View file

binoculars/__pycache__/detector.cpython-310.pyc ADDED Viewed

Binary file (3.82 kB). View file

binoculars/__pycache__/llama_detector.cpython-310.pyc ADDED Viewed

Binary file (3.59 kB). View file

binoculars/__pycache__/metrics.cpython-310.pyc ADDED Viewed

Binary file (1.8 kB). View file

binoculars/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (688 Bytes). View file

binoculars/detector.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from typing import Union
+import os
+import numpy as np
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .utils import assert_tokenizer_consistency
+from .metrics import perplexity, entropy
+torch.set_grad_enabled(False)
+huggingface_config = {
+    # Only required for private models from Huggingface (e.g. LLaMA models)
+    "TOKEN": os.environ.get("HF_TOKEN", None)
+}
+# selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
+BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843  # optimized for f1-score
+BINOCULARS_FPR_THRESHOLD = 0.8536432310785527  # optimized for low-fpr [chosen at 0.01%]
+DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
+DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1
+class Binoculars(object):
+    def __init__(self,
+                 observer_name_or_path: str = "tiiuae/falcon-7b",
+                 performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
+                 use_bfloat16: bool = True,
+                 max_token_observed: int = 512,
+                 mode: str = "low-fpr",
+                 ) -> None:
+        assert_tokenizer_consistency(observer_name_or_path, performer_name_or_path)
+        self.change_mode(mode)
+        self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
+                                                                   device_map={"": DEVICE_1},
+                                                                   trust_remote_code=True,
+                                                                   torch_dtype=torch.bfloat16 if use_bfloat16
+                                                                   else torch.float32,
+                                                                   token=huggingface_config["TOKEN"]
+                                                                   )
+        self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
+                                                                    device_map={"": DEVICE_2},
+                                                                    trust_remote_code=True,
+                                                                    torch_dtype=torch.bfloat16 if use_bfloat16
+                                                                    else torch.float32,
+                                                                    token=huggingface_config["TOKEN"]
+                                                                    )
+        self.observer_model.eval()
+        self.performer_model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path)
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.max_token_observed = max_token_observed
+    def change_mode(self, mode: str) -> None:
+        if mode == "low-fpr":
+            self.threshold = BINOCULARS_FPR_THRESHOLD
+        elif mode == "accuracy":
+            self.threshold = BINOCULARS_ACCURACY_THRESHOLD
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+    def free_memory(self) -> None:
+        self.observer_model = self.observer_model.to('cpu')
+        self.performer_model = self.performer_model.to('cpu')
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+        del self.observer_model
+        del self.performer_model
+        self.observer_model = None
+        self.performer_model = None
+    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
+        batch_size = len(batch)
+        encodings = self.tokenizer(
+            batch,
+            return_tensors="pt",
+            padding="longest" if batch_size > 1 else False,
+            truncation=True,
+            max_length=self.max_token_observed,
+            return_token_type_ids=False).to(self.observer_model.device)
+        return encodings
+    @torch.inference_mode()
+    def _get_logits(self, encodings: transformers.BatchEncoding) -> torch.Tensor:
+        observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
+        performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
+        if DEVICE_1 != "cpu":
+            torch.cuda.synchronize()
+        return observer_logits, performer_logits
+    def compute_score(self, input_text: Union[list[str], str]) -> Union[float, list[float]]:
+        batch = [input_text] if isinstance(input_text, str) else input_text
+        encodings = self._tokenize(batch)
+        observer_logits, performer_logits = self._get_logits(encodings)
+        ppl = perplexity(encodings, performer_logits)
+        x_ppl = entropy(observer_logits.to(DEVICE_1), performer_logits.to(DEVICE_1),
+                        encodings.to(DEVICE_1), self.tokenizer.pad_token_id)
+        binoculars_scores = ppl / x_ppl
+        binoculars_scores = binoculars_scores.tolist()
+        return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores
+    def predict(self, input_text: Union[list[str], str]) -> Union[list[str], str]:
+        binoculars_scores = np.array(self.compute_score(input_text))
+        pred = np.where(binoculars_scores < self.threshold,
+                        "Most likely AI-generated",
+                        "Most likely human-generated"
+                        ).tolist()
+        return pred

binoculars/metrics.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import numpy as np
+import torch
+import transformers
+ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
+softmax_fn = torch.nn.Softmax(dim=-1)
+def perplexity(encoding: transformers.BatchEncoding,
+               logits: torch.Tensor,
+               median: bool = False,
+               temperature: float = 1.0):
+    shifted_logits = logits[..., :-1, :].contiguous() / temperature
+    shifted_labels = encoding.input_ids[..., 1:].contiguous()
+    shifted_attention_mask = encoding.attention_mask[..., 1:].contiguous()
+    if median:
+        ce_nan = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels).
+                  masked_fill(~shifted_attention_mask.bool(), float("nan")))
+        ppl = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
+    else:
+        ppl = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels) *
+               shifted_attention_mask).sum(1) / shifted_attention_mask.sum(1)
+        ppl = ppl.to("cpu").float().numpy()
+    return ppl
+def entropy(p_logits: torch.Tensor,
+            q_logits: torch.Tensor,
+            encoding: transformers.BatchEncoding,
+            pad_token_id: int,
+            median: bool = False,
+            sample_p: bool = False,
+            temperature: float = 1.0):
+    vocab_size = p_logits.shape[-1]
+    total_tokens_available = q_logits.shape[-2]
+    p_scores, q_scores = p_logits / temperature, q_logits / temperature
+    p_proba = softmax_fn(p_scores).view(-1, vocab_size)
+    if sample_p:
+        p_proba = torch.multinomial(p_proba.view(-1, vocab_size), replacement=True, num_samples=1).view(-1)
+    q_scores = q_scores.view(-1, vocab_size)
+    ce = ce_loss_fn(input=q_scores, target=p_proba).view(-1, total_tokens_available)
+    padding_mask = (encoding.input_ids != pad_token_id).type(torch.uint8)
+    if median:
+        ce_nan = ce.masked_fill(~padding_mask.bool(), float("nan"))
+        agg_ce = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
+    else:
+        agg_ce = (((ce * padding_mask).sum(1) / padding_mask.sum(1)).to("cpu").float().numpy())
+    return agg_ce

binoculars/utils.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import warnings
+from transformers import AutoTokenizer
+def assert_tokenizer_consistency(model_id_1, model_id_2):
+    identical_tokenizers = (
+            AutoTokenizer.from_pretrained(model_id_1).vocab
+            == AutoTokenizer.from_pretrained(model_id_2).vocab
+    )
+    if not identical_tokenizers:
+        warnings.warn(
+            f"Warning: Tokenizers for models '{model_id_1}' and '{model_id_2}' have different vocabularies. "
+            f"This may lead to inconsistent results when comparing these models. "
+            f"Consider using models with compatible tokenizers.",
+            UserWarning
+        )

binoculars_utils.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from binoculars import Binoculars
+def initialize_binoculars():
+    chat_model_pair = {
+        "observer": "deepseek-ai/deepseek-llm-7b-base",
+        "performer": "deepseek-ai/deepseek-llm-7b-chat"
+    }
+    coder_model_pair = {
+        "observer": "deepseek-ai/deepseek-llm-7b-base",
+        "performer": "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
+    }
+    print("Initializing Binoculars models...")
+    bino_chat = Binoculars(
+        mode="accuracy",
+        observer_name_or_path=chat_model_pair["observer"],
+        performer_name_or_path=chat_model_pair["performer"],
+        max_token_observed=2048
+    )
+    bino_coder = Binoculars(
+        mode="accuracy",
+        observer_name_or_path=coder_model_pair["observer"],
+        performer_name_or_path=coder_model_pair["performer"],
+        max_token_observed=2048
+    )
+    return bino_chat, bino_coder
+def compute_scores(text, bino_chat=None, bino_coder=None):
+    scores = {}
+    if bino_chat:
+        #print("Computing score_chat...")
+        scores['score_chat'] = bino_chat.compute_score(text)
+    if bino_coder:
+        #print("Computing score_coder...")
+        scores['score_coder'] = bino_coder.compute_score(text)
+    return scores

demo/binary_classifier_demo.py ADDED Viewed

	@@ -0,0 +1,191 @@

+__all__ = ["binary_app"]
+import gradio as gr
+import torch
+import os
+from model_utils import load_model, classify_text
+from binoculars_utils import initialize_binoculars, compute_scores
+# Initialize Binoculars models
+bino_chat, bino_coder = initialize_binoculars()
+# Load binary classifier model
+model, scaler, label_encoder, imputer = load_model()
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+MINIMUM_TOKENS = 50
+SAMPLE_TEXT = """Привет! Я хотел бы рассказать вам о своём опыте путешествия по Петербургу. Невероятный город с богатой историей и красивой архитектурой. Особенно запомнился Эрмитаж с его огромной коллекцией произведений искусства. Также понравилась прогулка по каналам города, где можно увидеть множество старинных мостов и зданий."""
+css = """
+.human-text {
+    color: black !important;
+    line-height: 1.9em;
+    padding: 0.5em;
+    background: #ccffcc;
+    border-radius: 0.5rem;
+    font-weight: bold;
+}
+.ai-text {
+    color: black !important;
+    line-height: 1.9em;
+    padding: 0.5em;
+    background: #ffad99;
+    border-radius: 0.5rem;
+    font-weight: bold;
+}
+.analysis-block {
+    background: #f5f5f5;
+    padding: 15px;
+    border-radius: 8px;
+    margin-top: 10px;
+}
+.scores {
+    font-size: 1.1em;
+    padding: 10px;
+    background: #e6f7ff;
+    border-radius: 5px;
+    margin: 10px 0;
+}
+"""
+def run_binary_classifier(text, show_analysis=False):
+    if len(text.strip()) < MINIMUM_TOKENS:
+        return gr.Markdown(f"Текст слишком короткий. Требуется минимум {MINIMUM_TOKENS} символов."), None, None
+    # Compute scores using binoculars
+    scores = compute_scores(text, bino_chat, bino_coder)
+    # Run classification
+    result = classify_text(text, model, scaler, label_encoder, imputer=imputer, scores=scores)
+    # Format results
+    predicted_class = result['predicted_class']
+    probabilities = result['probabilities']
+    # Format probabilities
+    prob_str = ""
+    for cls, prob in probabilities.items():
+        prob_str += f"- {cls}: {prob:.4f}\n"
+    # Format scores
+    scores_str = ""
+    if scores:
+        scores_str = "### Binoculars Scores\n"
+        if 'score_chat' in scores:
+            scores_str += f"- Score Chat: {scores['score_chat']:.4f}\n"
+        if 'score_coder' in scores:
+            scores_str += f"- Score Coder: {scores['score_coder']:.4f}\n"
+    # Result markdown
+    class_style = "human-text" if predicted_class == "Human" else "ai-text"
+    result_md = f"""
+## Результат классификации
+Предсказанный класс: <span class="{class_style}">{predicted_class}</span>
+### Вероятности классов:
+{prob_str}
+{scores_str}
+"""
+    # Analysis markdown
+    analysis_md = None
+    if show_analysis:
+        features = result['features']
+        text_analysis = result['text_analysis']
+        analysis_md = "## Анализ текста\n\n"
+        # Basic statistics
+        analysis_md += "### Основная статистика\n"
+        analysis_md += f"- Всего токенов: {text_analysis['basic_stats']['total_tokens']}\n"
+        analysis_md += f"- Всего слов: {text_analysis['basic_stats']['total_words']}\n"
+        analysis_md += f"- Уникальных слов: {text_analysis['basic_stats']['unique_words']}\n"
+        analysis_md += f"- Стоп-слов: {text_analysis['basic_stats']['stop_words']}\n"
+        analysis_md += f"- Средняя длина слова: {text_analysis['basic_stats']['avg_word_length']:.2f} символов\n\n"
+        # Lexical diversity
+        analysis_md += "### Лексическое разнообразие\n"
+        analysis_md += f"- TTR (Type-Token Ratio): {text_analysis['lexical_diversity']['ttr']:.3f}\n"
+        analysis_md += f"- MTLD (упрощенный): {text_analysis['lexical_diversity']['mtld']:.2f}\n\n"
+        # Text structure
+        analysis_md += "### Структура текста\n"
+        analysis_md += f"- Количество предложений: {text_analysis['text_structure']['sentence_count']}\n"
+        analysis_md += f"- Средняя длина предложения: {text_analysis['text_structure']['avg_sentence_length']:.2f} токенов\n\n"
+        # Readability
+        analysis_md += "### Читабельность\n"
+        analysis_md += f"- Flesch-Kincaid score: {text_analysis['readability']['flesh_kincaid_score']:.2f}\n"
+        analysis_md += f"- Процент дл��нных слов: {text_analysis['readability']['long_words_percent']:.2f}%\n\n"
+        # Semantic coherence
+        analysis_md += "### Семантическая связность\n"
+        analysis_md += f"- Средняя связность между предложениями: {text_analysis['semantic_coherence']['avg_coherence_score']:.3f}\n"
+    return gr.Markdown(result_md), gr.Markdown(analysis_md) if analysis_md else None, text
+def reset_outputs():
+    return None, None, ""
+with gr.Blocks(css=css, theme=gr.themes.Base()) as binary_app:
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.HTML("<h1>Binary Classifier: Human vs AI Text Detection</h1>")
+            gr.HTML("<p>This demo uses a neural network (Medium_Binary_Network) to classify text as either written by a human or generated by AI.</p>")
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(value=SAMPLE_TEXT, placeholder="Введите текст для анализа",
+                                   lines=10, label="Текст для анализа")
+            with gr.Row():
+                analysis_checkbox = gr.Checkbox(label="Показать детальный анализ текста", value=False)
+                submit_button = gr.Button("Классифицировать", variant="primary")
+                clear_button = gr.Button("Очистить")
+    with gr.Row():
+        with gr.Column():
+            result_output = gr.Markdown(label="Результат")
+    with gr.Row():
+        with gr.Column():
+            analysis_output = gr.Markdown(label="Анализ")
+    with gr.Accordion("О модели", open=False):
+        gr.Markdown("""
+        ### О бинарном классификаторе
+        Эта демонстрация использует нейронную сеть Medium_Binary_Network для классификации текста как написанного человеком или сгенерированного ИИ.
+        #### Архитектура модели:
+        - Входной слой: Количество признаков (зависит от анализа текста)
+        - Скрытые слои: [256, 192, 128, 64]
+        - Выходной слой: 2 класса (Human, AI)
+        - Dropout: 0.3
+        #### Особенности:
+        - Используется анализ текста и оценки качества текста с помощью Binoculars
+        - Анализируются морфологические, синтаксические и семантические особенности текста
+        - Вычисляются показатели лексического разнообразия и читабельности
+        #### Рекомендации:
+        - Для более точной классификации рекомендуется использовать тексты длиннее 100 слов
+        - Модель обучена на русскоязычных текстах
+        """)
+    # Set up event handlers
+    submit_button.click(
+        fn=run_binary_classifier,
+        inputs=[input_text, analysis_checkbox],
+        outputs=[result_output, analysis_output, input_text]
+    )
+    clear_button.click(
+        fn=reset_outputs,
+        inputs=[],
+        outputs=[result_output, analysis_output, input_text]
+    )

ex_text.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+**Национальный переходный совет Ливии обращается к Западу с просьбой ликвидировать Каддафи**\n\nБенгази, Ливия, 14 марта — Национальный переходный совет Ливии, расположенный в Бенгази, обратился к западным державам с настоятельной просьбой о принятии мер по устранению полковника Муаммара Каддафи, который в настоящее время контролирует западную часть страны. Представитель Совета Мустафа Гериани заявил, что делегация Совета официально передала запрос на проведение тактических ударов против диктатора и установление запретной зоны для полетов в понедельник, 14 марта.\n\n\"Мы готовы сделать все необходимое для разрушения режима Каддафи. Его время пришло, и никто не будет горевать о его смерти,\" — заявил Гериани на пресс-конференции, подчеркнув решимость переходного совета добиться смены власти мирным путем через международное сообщество.\n\nПока не поступило официальных заявлений от Франции и США в ответ на запрос Ливийского совета. Хотя западные страны, включая Париж и Лондон, активно стремятся создать запретную зону над Ливией, консенсус среди членов \"Большой восьмерки\" и Совета Безопасности ООН по данному вопросу еще не достигнут. Специалисты отмечают, что отсутствие единодушия среди международных лидеров может замедлить принятие необходимых мер для стабилизации ситуации в стране.\n\nКонфликт между правительственными силами из Триполи и повстанцами из Бенгази продолжает приносить значительные человеческие потери. По оценкам, число погибших колеблется от нескольких сотен до нескольких тысяч человек, что подрывает возможности для мирного разрешения конфликта. Война уже привела к серьезным разрушениям инфраструктуры и гуманитарному кризису, усугубляя страдания мирного населения.\n\nМеждународные организации и правозащитные группы выражают обеспокоенность нестабильностью в Ливии и призывают к скорейшему урегулированию конфликта. 
Некоторые аналитики считают, что вмешательство Запада является необходимым шагом для предотвращения дальнейшей эскалации насилия и установления демократического порядка в стране.\n\nНа данный момент остается неизвестным, какие конкретные шаги предпримут Франция, США и другие западные державы в ответ на призыв Ливийского переходного совета. Мир с напряжением ожидает дальнейших заявлений и действий международного сообщества в разрешении кризиса, который уже серьезно повлиял на будущее Ливии и стабильность региона в целом.
+Национальный переходный совет Ливии в Бенгази обратился к странам Запада с просьбой ликвидировать полковника Муаммара Каддафи, контролирующего запад страны, пишет газета The Guardian. Представитель Совета Мустафа Гериани заявил, что просьбу об уничтожении диктатора Франции и США должна была передать делегация из Бенгази в понедельник 14 марта. \"Мы пояснили западным странам, что хотим установления над Ливией зоны, запретной для полетов, тактических ударов по боевой технике Каддафи и уничтожения его резиденции\", - сказал Гериани. На уточняющий вопрос, хочет ли он смерти диктатора, представитель Совета ответил: \"Почему бы и нет? Если он умрет, никто и слезы не уронит\". Ни Франция, ни США пока высказывания Гериани не прокомментировали. Известно, что Париж и Лондон добиваются от партнеров по \"Большой восьмерке\" (G8) и Совету Безопасности ООН установления над Ливией зоны, закрытой для полетов. Однако пока в обоих объединениях консенсуса по этому поводу нет. Ранее сам Муаммар Каддафи назвал участников Совета в Бенгази \"предателями и шпионами\" и пообещал награду за голову их руководителя - Мустафы Абдель Джалиля. Противостояние между Триполи и Бенгази продолжается уже месяц. За это время в ходе боев и беспорядков по всей Ливии погибли от нескольких сотен до нескольких тысяч человек.
+Национальный переходный совет Ливии, базирующийся в Бенгази, обратился к западным державам с требованием устранить полковника Муаммара Каддафи, который контролирует западную часть страны, сообщает газета The Guardian. Мустафа Гериани, представитель Совета, уточнил, что запрос на ликвидацию диктатора должен был быть передан франко-американской делегации из Бенгази в понедельник, 14 марта. «Мы объяснили западным государствам наше намерение создать над Ливией воздушную запретную зону, провести тактические удары по военной технике Каддафи и разрушить его резиденцию», — заявил Гериани. На вопрос о том, желает ли он смерти диктатора, представитель Совета ответил: «Почему бы и нет? Если он погибнет, никто не оплачет его». На данный момент Франция и США не прокомментировали заявления Гериани. Известно, что власти Парижа и Лондона прилагают усилия к партнерам по «Большой восьмерке» (G8) и Совету Безопасности ООН для создания воздушной запретной зоны над Ливией. Однако в этих организациях пока не достигнут общий консенсус по этому вопросу. Ранее Муаммар Каддафи назвал членов Совета в Бенгази «предателями и шпионами» и пообещал вознаграждение за голову их лидера — Мустафы Абдел Джалиля. Конфликт между Триполи и Бенгази продолжается уже месяц, в ходе которого из-за боевых действий и беспорядков по всей Ливии погибло от сотен до нескольких тысяч человек.

feature_extraction.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import pandas as pd
+from text_analysis import analyze_text
+def extract_features(text, feature_config=None, scores=None):
+    if feature_config is None:
+        feature_config = {
+            'basic_scores': True,
+            'basic_text_stats': ['total_tokens', 'total_words', 'unique_words', 'stop_words', 'avg_word_length'],
+            'morphological': ['pos_distribution', 'unique_lemmas', 'lemma_word_ratio'],
+            'syntactic': ['dependencies', 'noun_chunks'],
+            'entities': ['total_entities', 'entity_types'],
+            'diversity': ['ttr', 'mtld'],
+            'structure': ['sentence_count', 'avg_sentence_length', 'question_sentences', 'exclamation_sentences'],
+            'readability': ['words_per_sentence', 'syllables_per_word', 'flesh_kincaid_score', 'long_words_percent'],
+            'semantic': True
+        }
+    text_analysis = analyze_text(text)
+    features_df = pd.DataFrame(index=[0])
+    if scores:
+        features_df['score_chat'] = scores.get('score_chat', 0)
+        features_df['score_coder'] = scores.get('score_coder', 0)
+    else:
+        features_df['score_chat'] = 0
+        features_df['score_coder'] = 0
+        print("Warning: No scores provided, using zeros for score_chat and score_coder")
+    if feature_config.get('basic_text_stats'):
+        for feature in feature_config['basic_text_stats']:
+            features_df[f'basic_{feature}'] = text_analysis.get('basic_stats', {}).get(feature, 0)
+    if feature_config.get('morphological'):
+        for feature in feature_config['morphological']:
+            if feature == 'pos_distribution':
+                pos_types = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'DET', 'ADP', 'PRON', 'CCONJ', 'SCONJ']
+                for pos in pos_types:
+                    features_df[f'pos_{pos}'] = text_analysis.get('morphological_analysis', {}).get('pos_distribution', {}).get(pos, 0)
+            else:
+                features_df[f'morph_{feature}'] = text_analysis.get('morphological_analysis', {}).get(feature, 0)
+    if feature_config.get('syntactic'):
+        for feature in feature_config['syntactic']:
+            if feature == 'dependencies':
+                dep_types = ['nsubj', 'obj', 'amod', 'nmod', 'ROOT', 'punct', 'case']
+                for dep in dep_types:
+                    features_df[f'dep_{dep}'] = text_analysis.get('syntactic_analysis', {}).get('dependencies', {}).get(dep, 0)
+            else:
+                features_df[f'synt_{feature}'] = text_analysis.get('syntactic_analysis', {}).get(feature, 0)
+    if feature_config.get('entities'):
+        for feature in feature_config['entities']:
+            if feature == 'entity_types':
+                entity_types = ['PER', 'LOC', 'ORG']
+                for ent in entity_types:
+                    features_df[f'ent_{ent}'] = text_analysis.get('named_entities', {}).get('entity_types', {}).get(ent, 0)
+            else:
+                features_df[f'ent_{feature}'] = text_analysis.get('named_entities', {}).get(feature, 0)
+    if feature_config.get('diversity'):
+        for feature in feature_config['diversity']:
+            features_df[f'div_{feature}'] = text_analysis.get('lexical_diversity', {}).get(feature, 0)
+    if feature_config.get('structure'):
+        for feature in feature_config['structure']:
+            features_df[f'struct_{feature}'] = text_analysis.get('text_structure', {}).get(feature, 0)
+    if feature_config.get('readability'):
+        for feature in feature_config['readability']:
+            features_df[f'read_{feature}'] = text_analysis.get('readability', {}).get(feature, 0)
+    if feature_config.get('semantic'):
+        features_df['semantic_coherence'] = text_analysis.get('semantic_coherence', {}).get('avg_coherence_score', 0)
+    return features_df, text_analysis

main.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import argparse
+import pandas as pd
+from text_analysis import show_text_analysis
+from binoculars_utils import initialize_binoculars, compute_scores
+from model_utils import load_model, classify_text
+def main():
+    parser = argparse.ArgumentParser(description='Text classifier demonstration (Human vs AI)')
+    parser.add_argument('--text', type=str, help='Text for classification')
+    parser.add_argument('--file', type=str, help='Path to file with text')
+    parser.add_argument('--analysis', action='store_true', help='Show detailed text analysis')
+    parser.add_argument('--compute-scores', action='store_true', help='Compute score_chat and score_coder')
+    args = parser.parse_args()
+    bino_chat = None
+    bino_coder = None
+    if args.compute_scores:
+        bino_chat, bino_coder = initialize_binoculars()
+    print("Loading binary classifier model...")
+    model, scaler, label_encoder, imputer = load_model()
+    if args.text:
+        text = args.text
+    elif args.file:
+        with open(args.file, 'r', encoding='utf-8') as f:
+            text = f.read()
+    else:
+        text = input("Enter text for classification: ")
+    scores = None
+    if args.compute_scores:
+        scores = compute_scores(text, bino_chat, bino_coder)
+    print(f"\nAnalyzing text...")
+    result = classify_text(text, model, scaler, label_encoder, imputer=imputer, scores=scores)
+    print("\n" + "="*50)
+    print("CLASSIFICATION RESULTS")
+    print("="*50)
+    print(f"Predicted class: {result['predicted_class']}")
+    print("Class probabilities:")
+    for cls, prob in result['probabilities'].items():
+        print(f"  - {cls}: {prob:.4f}")
+    if scores:
+        print("\nComputed scores:")
+        if 'score_chat' in scores:
+            print(f"  - Score Chat: {scores['score_chat']:.4f}")
+        if 'score_coder' in scores:
+            print(f"  - Score Coder: {scores['score_coder']:.4f}")
+    if args.analysis:
+        show_text_analysis(result['text_analysis'])
+    if args.compute_scores:
+        if bino_chat:
+            bino_chat.free_memory()
+        if bino_coder:
+            bino_coder.free_memory()
+if __name__ == "__main__":
+    main()

model_utils.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os
+import torch
+import joblib
+import numpy as np
+from sklearn.impute import SimpleImputer
+from NN_classifier.simple_binary_classifier import Medium_Binary_Network
+from feature_extraction import extract_features
+import pandas as pd
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def load_model(model_dir='models/medium_binary_classifier'):
+    model_path = os.path.join(model_dir, 'nn_model.pt')
+    scaler_path = os.path.join(model_dir, 'scaler.joblib')
+    encoder_path = os.path.join(model_dir, 'label_encoder.joblib')
+    imputer_path = os.path.join(model_dir, 'imputer.joblib')
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model not found at: {model_path}")
+    label_encoder = joblib.load(encoder_path)
+    scaler = joblib.load(scaler_path)
+    imputer = None
+    if os.path.exists(imputer_path):
+        imputer = joblib.load(imputer_path)
+    else:
+        print("Warning: Imputer not found, will create a new one during classification")
+    input_size = scaler.n_features_in_
+    model = Medium_Binary_Network(input_size, hidden_sizes=[256, 192, 128, 64], dropout=0.3).to(DEVICE)
+    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+    model.eval()
+    if imputer is not None:
+        try:
+            if hasattr(imputer, 'feature_names_in_'):
+                print(f"Imputer has {len(imputer.feature_names_in_)} features")
+                print(f"First few feature names: {imputer.feature_names_in_[:5]}")
+            else:
+                print("Warning: Imputer does not have feature_names_in_ attribute")
+        except Exception as e:
+            print(f"Error checking imputer: {str(e)}")
+    return model, scaler, label_encoder, imputer
+def classify_text(text, model, scaler, label_encoder, imputer=None, scores=None):
+    features_df, text_analysis = extract_features(text, scores=scores)
+    if imputer is not None:
+        expected_feature_names = imputer.feature_names_in_
+    else:
+        expected_feature_names = None
+    if expected_feature_names is not None:
+        aligned_features = pd.DataFrame(columns=expected_feature_names)
+        for col in features_df.columns:
+            if col in expected_feature_names:
+                aligned_features[col] = features_df[col]
+        for col in expected_feature_names:
+            if col not in aligned_features.columns or aligned_features[col].isnull().all():
+                aligned_features[col] = 0
+                print(f"Added missing feature: {col}")
+        features_df = aligned_features
+    if imputer is None:
+        print("Warning: No imputer provided, creating a new one")
+        imputer = SimpleImputer(strategy='mean')
+        features = imputer.fit_transform(features_df)
+    else:
+        features = imputer.transform(features_df)
+    features_scaled = scaler.transform(features)
+    features_tensor = torch.FloatTensor(features_scaled).to(DEVICE)
+    with torch.no_grad():
+        outputs = model(features_tensor)
+        probabilities = torch.softmax(outputs, dim=1)
+        pred_class = torch.argmax(probabilities, dim=1).item()
+    predicted_label = label_encoder.classes_[pred_class]
+    probs_dict = {label_encoder.classes_[i]: probabilities[0][i].item() for i in range(len(label_encoder.classes_))}
+    return {
+        'predicted_class': predicted_label,
+        'probabilities': probs_dict,
+        'features': features_df,
+        'text_analysis': text_analysis,
+        'scores': scores
+    }

models/medium_binary_classifier/cv_results.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+    "fold_metrics": [
+        {
+            "fold": 1,
+            "accuracy": 0.8263888888888888,
+            "precision": 0.8280646002798397,
+            "recall": 0.8263888888888888,
+            "f1": 0.8271100936623971,
+            "val_loss": 0.3269847333431244
+        },
+        {
+            "fold": 2,
+            "accuracy": 0.8194444444444444,
+            "precision": 0.8261029411764705,
+            "recall": 0.8194444444444444,
+            "f1": 0.8216165413533835,
+            "val_loss": 0.35324224829673767
+        },
+        {
+            "fold": 3,
+            "accuracy": 0.8331402085747392,
+            "precision": 0.8434186476644773,
+            "recall": 0.8331402085747392,
+            "f1": 0.8358602090830395,
+            "val_loss": 0.306135892868042
+        },
+        {
+            "fold": 4,
+            "accuracy": 0.8366164542294322,
+            "precision": 0.8442133250450394,
+            "recall": 0.8366164542294322,
+            "f1": 0.8388315597059784,
+            "val_loss": 0.3356165289878845
+        },
+        {
+            "fold": 5,
+            "accuracy": 0.8296639629200464,
+            "precision": 0.8434535764325679,
+            "recall": 0.8296639629200464,
+            "f1": 0.8328916049247007,
+            "val_loss": 0.3397574722766876
+        }
+    ],
+    "overall": {
+        "accuracy": 0.8290479499652537,
+        "precision": 0.8366643662464281,
+        "recall": 0.8290479499652537,
+        "f1": 0.8313408894235843
+    },
+    "cross_validation": {
+        "mean_accuracy": 0.8290507918115102,
+        "std_accuracy": 0.005894169882178006,
+        "confidence_interval_95": [
+            0.8238843241167373,
+            0.8342172595062831
+        ]
+    },
+    "best_fold": {
+        "fold": 4,
+        "accuracy": 0.8366164542294322
+    },
+    "model_config": {
+        "hidden_sizes": [
+            256,
+            192,
+            128,
+            64
+        ],
+        "dropout": 0.3
+    }
+}

models/medium_binary_classifier/imputer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:188d4008a04267264ab8575a77248bc14c9918ead0e586b549fb4844cb306039
+size 1975

models/medium_binary_classifier/label_encoder.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:324b9701f37445fe8c51ef7d6207fc862c7c5656b63581322e095ad9692597fa
+size 540

models/medium_binary_classifier/nn_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6c071c24bc5ac0630a046d4f75f59dbae875635983edf7981f683b863a8dd955
+size 377798

models/medium_binary_classifier/scaler.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf2ad0003a7006486036f07c4eb51cb395e03309929ff679d7642332298c30e
+size 1623

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+sentencepiece
+transformers
+datasets
+numpy
+gradio
+gradio_client
+scikit-learn
+seaborn
+pandas
+#flash_attn

setup.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from setuptools import setup, find_packages
+setup(
+    name='Trinoculars',
+    version='1.0.0',
+    packages=find_packages(),
+    url='https://github.com/CoffeBank/Trinoculars',
+    license=open("LICENSE.md", "r", encoding="utf-8").read(),
+    author='',
+    author_email='',
+    description='An improved version of the Binoculars language model text detector for ru datasets.',
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    install_requires=open("requirements.txt", "r", encoding="utf-8").read().splitlines(),
+)