feat: add files

Browse files

Files changed (8) hide show

.gitignore +2 -0
README.md +46 -1
config.json +11 -0
inference_onnx.py +93 -0
inference_safetensors.py +172 -0
models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.onnx +3 -0
govtech-jina-embeddings-v2-small-en-off-topic → models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.safetensors +0 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .venv
2	+ .DS_Store

README.md CHANGED Viewed

@@ -2,4 +2,49 @@
 license: other
 license_name: govtech-singapore
 license_link: LICENSE
----

 license: other
 license_name: govtech-singapore
 license_link: LICENSE
+---
+# Off-Topic Classification Model
+This repository contains a fine-tuned **Jina Embeddings model** designed to perform binary classification. The model predicts whether a user prompt is **off-topic** based on the intended purpose defined in the system prompt.
+## Model Highlights
+- **Base Model**: [`jina-embeddings-v2-small-en`](https://huggingface.co/jinaai/jina-embeddings-v2-small-en)
+- **Maximum Context Length**: 1024 tokens
+- **Task**: Binary classification (on-topic/off-topic)
+## Performance
+| Approach                              | Model                          | ROC-AUC | F1   | Precision | Recall |
+|---------------------------------------|--------------------------------|---------|------|-----------|--------|
+| Fine-tuned bi-encoder classifier      | jina-embeddings-v2-small-en    | 0.99    | 0.97 | 0.99      | 0.95   |
+## Usage
+1. Clone this repository and install the required dependencies:
+    ```bash
+    pip install -r requirements.txt
+    ```
+2. You can run the model using two options:
+    **Option 1**: Using `inference_onnx.py` with the ONNX Model.
+        ```
+        python inference_onnx.py '[
+            ["System prompt example 1", "User prompt example 1"],
+            ["System prompt example 2", "User prompt example 2"]
+        ]'
+        ```
+    **Option 2**: Using `inference_safetensors.py` with PyTorch and SafeTensors.
+        ```
+        python inference_safetensors.py '[
+            ["System prompt example 1", "User prompt example 1"],
+            ["System prompt example 2", "User prompt example 2"]
+        ]'
+        ```
+Read more about this model in our [technical report](https://arxiv.org/abs/2411.12946).

config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "description": "Off-Topic classifier designed to block user prompts that do not align with the intended purpose of the system, as determined by the system prompt.",
+    "classifier": {
+        "embedding": {
+            "model_name": "jinaai/jina-embeddings-v2-small-en",
+            "max_length": 1024,
+            "model_weights_fp": "models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.safetensors",
+            "model_fp": "models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.onnx"
+        }
+    }
+}

inference_onnx.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""
+inference_onnx.py
+This script leverages ONNX runtime to perform inference with a pre-trained model.
+"""
+import json
+import torch
+import sys
+import numpy as np
+import onnxruntime as rt
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
+repo_path = "govtech/jina-embeddings-v2-small-en-off-topic"
+config_path = hf_hub_download(repo_id=repo_path, filename="config.json")
+config_path = "config.json"
+with open(config_path, 'r') as f:
+    config = json.load(f)
+def predict(sentence1, sentence2):
+    """
+    Predicts the label for a pair of sentences using a fine-tuned ONNX model.
+    This function tokenizes the input sentences, prepares them as inputs for an ONNX model,
+    and performs inference to predict the label and probabilities for the given sentence pair.
+    Args:
+    - sentence1 (str): The first input sentence.
+    - sentence2 (str): The second input sentence.
+    Returns:
+    tuple:
+        - predicted_label (int): The predicted label (e.g., 0 or 1).
+        - probabilities (numpy.ndarray): The probabilities for each class.
+    """
+    # Load model configuration
+    model_name = config['classifier']['embedding']['model_name']
+    max_length = config['classifier']['embedding']['max_length']
+    model_fp = config['classifier']['embedding']['model_fp']
+    # Set device and load tokenizer
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Get inputs
+    inputs1 = tokenizer(sentence1, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
+    inputs2 = tokenizer(sentence2, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
+    input_ids1 = inputs1['input_ids'].to(device)
+    attention_mask1 = inputs1['attention_mask'].to(device)
+    input_ids2 = inputs2['input_ids'].to(device)
+    attention_mask2 = inputs2['attention_mask'].to(device)
+    # Download the classifier from HuggingFace hub
+    local_model_fp = model_fp
+    local_model_fp = hf_hub_download(repo_id=repo_path, filename=model_fp)
+    # Run inference
+    session = rt.InferenceSession(local_model_fp)  # Load the ONNX model
+    onnx_inputs = {
+        session.get_inputs()[0].name: input_ids1.cpu().numpy(),
+        session.get_inputs()[1].name: attention_mask1.cpu().numpy(),
+        session.get_inputs()[2].name: input_ids2.cpu().numpy(),
+        session.get_inputs()[3].name: attention_mask2.cpu().numpy(),
+    }
+    outputs = session.run(None, onnx_inputs)
+    probabilities = torch.softmax(torch.tensor(outputs[0]), dim=1)
+    predicted_label = torch.argmax(probabilities, dim=1).item()
+    return predicted_label, probabilities.cpu().numpy()
+if __name__ == "__main__":
+    # Load data
+    input_data = sys.argv[1]
+    sentence_pairs = json.loads(input_data)
+    # Validate input data format
+    if not all(isinstance(pair[0], str) and isinstance(pair[1], str) for pair in sentence_pairs):
+        raise ValueError("Each pair must contain two strings.")
+    for idx, (sentence1, sentence2) in enumerate(sentence_pairs):
+        # Generate prediction and scores
+        predicted_label, probabilities = predict(sentence1, sentence2)
+        # Print the results
+        print(f"Pair {idx + 1}:")
+        print(f"  Sentence 1: {sentence1}")
+        print(f"  Sentence 2: {sentence2}")
+        print(f"  Predicted Label: {predicted_label}")
+        print(f"  Probabilities: {probabilities}")
+        print('-' * 50)

inference_safetensors.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+inference_safetensors.py
+Defines the architecture of the fine-tuned embedding model used for Off-Topic classification
+and runs inference with its SafeTensors weights.
+"""
+import json
+import torch
+import sys
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from transformers import AutoTokenizer, AutoModel
+# Adapter for embeddings
+class Adapter(nn.Module):
+    def __init__(self, hidden_size):
+        super(Adapter, self).__init__()
+        self.down_project = nn.Linear(hidden_size, hidden_size // 2)
+        self.activation = nn.ReLU()
+        self.up_project = nn.Linear(hidden_size // 2, hidden_size)
+    def forward(self, x):
+        down = self.down_project(x)
+        activated = self.activation(down)
+        up = self.up_project(activated)
+        return up + x  # Residual connection
+# Pool by attention score
+class AttentionPooling(nn.Module):
+    def __init__(self, hidden_size):
+        super(AttentionPooling, self).__init__()
+        self.attention_weights = nn.Parameter(torch.randn(hidden_size))
+    def forward(self, hidden_states):
+        # hidden_states: [seq_len, batch_size, hidden_size]
+        scores = torch.matmul(hidden_states, self.attention_weights)
+        attention_weights = torch.softmax(scores, dim=0)
+        weighted_sum = torch.sum(attention_weights.unsqueeze(-1) * hidden_states, dim=0)
+        return weighted_sum
+# Custom bi-encoder model with MLP layers for interaction
+class CrossEncoderWithSharedBase(nn.Module):
+    def __init__(self, base_model, num_labels=2, num_heads=8):
+        super(CrossEncoderWithSharedBase, self).__init__()
+        # Shared pre-trained model
+        self.shared_encoder = base_model
+        hidden_size = self.shared_encoder.config.hidden_size
+        # Sentence-specific adapters
+        self.adapter1 = Adapter(hidden_size)
+        self.adapter2 = Adapter(hidden_size)
+        # Cross-attention layers
+        self.cross_attention_1_to_2 = nn.MultiheadAttention(hidden_size, num_heads)
+        self.cross_attention_2_to_1 = nn.MultiheadAttention(hidden_size, num_heads)
+        # Attention pooling layers
+        self.attn_pooling_1_to_2 = AttentionPooling(hidden_size)
+        self.attn_pooling_2_to_1 = AttentionPooling(hidden_size)
+        # Projection layer with non-linearity
+        self.projection_layer = nn.Sequential(
+            nn.Linear(hidden_size * 2, hidden_size),
+            nn.ReLU()
+        )
+        # Classifier with three hidden layers
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size // 2),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size // 2, hidden_size // 4),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size // 4, num_labels)
+        )
+    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
+        # Encode sentences
+        outputs1 = self.shared_encoder(input_ids1, attention_mask=attention_mask1)
+        outputs2 = self.shared_encoder(input_ids2, attention_mask=attention_mask2)
+        # Apply sentence-specific adapters
+        embeds1 = self.adapter1(outputs1.last_hidden_state)
+        embeds2 = self.adapter2(outputs2.last_hidden_state)
+        # Transpose for attention layers
+        embeds1 = embeds1.transpose(0, 1)
+        embeds2 = embeds2.transpose(0, 1)
+        # Cross-attention
+        cross_attn_1_to_2, _ = self.cross_attention_1_to_2(embeds1, embeds2, embeds2)
+        cross_attn_2_to_1, _ = self.cross_attention_2_to_1(embeds2, embeds1, embeds1)
+        # Attention pooling
+        pooled_1_to_2 = self.attn_pooling_1_to_2(cross_attn_1_to_2)
+        pooled_2_to_1 = self.attn_pooling_2_to_1(cross_attn_2_to_1)
+        # Concatenate and project
+        combined = torch.cat((pooled_1_to_2, pooled_2_to_1), dim=1)
+        projected = self.projection_layer(combined)
+        # Classification
+        logits = self.classifier(projected)
+        return logits
+# Load configuration file
+repo_path = "govtech/jina-embeddings-v2-small-en-off-topic"
+config_path = hf_hub_download(repo_id=repo_path, filename="config.json")
+config_path = "config.json"
+with open(config_path, 'r') as f:
+    config = json.load(f)
+def predict(sentence1, sentence2):
+    """
+    Predicts the label for a pair of sentences using a fine-tuned model with SafeTensors weights.
+    Args:
+    - sentence1 (str): The first input sentence.
+    - sentence2 (str): The second input sentence.
+    Returns:
+    tuple:
+        - predicted_label (int): The predicted label (e.g., 0 or 1).
+        - probabilities (numpy.ndarray): The probabilities for each class.
+    """
+    # Load model configuration
+    model_name = config['classifier']['embedding']['model_name']
+    max_length = config['classifier']['embedding']['max_length']
+    model_weights_fp = config['classifier']['embedding']['model_weights_fp']
+    # Load tokenizer and base model
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    base_model = AutoModel.from_pretrained(model_name)
+    model = CrossEncoderWithSharedBase(base_model, num_labels=2)
+    # Load weights into the model
+    weights = load_file(model_weights_fp)
+    model.load_state_dict(weights)
+    model.to(device)
+    model.eval()
+    # Get inputs
+    inputs1 = tokenizer(sentence1, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
+    inputs2 = tokenizer(sentence2, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
+    input_ids1 = inputs1['input_ids'].to(device)
+    attention_mask1 = inputs1['attention_mask'].to(device)
+    input_ids2 = inputs2['input_ids'].to(device)
+    attention_mask2 = inputs2['attention_mask'].to(device)
+    # Get outputs
+    with torch.no_grad():
+        outputs = model(input_ids1=input_ids1, attention_mask1=attention_mask1,
+                        input_ids2=input_ids2, attention_mask2=attention_mask2)
+        probabilities = torch.softmax(outputs, dim=1)
+        predicted_label = torch.argmax(probabilities, dim=1).item()
+    return predicted_label, probabilities.cpu().numpy()
+if __name__ == "__main__":
+    # Load data
+    input_data = sys.argv[1]
+    sentence_pairs = json.loads(input_data)
+    # Validate input data format
+    if not all(isinstance(pair[0], str) and isinstance(pair[1], str) for pair in sentence_pairs):
+        raise ValueError("Each pair must contain two strings.")
+    for idx, (sentence1, sentence2) in enumerate(sentence_pairs):
+        # Generate prediction and scores
+        predicted_label, probabilities = predict(sentence1, sentence2)
+        # Print the results
+        print(f"Pair {idx + 1}:")
+        print(f"  Sentence 1: {sentence1}")
+        print(f"  Sentence 2: {sentence2}")
+        print(f"  Predicted Label: {predicted_label}")
+        print(f"  Probabilities: {probabilities}")
+        print('-' * 50)

models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61f616f540ea408e918e9a5c30b770071bc473c75a240d831a71a7309724a890
+size 126521473

govtech-jina-embeddings-v2-small-en-off-topic → models/off-topic-jinaai-jina-embeddings-v2-small-en-TwinEncoder.safetensors RENAMED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+huggingface_hub==0.26.2
+numpy==2.1.3
+onnxruntime==1.20.0
+safetensors==0.4.5
+torch==2.5.1
+transformers==4.46.3