thanhtantran committed on
Commit d8fa8a2 · verified · 1 Parent(s): 5bbe13d

Cloned from AITeamVN/Vietnamese_Embedding

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 1024,
+   "pooling_mode_cls_token": true,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
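For readers skimming the config: `pooling_mode_cls_token: true` with every other mode disabled means the sentence embedding is simply the hidden state of the first (`<s>`/CLS) token, the same dense-retrieval setup BGE-M3 uses. A minimal illustration of that operation, with made-up tensor shapes rather than anything loaded from this repository:

```python
import torch

# Toy stand-in for the transformer output: (batch, seq_len, word_embedding_dimension)
token_embeddings = torch.randn(2, 128, 1024)

# CLS pooling: keep only the first token's hidden state per sentence
sentence_embeddings = token_embeddings[:, 0]  # shape: (2, 1024)
```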
README.md CHANGED
@@ -1,3 +1,90 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ language:
+ - vi
+ base_model:
+ - BAAI/bge-m3
+ pipeline_tag: sentence-similarity
+ library_name: sentence-transformers
+ tags:
+ - Embedding
+ ---
+
+ ## Model Card: Vietnamese_Embedding
+
+ Vietnamese_Embedding is an embedding model fine-tuned from the BGE-M3 model (https://huggingface.co/BAAI/bge-m3) to enhance retrieval capabilities for Vietnamese.
+
+ * The model was trained on approximately 300,000 triplets of Vietnamese queries, positive documents, and negative documents.
+ * The model was trained with a maximum sequence length of 2048.
+
+ ## Model Details
+
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)
+ - **Maximum Sequence Length:** 2048 tokens
+ - **Output Dimensionality:** 1024 dimensions
+ - **Similarity Function:** Dot-product similarity
+ - **Language:** Vietnamese
+ - **License:** Apache 2.0
+
+ ## Usage
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer("AITeamVN/Vietnamese_Embedding")
+ model.max_seq_length = 2048
+ sentences_1 = ["Trí tuệ nhân tạo là gì", "Lợi ích của giấc ngủ"]
+ sentences_2 = ["Trí tuệ nhân tạo là công nghệ giúp máy móc suy nghĩ và học hỏi như con người. Nó hoạt động bằng cách thu thập dữ liệu, nhận diện mẫu và đưa ra quyết định.",
+                "Giấc ngủ giúp cơ thể và não bộ nghỉ ngơi, hồi phục năng lượng và cải thiện trí nhớ. Ngủ đủ giấc giúp tinh thần tỉnh táo và làm việc hiệu quả hơn."]
+ query_embeddings = model.encode(sentences_1)
+ doc_embeddings = model.encode(sentences_2)
+ similarity = query_embeddings @ doc_embeddings.T
+ print(similarity)
+
+ '''
+ array([[0.66212064, 0.33066642],
+        [0.25866613, 0.5865289 ]], dtype=float32)
+ '''
+ ```
+
+ ### Evaluation
+
+ - Dataset: the entire training dataset of Legal Zalo 2021. Our model was not trained on this dataset.
+
+ | Model | Accuracy@1 | Accuracy@3 | Accuracy@5 | Accuracy@10 | MRR@10 |
+ |-------|------------|------------|------------|-------------|--------|
+ | Vietnamese_Reranker (Phase 2) | 0.7944 | 0.9324 | 0.9537 | 0.9740 | 0.8672 |
+ | Vietnamese_Embedding (Phase 2) | 0.7262 | 0.8927 | 0.9268 | 0.9578 | 0.8149 |
+ | Vietnamese_Embedding (public) | 0.7274 | 0.8992 | 0.9305 | 0.9568 | 0.8181 |
+ | Vietnamese-bi-encoder (BKAI) | 0.7109 | 0.8680 | 0.9014 | 0.9299 | 0.7951 |
+ | BGE-M3 | 0.5682 | 0.7728 | 0.8382 | 0.8921 | 0.6822 |
+
+ Vietnamese_Reranker (Phase 2) and Vietnamese_Embedding (Phase 2) were trained on 1,100,000 triplets.
+
+ Although Vietnamese_Embedding (Phase 2) scores slightly lower on the legal domain, its much larger training set makes it stronger on other domains.
+
+ You can reproduce the evaluation results by running `python evaluation_model.py` (data downloaded from Kaggle).
+
+ ## Contact
+
+ **Developer**
+
+ Members: Nguyễn Nho Trung, Nguyễn Nhật Quang
+
+ ## Citation
+
+ ```bibtex
+ @misc{Vietnamese_Embedding,
+   title={Vietnamese_Embedding: Embedding model in Vietnamese language.},
+   author={Nguyen Nho Trung and Nguyen Nhat Quang},
+   year={2025},
+   publisher={Huggingface},
+ }
+ ```
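For reference, the Accuracy@k and MRR@10 columns above follow the standard retrieval definitions, which is also how `calculate_metrics` in `evaluation_model.py` (added below) computes them: with $N$ queries and $\mathrm{rank}_i$ the best (smallest) rank at which a relevant article is retrieved for query $i$,

$$
\mathrm{Accuracy@}k = \frac{1}{N} \sum_{i=1}^{N} \mathbb{1}\!\left[\mathrm{rank}_i \le k\right],
\qquad
\mathrm{MRR@}k = \frac{1}{N} \sum_{i=1}^{N} \frac{\mathbb{1}\!\left[\mathrm{rank}_i \le k\right]}{\mathrm{rank}_i}.
$$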
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "/AITeamVN/bge_vi_2048",
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 8194,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
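The backbone described by this config is a standard XLM-RoBERTa large encoder (24 layers, hidden size 1024, vocabulary of 250,002, positions up to 8194). A quick, optional sanity check with the `transformers` API, assuming the repo id from the README (the intended entry point remains the sentence-transformers snippet above):

```python
from transformers import AutoConfig

# Inspect the backbone configuration without downloading the weights
config = AutoConfig.from_pretrained("AITeamVN/Vietnamese_Embedding")
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # xlm-roberta 24 1024
```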
config_sentence_transformers.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.6.1",
+     "transformers": "4.49.0",
+     "pytorch": "2.6.0+cu124"
+   },
+   "prompts": {},
+   "default_prompt_name": null
+ }
evaluation_model.py ADDED
@@ -0,0 +1,191 @@
+ import numpy as np
+ import torch
+ import json
+ import pandas as pd
+ from tqdm import tqdm
+ from typing import List, Dict, Tuple, Set, Union, Optional
+ from langchain.docstore.document import Document
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.vectorstores.faiss import DistanceStrategy
+ from langchain_core.embeddings.embeddings import Embeddings
+ from FlagEmbedding import BGEM3FlagModel
+
+ def setup_gpu_info() -> None:
+     print(f"Number of available GPUs: {torch.cuda.device_count()}")
+     print(f"Current GPU: {torch.cuda.current_device()}")
+     print(f"GPU name: {torch.cuda.get_device_name(0)}")
+
+ def load_model(model_name: str, use_fp16: bool = False) -> BGEM3FlagModel:
+     return BGEM3FlagModel(model_name, use_fp16=use_fp16)
+
+ def load_json_file(file_path: str) -> dict:
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+ def load_jsonl_file(file_path: str) -> List[Dict]:
+     corpus = []
+     with open(file_path, "r", encoding="utf-8") as file:
+         for line in file:
+             data = json.loads(line.strip())
+             corpus.append(data)
+     return corpus
+
+ def extract_corpus_from_legal_documents(legal_data: dict) -> List[Dict]:
+     corpus = []
+     for document in legal_data:
+         for article in document['articles']:
+             chunk = {
+                 "law_id": document['law_id'],
+                 "article_id": article['article_id'],
+                 "title": article['title'],
+                 "text": article['title'] + '\n' + article['text']
+             }
+             corpus.append(chunk)
+     return corpus
+
+ def convert_corpus_to_documents(corpus: List[Dict[str, str]]) -> List[Document]:
+     documents = []
+     for i in tqdm(range(len(corpus)), desc="Converting corpus to documents"):
+         context = corpus[i]['text']
+         metadata = {
+             'law_id': corpus[i]['law_id'],
+             'article_id': corpus[i]['article_id'],
+             'title': corpus[i]['title']
+         }
+         documents.append(Document(page_content=context, metadata=metadata))
+     return documents
+
+ class CustomEmbedding(Embeddings):
+     """Custom embedding class that uses the BGEM3FlagModel."""
+
+     def __init__(self, model: BGEM3FlagModel, batch_size: int = 1):
+         self.model = model
+         self.batch_size = batch_size
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         embeddings = []
+         for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding documents"):
+             batch_texts = texts[i:i+self.batch_size]
+             batch_embeddings = self._get_batch_embeddings(batch_texts)
+             embeddings.extend(batch_embeddings)
+             torch.cuda.empty_cache()
+         return np.vstack(embeddings)
+
+     def embed_query(self, text: str) -> List[float]:
+         embedding = self.model.encode(text, max_length=256)['dense_vecs']
+         return embedding
+
+     def _get_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
+         with torch.no_grad():
+             outputs = self.model.encode(texts, batch_size=self.batch_size, max_length=2048)['dense_vecs']
+             batch_embeddings = outputs
+             del outputs
+         return batch_embeddings
+
+
+ class VectorDB:
+     """Vector database for document retrieval."""
+
+     def __init__(
+         self,
+         documents: List[Document],
+         embedding: Embeddings,
+         vector_db=FAISS,
+         index_path: Optional[str] = None
+     ) -> None:
+         self.vector_db = vector_db
+         self.embedding = embedding
+         self.index_path = index_path
+         self.db = self._build_db(documents)
+
+     def _build_db(self, documents: List[Document]):
+         if self.index_path:
+             db = self.vector_db.load_local(
+                 self.index_path,
+                 self.embedding,
+                 allow_dangerous_deserialization=True
+             )
+         else:
+             db = self.vector_db.from_documents(
+                 documents=documents,
+                 embedding=self.embedding,
+                 distance_strategy=DistanceStrategy.DOT_PRODUCT
+             )
+         return db
+
+     def get_retriever(self, search_type: str = "similarity", search_kwargs: dict = {"k": 10}):
+         retriever = self.db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
+         return retriever
+
+     def save_local(self, folder_path: str) -> None:
+         self.db.save_local(folder_path)
+
+
+ def process_sample(sample: dict, retriever) -> List[int]:
+     # Returns the 1-based rank of each relevant article in the retrieved list (0 if not retrieved).
+     question = sample['question']
+     docs = retriever.invoke(question)
+     retrieved_article_full_ids = [
+         docs[i].metadata['law_id'] + "#" + docs[i].metadata['article_id']
+         for i in range(len(docs))
+     ]
+     indexes = []
+     for article in sample['relevant_articles']:
+         article_full_id = article['law_id'] + "#" + article['article_id']
+         if article_full_id in retrieved_article_full_ids:
+             idx = retrieved_article_full_ids.index(article_full_id) + 1
+             indexes.append(idx)
+         else:
+             indexes.append(0)
+     return indexes
+
+ def calculate_metrics(all_indexes: List[List[int]], num_samples: int, selected_keys: Set[str]) -> Dict[str, float]:
+     count = [len(indexes) for indexes in all_indexes]
+     result = {}
+
+     for thres in [1, 3, 5, 10, 100]:
+         found = [[y for y in x if 0 < y <= thres] for x in all_indexes]
+         found_count = [len(x) for x in found]
+         acc = sum(1 for i in range(num_samples) if found_count[i] > 0) / num_samples
+         rec = sum(found_count[i] / count[i] for i in range(num_samples)) / num_samples
+         pre = sum(found_count[i] / thres for i in range(num_samples)) / num_samples
+         mrr = sum(1 / min(x) if x else 0 for x in found) / num_samples
+
+         if f"Accuracy@{thres}" in selected_keys:
+             result[f"Accuracy@{thres}"] = acc
+         if f"MRR@{thres}" in selected_keys:
+             result[f"MRR@{thres}"] = mrr
+
+     return result
+
+
+ def save_results(result: Dict[str, float], output_path: str) -> None:
+     with open(output_path, "w", encoding="utf-8") as f:
+         json.dump(result, f, indent=4, ensure_ascii=False)
+     print(f"Results saved to {output_path}")
+
+
+ def main():
+     setup_gpu_info()
+     model = load_model('AITeamVN/Vietnamese_Embedding', use_fp16=False)
+     samples = load_json_file('zalo_kaggle/train_question_answer.json')['items']
+     legal_data = load_json_file('zalo_kaggle/legal_corpus.json')
+
+     corpus = extract_corpus_from_legal_documents(legal_data)
+     documents = convert_corpus_to_documents(corpus)
+     embedding = CustomEmbedding(model, batch_size=1)  # batch_size can be raised to speed up indexing if GPU memory allows
+     vectordb = VectorDB(
+         documents=documents,
+         embedding=embedding,
+         vector_db=FAISS,
+         index_path=None
+     )
+     retriever = vectordb.get_retriever(search_type="similarity", search_kwargs={"k": 100})
+     all_indexes = []
+     for sample in tqdm(samples, desc="Processing samples"):
+         all_indexes.append(process_sample(sample, retriever))
+     selected_keys = {"Accuracy@1", "Accuracy@3", "Accuracy@5", "Accuracy@10", "MRR@10", "Accuracy@100"}
+     result = calculate_metrics(all_indexes, len(samples), selected_keys)
+     print(result)
+     save_results(result, "zalo_kaggle/Vietnamese_Embedding.json")
+
+ if __name__ == "__main__":
+     main()
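As `main()` above shows, the script expects the Zalo legal retrieval data under `zalo_kaggle/` (`train_question_answer.json` and `legal_corpus.json`, downloaded from Kaggle), builds a FAISS index over the article corpus, retrieves the top 100 articles per question, and writes the selected metrics to `zalo_kaggle/Vietnamese_Embedding.json`. It is run directly as `python evaluation_model.py`, as noted in the README.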
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f2debafdf03659e8273022a3e902b94deec73cd20c2b7262ab7e21630163f6d
+ size 2271064456
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
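These three modules are the whole inference pipeline: a Transformer encoder, CLS pooling (configured in `1_Pooling/config.json`), and L2 normalization. Because the last module normalizes the embeddings, the dot-product scores in the README's usage example are effectively cosine similarities. A hedged sketch of assembling the same stack by hand with sentence-transformers (the normal path is simply `SentenceTransformer("AITeamVN/Vietnamese_Embedding")`, which reads `modules.json` for you):

```python
from sentence_transformers import SentenceTransformer, models

# Transformer -> Pooling(cls) -> Normalize, mirroring modules.json
word_embedding = models.Transformer("AITeamVN/Vietnamese_Embedding", max_seq_length=2048)
pooling = models.Pooling(
    word_embedding.get_word_embedding_dimension(),  # 1024, per 1_Pooling/config.json
    pooling_mode="cls",
)
model = SentenceTransformer(modules=[word_embedding, pooling, models.Normalize()])
```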
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 8192,
+   "do_lower_case": false
+ }
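Note the difference from the README: `max_seq_length` here is 8192 (the BGE-M3 backbone's limit, matching `model_max_length` in `tokenizer_config.json` below and `max_position_embeddings` of 8194 in `config.json`), while the README explicitly sets `model.max_seq_length = 2048`, the maximum length used during fine-tuning.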
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
+ size 17098251
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "model_max_length": 8192,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }