Add new SentenceTransformer model

Browse files

Files changed (12) hide show

1_Pooling/config.json +10 -0
README.md +65 -0
added_tokens.json +4 -0
config.json +32 -0
config_sentence_transformers.json +10 -0
merges.txt +0 -0
model.safetensors +3 -0
modules.json +20 -0
sentence_bert_config.json +4 -0
special_tokens_map.json +58 -0
tokenizer_config.json +213 -0
vocab.json +0 -0

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

README.md ADDED Viewed

	@@ -0,0 +1,65 @@

+---
+license: apache-2.0
+datasets:
+- bigcode/the-stack-dedup
+library_name: transformers
+language:
+- code
+---
+## CodeSage-Small
+### Updates
+* [12/2024] <span style="color:blue">We are excited to announce the release of the CodeSage V2 model family with largely improved performance and flexible embedding dimensions!</span> Please check out our [models](https://huggingface.co/codesage) and [blogpost](https://code-representation-learning.github.io/codesage-v2.html) for more details.
+* [11/2024] You can now access CodeSage models through SentenceTransformer.
+### Model description
+CodeSage is a new family of open code embedding models with an encoder architecture that support a wide range of source code understanding tasks. It is introduced in the paper:
+[Code Representation Learning At Scale by
+Dejiao Zhang*, Wasi Uddin Ahmad*, Ming Tan, Hantian Ding, Ramesh Nallapati, Dan Roth, Xiaofei Ma, Bing Xiang](https://arxiv.org/abs/2402.01935) (* indicates equal contribution).
+### Pretraining data
+This checkpoint is trained on the Stack data (https://huggingface.co/datasets/bigcode/the-stack-dedup). Supported languages (9 in total) are as follows: c, c-sharp, go, java, javascript, typescript, php, python, ruby.
+### Training procedure
+This checkpoint is first trained on code data via masked language modeling (MLM) and then on bimodal text-code pair data. Please refer to the paper for more details.
+### How to Use
+This checkpoint consists of an encoder (130M model), which can be used to extract code embeddings of 1024 dimension.
+1. Accessing CodeSage via HuggingFace: it can be easily loaded using the AutoModel functionality and employs the [Starcoder Tokenizer](https://arxiv.org/pdf/2305.06161.pdf).
+```
+from transformers import AutoModel, AutoTokenizer
+checkpoint = "codesage/codesage-small"
+device = "cuda"  # for GPU usage or "cpu" for CPU usage
+# Note: CodeSage requires adding eos token at the end of each tokenized sequence
+tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, add_eos_token=True)
+model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)
+inputs = tokenizer.encode("def print_hello_world():\tprint('Hello World!')", return_tensors="pt").to(device)
+embedding = model(inputs)[0]
+```
+2. Accessing CodeSage via SentenceTransformer
+```
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("codesage/codesage-small", trust_remote_code=True)
+```
+### BibTeX entry and citation info
+```
+@inproceedings{
+    zhang2024code,
+    title={{CODE} {REPRESENTATION} {LEARNING} {AT} {SCALE}},
+    author={Dejiao Zhang and Wasi Uddin Ahmad and Ming Tan and Hantian Ding and Ramesh Nallapati and Dan Roth and Xiaofei Ma and Bing Xiang},
+    booktitle={The Twelfth International Conference on Learning Representations},
+    year={2024},
+    url={https://openreview.net/forum?id=vfzRRjumpX}
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<mask>": 49152,
+  "<pad>": 49153
+}

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "codesage/codesage-small",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "CodeSageModel"
+  ],
+  "attention_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "codesage/codesage-small--config_codesage.CodeSageConfig",
+    "AutoModel": "codesage/codesage-small--modeling_codesage.CodeSageModel",
+    "AutoModelForMaskedLM": "codesage/codesage-small--modeling_codesage.CodeSageForMaskedLM",
+    "AutoModelForSequenceClassification": "codesage/codesage-small--modeling_codesage.CodeSageForSequenceClassification",
+    "AutoTokenizer": "codesage/codesage-small--tokenization_codesage.CodeSageTokenizer"
+  },
+  "bos_token_id": 0,
+  "embedding_dropout_prob": 0.1,
+  "eos_token_id": 0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "codesage",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "pad_token_id": 49153,
+  "position_embedding_type": "absolute",
+  "residual_dropout_prob": 0.1,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0",
+  "vocab_size": 49154
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "__version__": {
+    "sentence_transformers": "3.3.0",
+    "transformers": "4.48.0.dev0",
+    "pytorch": "2.4.0"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08c4572853e48b68d738b0fd1e9a60e3884b649fe91a505a9f6a4b0c5efac053
+size 512047808

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 1024,
+  "do_lower_case": false
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": true,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "codesage/codesage-small--tokenization_codesage.CodeSageTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1024,
+  "pad_token": "<pad>",
+  "tokenizer_class": "CodeSageTokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff