Upload 16 files
Browse files
- 1_Pooling/config.json +10 -0
- README.md +494 -3
- config.json +26 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +20 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +65 -0
- trainer_state.json +78 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false,
|
9 |
+
"include_prompt": true
|
10 |
+
}
|
README.md
CHANGED
@@ -1,3 +1,494 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- sentence-transformers
|
4 |
+
- sentence-similarity
|
5 |
+
- feature-extraction
|
6 |
+
- generated_from_trainer
|
7 |
+
- dataset_size:6300
|
8 |
+
- loss:CachedMultipleNegativesRankingLoss
|
9 |
+
base_model: sentence-transformers/all-MiniLM-L6-v2
|
10 |
+
widget:
|
11 |
+
- source_sentence: How can AnimateDiff, a motion adapter for pretrained diffusion
|
12 |
+
models, be used to generate videos from images?
|
13 |
+
sentences:
|
14 |
+
- 'Performs a single real input radix-2 transformation on the provided data Kind:
|
15 |
+
instance method ofP2FFT The input data array The output data array The output
|
16 |
+
offset The input offset The step'
|
17 |
+
- AnimateDiffis an adapter model that inserts a motion module into a pretrained
|
18 |
+
diffusion model to animate an image. The adapter is trained on video clips to
|
19 |
+
learn motion which is used to condition the generation process to create a video.
|
20 |
+
It is faster and easier to only train the adapter and it can be loaded into most
|
21 |
+
diffusion models, effectively turning them into “video models”. Start by loading
|
22 |
+
aMotionAdapter. Then load a finetuned Stable Diffusion model with theAnimateDiffPipeline.
|
23 |
+
Create a prompt and generate the video.
|
24 |
+
- 'Utility class to handle streaming of tokens generated by whisper speech-to-text
|
25 |
+
models. Callback functions are invoked when each of the following events occur:
|
26 |
+
Kind: static class ofgeneration/streamers'
|
27 |
+
- source_sentence: How to configure DeepSpeed, including ZeRO-2 and bf16 precision,
|
28 |
+
for optimal performance with Intel Gaudi HPUs?
|
29 |
+
sentences:
|
30 |
+
- 'The DeepSpeed configuration to use is passed through a JSON file and enables
|
31 |
+
you to choose the optimizations to apply. Here is an example for applying ZeRO-2
|
32 |
+
optimizations andbf16precision: The special value"auto"enables to automatically
|
33 |
+
get the correct or most efficient value. You can also specify the values yourself
|
34 |
+
but, if you do so, you should be careful not to have conflicting values with your
|
35 |
+
training arguments. It is strongly advised to readthis sectionin the Transformers
|
36 |
+
documentation to completely understand how this works. Other examples of configurations
|
37 |
+
for HPUs are proposedhereby Intel. TheTransformers documentationexplains how to
|
38 |
+
write a configuration from scratch very well. A more complete description of all
|
39 |
+
configuration possibilities is availablehere.'
|
40 |
+
- Creates a new instance of TokenizerModel. The configuration object for the TokenizerModel.
|
41 |
+
- Most Spaces should run out of the box after a GPU upgrade, but sometimes you’ll
|
42 |
+
need to install CUDA versions of the machine learning frameworks you use. Please,
|
43 |
+
follow this guide to ensure your Space takes advantage of the improved hardware.
|
44 |
+
- source_sentence: Can DeBERTa's question-answering model be fine-tuned for improved
|
45 |
+
information retrieval?
|
46 |
+
sentences:
|
47 |
+
- 'RegNetXis a convolutional network design space with simple, regular models with
|
48 |
+
parameters: depthddd, initial widthw0>0w_{0} > 0w0>0, and slopewa>0w_{a} > 0wa>0,
|
49 |
+
and generates a different block widthuju_{j}ujfor each blockj<dj < dj<d. The
|
50 |
+
key restriction for the RegNet types of model is that there is a linear parameterisation
|
51 |
+
of block widths (the design space only contains models with this linear structure):uj=w0+wa⋅ju_{j}
|
52 |
+
= w_{0} + w_{a}\cdot{j}uj=w0+wa⋅j ForRegNetXwe have additional restrictions:
|
53 |
+
we setb=1b = 1b=1(the bottleneck ratio),12≤d≤2812 \leq d \leq 2812≤d≤28, andwm≥2w_{m}
|
54 |
+
\geq 2wm≥2(the width multiplier).'
|
55 |
+
- 'DeBERTa Model with a span classification head on top for extractive question-answering
|
56 |
+
tasks like SQuAD (a linear layers on top of the hidden-states output to computespan
|
57 |
+
start logitsandspan end logits). Kind: static class ofmodels'
|
58 |
+
- 'The minimum length of the sequence to be generated. Corresponds to the length
|
59 |
+
of the input prompt +min_new_tokens. Its effect is overridden bymin_new_tokens,
|
60 |
+
if also set. Kind: instance property ofGenerationConfigDefault:0'
|
61 |
+
- source_sentence: How can I efficiently upload models from supported libraries like
|
62 |
+
Transformers to the Hugging Face Hub for improved information retrieval?
|
63 |
+
sentences:
|
64 |
+
- '🤗 Diffusers is compatible with Habana Gaudi through 🤗Optimum. Follow theinstallationguide
|
65 |
+
to install the SynapseAI and Gaudi drivers, and then install Optimum Habana: To
|
66 |
+
generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate
|
67 |
+
two instances: When you initialize the pipeline, you have to specifyuse_habana=Trueto
|
68 |
+
deploy it on HPUs and to get the fastest possible generation, you should enableHPU
|
69 |
+
graphswithuse_hpu_graphs=True. Finally, specify aGaudiConfigwhich can be downloaded
|
70 |
+
from theHabanaorganization on the Hub. Now you can call the pipeline to generate
|
71 |
+
images by batches from one or several prompts: For more information, check out
|
72 |
+
🤗 Optimum Habana’sdocumentationand theexampleprovided in the official GitHub repository.'
|
73 |
+
- 'While training and evaluating we record the following reward metrics:'
|
74 |
+
- 'First check if your model is from a library that has built-in support to push
|
75 |
+
to/load from the Hub, like Transformers, Diffusers, Timm, Asteroid, etc.:https://huggingface.co/docs/hub/models-libraries.
|
76 |
+
Below we’ll show how easy this is for a library like Transformers: Some libraries,
|
77 |
+
like Transformers, support loadingcode from the Hub. This is a way to make your
|
78 |
+
model work with Transformers using thetrust_remote_code=Trueflag. You may want
|
79 |
+
to consider this option instead of a full-fledged library integration.'
|
80 |
+
- source_sentence: How can I use Shiny for Python to build and deploy a Hugging Face
|
81 |
+
Space application?
|
82 |
+
sentences:
|
83 |
+
- Shiny for Pythonis a pure Python implementation of Shiny. This gives you access
|
84 |
+
to all of the great features of Shiny like reactivity, complex layouts, and modules
|
85 |
+
without needing to use R. Shiny for Python is ideal for Hugging Face applications
|
86 |
+
because it integrates smoothly with other Hugging Face tools. To get started deploying
|
87 |
+
a Space, click this button to select your hardware and specify if you want a public
|
88 |
+
or private Space. The Space template will populate a few files to get your app
|
89 |
+
started. app.py This file defines your app’s logic. To learn more about how to
|
90 |
+
modify this file, seethe Shiny for Python documentation. As your app gets more
|
91 |
+
complex, it’s a good idea to break your application logic up intomodules. Dockerfile
|
92 |
+
The Dockerfile for a Shiny for Python app is very minimal because the library
|
93 |
+
doesn’t have many system dependencies, but you may need to modify this file if
|
94 |
+
your application has additional system dependencies. The one essential feature
|
95 |
+
of this file is that it exposes and runs the app on the port specified in the
|
96 |
+
space README file (which is 7860 by default). requirements.txt The Space will
|
97 |
+
automatically install dependencies listed in the requirements.txt file. Note that
|
98 |
+
you must include shiny in this file.
|
99 |
+
- '(**kwargs) A context manager that will add each keyword argument passed toos.environand
|
100 |
+
remove them when exiting. Will convert the values inkwargsto strings and upper-case
|
101 |
+
all the keys. () A context manager that will temporarily clear environment variables.
|
102 |
+
When this context exits, the previous environment variables will be back. (mixed_precision=
|
103 |
+
''no''save_location: str = ''/github/home/.cache/huggingface/accelerate/default_config.yaml''use_xpu:
|
104 |
+
bool = False) Parameters Creates and saves a basic cluster config to be used on
|
105 |
+
a local machine with potentially multiple GPUs. Will also set CPU if it is a CPU-only
|
106 |
+
machine. When setting up 🤗 Accelerate for the first time, rather than runningaccelerate
|
107 |
+
config[~utils.write_basic_config] can be used as an alternative for quick configuration.
|
108 |
+
(local_process_index: intverbose: typing.Optional[bool] = None) Parameters Assigns
|
109 |
+
the current process to a specific NUMA node. Ideally most efficient when having
|
110 |
+
at least 2 cpus per node. This result is cached between calls. If you want to
|
111 |
+
override it, please useaccelerate.utils.environment.override_numa_afifnity. (local_process_index:
|
112 |
+
intverbose: typing.Optional[bool] = None) Parameters Overrides whatever NUMA affinity
|
113 |
+
is set for the current process. This is very taxing and requires recalculating
|
114 |
+
the affinity to set, ideally you should useutils.environment.set_numa_affinityinstead.
|
115 |
+
(func_or_cls) Decorator to clean up accelerate environment variables set by the
|
116 |
+
decorated class or function. In some circumstances, calling certain classes or
|
117 |
+
functions can result in accelerate env vars being set and not being cleaned up
|
118 |
+
afterwards. As an example, when calling: TrainingArguments(fp16=True, …) The following
|
119 |
+
env var will be set: ACCELERATE_MIXED_PRECISION=fp16 This can affect subsequent
|
120 |
+
code, since the env var takes precedence over TrainingArguments(fp16=False). This
|
121 |
+
is especially relevant for unit testing, where we want to avoid the individual
|
122 |
+
tests to have side effects on one another. Decorate the unit test function or
|
123 |
+
whole class with this decorator to ensure that after each test, the env vars are
|
124 |
+
cleaned up. This works for both unittest.TestCase and normal classes (pytest);
|
125 |
+
it also works when decorating the parent class.'
|
126 |
+
- 'Performs a real-valued forward FFT on the given input buffer and stores the result
|
127 |
+
in the given output buffer. The input buffer must contain real values only, while
|
128 |
+
the output buffer will contain complex values. The input and output buffers must
|
129 |
+
be different. Kind: instance method ofP2FFTThrows: The output buffer. The input
|
130 |
+
buffer containing real values.'
|
131 |
+
pipeline_tag: sentence-similarity
|
132 |
+
library_name: sentence-transformers
|
133 |
+
---
|
134 |
+
|
135 |
+
# SentenceTransformer based on sentence-transformers/all-MiniLM-L6-v2
|
136 |
+
|
137 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2). It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
138 |
+
|
139 |
+
## Model Details
|
140 |
+
|
141 |
+
### Model Description
|
142 |
+
- **Model Type:** Sentence Transformer
|
143 |
+
- **Base model:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) <!-- at revision c9745ed1d9f207416be6d2e6f8de32d1f16199bf -->
|
144 |
+
- **Maximum Sequence Length:** 256 tokens
|
145 |
+
- **Output Dimensionality:** 384 dimensions
|
146 |
+
- **Similarity Function:** Cosine Similarity
|
147 |
+
<!-- - **Training Dataset:** Unknown -->
|
148 |
+
<!-- - **Language:** Unknown -->
|
149 |
+
<!-- - **License:** Unknown -->
|
150 |
+
|
151 |
+
### Model Sources
|
152 |
+
|
153 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
154 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
|
155 |
+
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
156 |
+
|
157 |
+
### Full Model Architecture
|
158 |
+
|
159 |
+
```
|
160 |
+
SentenceTransformer(
|
161 |
+
(0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
|
162 |
+
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
163 |
+
(2): Normalize()
|
164 |
+
)
|
165 |
+
```
|
166 |
+
|
167 |
+
## Usage
|
168 |
+
|
169 |
+
### Direct Usage (Sentence Transformers)
|
170 |
+
|
171 |
+
First install the Sentence Transformers library:
|
172 |
+
|
173 |
+
```bash
|
174 |
+
pip install -U sentence-transformers
|
175 |
+
```
|
176 |
+
|
177 |
+
Then you can load this model and run inference.
|
178 |
+
```python
|
179 |
+
from sentence_transformers import SentenceTransformer
|
180 |
+
|
181 |
+
# Download from the 🤗 Hub
|
182 |
+
model = SentenceTransformer("sentence_transformers_model_id")
|
183 |
+
# Run inference
|
184 |
+
sentences = [
|
185 |
+
'How can I use Shiny for Python to build and deploy a Hugging Face Space application?',
|
186 |
+
'Shiny for Pythonis a pure Python implementation of Shiny. This gives you access to all of the great features of Shiny like reactivity, complex layouts, and modules without needing to use R. Shiny for Python is ideal for Hugging Face applications because it integrates smoothly with other Hugging Face tools. To get started deploying a Space, click this button to select your hardware and specify if you want a public or private Space. The Space template will populate a few files to get your app started. app.py This file defines your app’s logic. To learn more about how to modify this file, seethe Shiny for Python documentation. As your app gets more complex, it’s a good idea to break your application logic up intomodules. Dockerfile The Dockerfile for a Shiny for Python app is very minimal because the library doesn’t have many system dependencies, but you may need to modify this file if your application has additional system dependencies. The one essential feature of this file is that it exposes and runs the app on the port specified in the space README file (which is 7860 by default). requirements.txt The Space will automatically install dependencies listed in the requirements.txt file. Note that you must include shiny in this file.',
|
187 |
+
'Performs a real-valued forward FFT on the given input buffer and stores the result in the given output buffer. The input buffer must contain real values only, while the output buffer will contain complex values. The input and output buffers must be different. Kind: instance method ofP2FFTThrows: The output buffer. The input buffer containing real values.',
|
188 |
+
]
|
189 |
+
embeddings = model.encode(sentences)
|
190 |
+
print(embeddings.shape)
|
191 |
+
# [3, 384]
|
192 |
+
|
193 |
+
# Get the similarity scores for the embeddings
|
194 |
+
similarities = model.similarity(embeddings, embeddings)
|
195 |
+
print(similarities.shape)
|
196 |
+
# [3, 3]
|
197 |
+
```
|
198 |
+
|
199 |
+
<!--
|
200 |
+
### Direct Usage (Transformers)
|
201 |
+
|
202 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
203 |
+
|
204 |
+
</details>
|
205 |
+
-->
|
206 |
+
|
207 |
+
<!--
|
208 |
+
### Downstream Usage (Sentence Transformers)
|
209 |
+
|
210 |
+
You can finetune this model on your own dataset.
|
211 |
+
|
212 |
+
<details><summary>Click to expand</summary>
|
213 |
+
|
214 |
+
</details>
|
215 |
+
-->
|
216 |
+
|
217 |
+
<!--
|
218 |
+
### Out-of-Scope Use
|
219 |
+
|
220 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
221 |
+
-->
|
222 |
+
|
223 |
+
<!--
|
224 |
+
## Bias, Risks and Limitations
|
225 |
+
|
226 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
227 |
+
-->
|
228 |
+
|
229 |
+
<!--
|
230 |
+
### Recommendations
|
231 |
+
|
232 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
233 |
+
-->
|
234 |
+
|
235 |
+
## Training Details
|
236 |
+
|
237 |
+
### Training Dataset
|
238 |
+
|
239 |
+
#### Unnamed Dataset
|
240 |
+
|
241 |
+
* Size: 6,300 training samples
|
242 |
+
* Columns: <code>anchor</code> and <code>positive</code>
|
243 |
+
* Approximate statistics based on the first 1000 samples:
|
244 |
+
| | anchor | positive |
|
245 |
+
|:--------|:-----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
|
246 |
+
| type | string | string |
|
247 |
+
| details | <ul><li>min: 8 tokens</li><li>mean: 26.77 tokens</li><li>max: 189 tokens</li></ul> | <ul><li>min: 5 tokens</li><li>mean: 116.82 tokens</li><li>max: 256 tokens</li></ul> |
|
248 |
+
* Samples:
|
249 |
+
| anchor | positive |
|
250 |
+
|:-------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
251 |
+
| <code>How can I configure the `TextEncoderOnnxConfig` class for optimal ONNX export of a text encoder model intended for information retrieval?</code> | <code>(config: PretrainedConfigtask: str = 'feature-extraction'preprocessors: typing.Optional[typing.List[typing.Any]] = Noneint_dtype: str = 'int64'float_dtype: str = 'fp32'legacy: bool = False) Handles encoder-based text architectures.</code> |
|
252 |
+
| <code>How does PyTorch's shared tensor mechanism handle loading and saving, and what are its limitations?</code> | <code>The design is rather simple. We’re going to look for all shared tensors, then looking for all tensors covering the entire buffer (there can be multiple such tensors). That gives us multiple names which can be saved, we simply choose the first one Duringload_model, we are loading a bit likeload_state_dictdoes, except we’re looking into the model itself, to check for shared buffers, and ignoring the “missed keys” which were actually covered by virtue of buffer sharing (they were properly loaded since there was a buffer that loaded under the hood). Every other error is raised as-is Caveat: This means we’re dropping some keys within the file. meaning if you’re checking for the keys saved on disk, you will see some “missing tensors” or if you’re usingload_state_dict. Unless we start supporting shared tensors directly in the format there’s no real way around it.</code> |
|
253 |
+
| <code>How can I manage access tokens to secure my organization's resources?</code> | <code>Tokens Management enables organization administrators to oversee access tokens within their organization, ensuring secure access to organization resources.</code> |
|
254 |
+
* Loss: [<code>CachedMultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cachedmultiplenegativesrankingloss) with these parameters:
|
255 |
+
```json
|
256 |
+
{
|
257 |
+
"scale": 20.0,
|
258 |
+
"similarity_fct": "cos_sim",
|
259 |
+
"mini_batch_size": 1024
|
260 |
+
}
|
261 |
+
```
|
262 |
+
|
263 |
+
### Evaluation Dataset
|
264 |
+
|
265 |
+
#### Unnamed Dataset
|
266 |
+
|
267 |
+
* Size: 700 evaluation samples
|
268 |
+
* Columns: <code>anchor</code> and <code>positive</code>
|
269 |
+
* Approximate statistics based on the first 700 samples:
|
270 |
+
| | anchor | positive |
|
271 |
+
|:--------|:----------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
|
272 |
+
| type | string | string |
|
273 |
+
| details | <ul><li>min: 8 tokens</li><li>mean: 26.76 tokens</li><li>max: 67 tokens</li></ul> | <ul><li>min: 3 tokens</li><li>mean: 115.51 tokens</li><li>max: 256 tokens</li></ul> |
|
274 |
+
* Samples:
|
275 |
+
| anchor | positive |
|
276 |
+
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
277 |
+
| <code>How can I configure a DecoderSequence object for optimal information retrieval using a list of decoders and a configuration object?</code> | <code>Creates a new instance of DecoderSequence. The configuration object. The list of decoders to apply.</code> |
|
278 |
+
| <code>How can the `generationlogits_process.NoBadWordsLogitsProcessor` static class be effectively integrated into a retrieval model to improve filtering of inappropriate content?</code> | <code>Kind: static class ofgeneration/logits_process</code> |
|
279 |
+
| <code>How can I fine-tune the OpenVINO Sequence Classification model for improved information retrieval performance?</code> | <code>(model= Noneconfig= None**kwargs) Parameters OpenVINO Model with a SequenceClassifierOutput for sequence classification tasks. This model inherits fromoptimum.intel.openvino.modeling.OVBaseModel. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving) (input_ids: typing.Union[torch.Tensor, numpy.ndarray]attention_mask: typing.Union[torch.Tensor, numpy.ndarray]token_type_ids: typing.Union[torch.Tensor, numpy.ndarray, NoneType] = None**kwargs) Parameters TheOVModelForSequenceClassificationforward method, overrides the__call__special method. Although the recipe for forward pass needs to be defined within this function, one should call theModuleinstance afterwards instead of this since the former takes care of running the pre and post processing steps while the latter silently ignores them. Example of sequence classification usingtransformers.pipeline:</code> |
|
280 |
+
* Loss: [<code>CachedMultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cachedmultiplenegativesrankingloss) with these parameters:
|
281 |
+
```json
|
282 |
+
{
|
283 |
+
"scale": 20.0,
|
284 |
+
"similarity_fct": "cos_sim",
|
285 |
+
"mini_batch_size": 1024
|
286 |
+
}
|
287 |
+
```
|
288 |
+
|
289 |
+
### Training Hyperparameters
|
290 |
+
#### Non-Default Hyperparameters
|
291 |
+
|
292 |
+
- `eval_strategy`: steps
|
293 |
+
- `per_device_train_batch_size`: 32
|
294 |
+
- `per_device_eval_batch_size`: 32
|
295 |
+
- `learning_rate`: 2e-05
|
296 |
+
- `weight_decay`: 0.01
|
297 |
+
- `num_train_epochs`: 5
|
298 |
+
- `warmup_ratio`: 0.1
|
299 |
+
- `warmup_steps`: 50 (when greater than 0, this overrides `warmup_ratio`)
|
300 |
+
- `fp16`: True
|
301 |
+
- `load_best_model_at_end`: True
|
302 |
+
- `batch_sampler`: no_duplicates
|
303 |
+
|
304 |
+
#### All Hyperparameters
|
305 |
+
<details><summary>Click to expand</summary>
|
306 |
+
|
307 |
+
- `overwrite_output_dir`: False
|
308 |
+
- `do_predict`: False
|
309 |
+
- `eval_strategy`: steps
|
310 |
+
- `prediction_loss_only`: True
|
311 |
+
- `per_device_train_batch_size`: 32
|
312 |
+
- `per_device_eval_batch_size`: 32
|
313 |
+
- `per_gpu_train_batch_size`: None
|
314 |
+
- `per_gpu_eval_batch_size`: None
|
315 |
+
- `gradient_accumulation_steps`: 1
|
316 |
+
- `eval_accumulation_steps`: None
|
317 |
+
- `torch_empty_cache_steps`: None
|
318 |
+
- `learning_rate`: 2e-05
|
319 |
+
- `weight_decay`: 0.01
|
320 |
+
- `adam_beta1`: 0.9
|
321 |
+
- `adam_beta2`: 0.999
|
322 |
+
- `adam_epsilon`: 1e-08
|
323 |
+
- `max_grad_norm`: 1.0
|
324 |
+
- `num_train_epochs`: 5
|
325 |
+
- `max_steps`: -1
|
326 |
+
- `lr_scheduler_type`: linear
|
327 |
+
- `lr_scheduler_kwargs`: {}
|
328 |
+
- `warmup_ratio`: 0.1
|
329 |
+
- `warmup_steps`: 50
|
330 |
+
- `log_level`: passive
|
331 |
+
- `log_level_replica`: warning
|
332 |
+
- `log_on_each_node`: True
|
333 |
+
- `logging_nan_inf_filter`: True
|
334 |
+
- `save_safetensors`: True
|
335 |
+
- `save_on_each_node`: False
|
336 |
+
- `save_only_model`: False
|
337 |
+
- `restore_callback_states_from_checkpoint`: False
|
338 |
+
- `no_cuda`: False
|
339 |
+
- `use_cpu`: False
|
340 |
+
- `use_mps_device`: False
|
341 |
+
- `seed`: 42
|
342 |
+
- `data_seed`: None
|
343 |
+
- `jit_mode_eval`: False
|
344 |
+
- `use_ipex`: False
|
345 |
+
- `bf16`: False
|
346 |
+
- `fp16`: True
|
347 |
+
- `fp16_opt_level`: O1
|
348 |
+
- `half_precision_backend`: auto
|
349 |
+
- `bf16_full_eval`: False
|
350 |
+
- `fp16_full_eval`: False
|
351 |
+
- `tf32`: None
|
352 |
+
- `local_rank`: 0
|
353 |
+
- `ddp_backend`: None
|
354 |
+
- `tpu_num_cores`: None
|
355 |
+
- `tpu_metrics_debug`: False
|
356 |
+
- `debug`: []
|
357 |
+
- `dataloader_drop_last`: False
|
358 |
+
- `dataloader_num_workers`: 0
|
359 |
+
- `dataloader_prefetch_factor`: None
|
360 |
+
- `past_index`: -1
|
361 |
+
- `disable_tqdm`: False
|
362 |
+
- `remove_unused_columns`: True
|
363 |
+
- `label_names`: None
|
364 |
+
- `load_best_model_at_end`: True
|
365 |
+
- `ignore_data_skip`: False
|
366 |
+
- `fsdp`: []
|
367 |
+
- `fsdp_min_num_params`: 0
|
368 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
369 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
370 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
371 |
+
- `deepspeed`: None
|
372 |
+
- `label_smoothing_factor`: 0.0
|
373 |
+
- `optim`: adamw_torch
|
374 |
+
- `optim_args`: None
|
375 |
+
- `adafactor`: False
|
376 |
+
- `group_by_length`: False
|
377 |
+
- `length_column_name`: length
|
378 |
+
- `ddp_find_unused_parameters`: None
|
379 |
+
- `ddp_bucket_cap_mb`: None
|
380 |
+
- `ddp_broadcast_buffers`: False
|
381 |
+
- `dataloader_pin_memory`: True
|
382 |
+
- `dataloader_persistent_workers`: False
|
383 |
+
- `skip_memory_metrics`: True
|
384 |
+
- `use_legacy_prediction_loop`: False
|
385 |
+
- `push_to_hub`: False
|
386 |
+
- `resume_from_checkpoint`: None
|
387 |
+
- `hub_model_id`: None
|
388 |
+
- `hub_strategy`: every_save
|
389 |
+
- `hub_private_repo`: None
|
390 |
+
- `hub_always_push`: False
|
391 |
+
- `gradient_checkpointing`: False
|
392 |
+
- `gradient_checkpointing_kwargs`: None
|
393 |
+
- `include_inputs_for_metrics`: False
|
394 |
+
- `include_for_metrics`: []
|
395 |
+
- `eval_do_concat_batches`: True
|
396 |
+
- `fp16_backend`: auto
|
397 |
+
- `push_to_hub_model_id`: None
|
398 |
+
- `push_to_hub_organization`: None
|
399 |
+
- `mp_parameters`:
|
400 |
+
- `auto_find_batch_size`: False
|
401 |
+
- `full_determinism`: False
|
402 |
+
- `torchdynamo`: None
|
403 |
+
- `ray_scope`: last
|
404 |
+
- `ddp_timeout`: 1800
|
405 |
+
- `torch_compile`: False
|
406 |
+
- `torch_compile_backend`: None
|
407 |
+
- `torch_compile_mode`: None
|
408 |
+
- `dispatch_batches`: None
|
409 |
+
- `split_batches`: None
|
410 |
+
- `include_tokens_per_second`: False
|
411 |
+
- `include_num_input_tokens_seen`: False
|
412 |
+
- `neftune_noise_alpha`: None
|
413 |
+
- `optim_target_modules`: None
|
414 |
+
- `batch_eval_metrics`: False
|
415 |
+
- `eval_on_start`: False
|
416 |
+
- `use_liger_kernel`: False
|
417 |
+
- `eval_use_gather_object`: False
|
418 |
+
- `average_tokens_across_devices`: False
|
419 |
+
- `prompts`: None
|
420 |
+
- `batch_sampler`: no_duplicates
|
421 |
+
- `multi_dataset_batch_sampler`: proportional
|
422 |
+
|
423 |
+
</details>
|
424 |
+
|
425 |
+
### Training Logs
|
426 |
+
| Epoch | Step | Training Loss | Validation Loss |
|
427 |
+
|:------:|:----:|:-------------:|:---------------:|
|
428 |
+
| 0.5076 | 100 | 0.308 | - |
|
429 |
+
| 1.0152 | 200 | 0.179 | - |
|
430 |
+
| 1.5228 | 300 | 0.127 | 0.0739 |
|
431 |
+
| 2.0305 | 400 | 0.0828 | - |
|
432 |
+
| 2.5381 | 500 | 0.0528 | - |
|
433 |
+
| 3.0457 | 600 | 0.0576 | 0.0436 |
|
434 |
+
| 3.5533 | 700 | 0.0396 | - |
|
435 |
+
| 1.0152 | 200 | 0.0262 | 0.0379 |
|
436 |
+
| 2.0305 | 400 | 0.0159 | 0.0360 |
|
437 |
+
| 3.0457 | 600 | 0.0082 | 0.0340 |
|
438 |
+
|
439 |
+
|
440 |
+
### Framework Versions
|
441 |
+
- Python: 3.10.12
|
442 |
+
- Sentence Transformers: 4.0.1
|
443 |
+
- Transformers: 4.47.0
|
444 |
+
- PyTorch: 2.5.1+cu121
|
445 |
+
- Accelerate: 1.2.1
|
446 |
+
- Datasets: 3.3.1
|
447 |
+
- Tokenizers: 0.21.0
|
448 |
+
|
449 |
+
## Citation
|
450 |
+
|
451 |
+
### BibTeX
|
452 |
+
|
453 |
+
#### Sentence Transformers
|
454 |
+
```bibtex
|
455 |
+
@inproceedings{reimers-2019-sentence-bert,
|
456 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
457 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
458 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
459 |
+
month = "11",
|
460 |
+
year = "2019",
|
461 |
+
publisher = "Association for Computational Linguistics",
|
462 |
+
url = "https://arxiv.org/abs/1908.10084",
|
463 |
+
}
|
464 |
+
```
|
465 |
+
|
466 |
+
#### CachedMultipleNegativesRankingLoss
|
467 |
+
```bibtex
|
468 |
+
@misc{gao2021scaling,
|
469 |
+
title={Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
|
470 |
+
author={Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
|
471 |
+
year={2021},
|
472 |
+
eprint={2101.06983},
|
473 |
+
archivePrefix={arXiv},
|
474 |
+
primaryClass={cs.LG}
|
475 |
+
}
|
476 |
+
```
|
477 |
+
|
478 |
+
<!--
|
479 |
+
## Glossary
|
480 |
+
|
481 |
+
*Clearly define terms in order to be accessible across audiences.*
|
482 |
+
-->
|
483 |
+
|
484 |
+
<!--
|
485 |
+
## Model Card Authors
|
486 |
+
|
487 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
488 |
+
-->
|
489 |
+
|
490 |
+
<!--
|
491 |
+
## Model Card Contact
|
492 |
+
|
493 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
494 |
+
-->
|
config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 384,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 1536,
|
14 |
+
"layer_norm_eps": 1e-12,
|
15 |
+
"max_position_embeddings": 512,
|
16 |
+
"model_type": "bert",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 6,
|
19 |
+
"pad_token_id": 0,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"transformers_version": "4.47.0",
|
23 |
+
"type_vocab_size": 2,
|
24 |
+
"use_cache": true,
|
25 |
+
"vocab_size": 30522
|
26 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "4.0.1",
|
4 |
+
"transformers": "4.47.0",
|
5 |
+
"pytorch": "2.5.1+cu121"
|
6 |
+
},
|
7 |
+
"prompts": {},
|
8 |
+
"default_prompt_name": null,
|
9 |
+
"similarity_fn_name": "cosine"
|
10 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cdef8549522565179efdba24e1e563b4d5154ccae6b5b7e6ddf37193949608b9
|
3 |
+
size 90864192
|
modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09e0b047b047dd34f9f4456d9cbb2d934eb30ceed5ea3039b672719ff49fda37
|
3 |
+
size 180582586
|
rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ceea43eaeac5d2e91fffbdb3ea2a32a5149a290cfbc0a044b63f9d231679f9c9
|
3 |
+
size 14244
|
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aa84a77ec759e53dfb8e9c7ce3af994cbfabec2d7215bd67c4980d6fdcc70410
|
3 |
+
size 1000
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 256,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": false,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"extra_special_tokens": {},
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"max_length": 128,
|
51 |
+
"model_max_length": 256,
|
52 |
+
"never_split": null,
|
53 |
+
"pad_to_multiple_of": null,
|
54 |
+
"pad_token": "[PAD]",
|
55 |
+
"pad_token_type_id": 0,
|
56 |
+
"padding_side": "right",
|
57 |
+
"sep_token": "[SEP]",
|
58 |
+
"stride": 0,
|
59 |
+
"strip_accents": null,
|
60 |
+
"tokenize_chinese_chars": true,
|
61 |
+
"tokenizer_class": "BertTokenizer",
|
62 |
+
"truncation_side": "right",
|
63 |
+
"truncation_strategy": "longest_first",
|
64 |
+
"unk_token": "[UNK]"
|
65 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.034021906554698944,
|
3 |
+
"best_model_checkpoint": "models/HG/checkpoint-600",
|
4 |
+
"epoch": 3.045685279187817,
|
5 |
+
"eval_steps": 200,
|
6 |
+
"global_step": 600,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.015228426395939,
|
13 |
+
"grad_norm": 0.2426794022321701,
|
14 |
+
"learning_rate": 6.349206349206349e-06,
|
15 |
+
"loss": 0.0262,
|
16 |
+
"step": 200
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.015228426395939,
|
20 |
+
"eval_loss": 0.037922926247119904,
|
21 |
+
"eval_runtime": 1.4442,
|
22 |
+
"eval_samples_per_second": 484.703,
|
23 |
+
"eval_steps_per_second": 15.234,
|
24 |
+
"step": 200
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"epoch": 2.030456852791878,
|
28 |
+
"grad_norm": 0.3046327233314514,
|
29 |
+
"learning_rate": 1.2698412698412699e-05,
|
30 |
+
"loss": 0.0159,
|
31 |
+
"step": 400
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"epoch": 2.030456852791878,
|
35 |
+
"eval_loss": 0.03600747138261795,
|
36 |
+
"eval_runtime": 1.4409,
|
37 |
+
"eval_samples_per_second": 485.792,
|
38 |
+
"eval_steps_per_second": 15.268,
|
39 |
+
"step": 400
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 3.045685279187817,
|
43 |
+
"grad_norm": 0.08811034262180328,
|
44 |
+
"learning_rate": 1.904761904761905e-05,
|
45 |
+
"loss": 0.0082,
|
46 |
+
"step": 600
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"epoch": 3.045685279187817,
|
50 |
+
"eval_loss": 0.034021906554698944,
|
51 |
+
"eval_runtime": 1.4347,
|
52 |
+
"eval_samples_per_second": 487.892,
|
53 |
+
"eval_steps_per_second": 15.334,
|
54 |
+
"step": 600
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"logging_steps": 200,
|
58 |
+
"max_steps": 985,
|
59 |
+
"num_input_tokens_seen": 0,
|
60 |
+
"num_train_epochs": 5,
|
61 |
+
"save_steps": 200,
|
62 |
+
"stateful_callbacks": {
|
63 |
+
"TrainerControl": {
|
64 |
+
"args": {
|
65 |
+
"should_epoch_stop": false,
|
66 |
+
"should_evaluate": false,
|
67 |
+
"should_log": false,
|
68 |
+
"should_save": true,
|
69 |
+
"should_training_stop": false
|
70 |
+
},
|
71 |
+
"attributes": {}
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"total_flos": 0.0,
|
75 |
+
"train_batch_size": 32,
|
76 |
+
"trial_name": null,
|
77 |
+
"trial_params": null
|
78 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23380fa299427d290096f77e3ed9521960f4df5a9d90a6c252fea893be909ff4
|
3 |
+
size 5560
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|