Update modeling_reasonir_8b.py
modeling_reasonir_8b.py  CHANGED  (+17 -17)
@@ -26,22 +26,22 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache, StaticCache
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
-from ...generation import GenerationMixin
-from ...modeling_attn_mask_utils import AttentionMaskConverter
-from ...modeling_outputs import (
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa, _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
+from transformers.generation import GenerationMixin
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     QuestionAnsweringModelOutput,
     SequenceClassifierOutputWithPast,
     TokenClassifierOutput,
 )
-from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS
-from ...utils import (
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
+from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -201,8 +201,8 @@ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
 
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
+    x1 = x[transformers., : x.shape[-1] // 2]
+    x2 = x[transformers., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
 
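
For reference, the upstream Llama code this file mirrors (`transformers.models.llama.modeling_llama`) defines `rotate_half` with an Ellipsis (`...`) slice; if the `+` lines above are taken literally, `x[transformers., ...]` would be a syntax error. A standalone sketch of the conventional definition, with a toy tensor to show the shapes:

    import torch

    def rotate_half(x):
        """Rotate the last dimension: (x1, x2) -> (-x2, x1), as used by rotary embeddings."""
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    q = torch.randn(1, 8, 16, 64)    # (batch, heads, seq_len, head_dim)
    print(rotate_half(q).shape)      # torch.Size([1, 8, 16, 64])
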
@@ -1313,9 +1313,9 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
         r"""
         Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
 
             num_logits_to_keep (`int`, *optional*):
                 Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
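
The `labels` description above follows the usual causal-LM convention: positions labelled `-100` are excluded from the loss. A minimal sketch of that masking with toy tensors (shapes and values are invented for illustration; this is not the model's actual `forward`):

    import torch
    import torch.nn.functional as F

    vocab_size = 10
    logits = torch.randn(1, 6, vocab_size)           # (batch, seq_len, vocab)
    labels = torch.tensor([[3, 7, -100, 2, 9, 1]])   # -100 marks ignored positions

    # Next-token objective: position t predicts token t+1; -100 entries drop out of the loss.
    shift_logits = logits[:, :-1, :].reshape(-1, vocab_size)
    shift_labels = labels[:, 1:].reshape(-1)
    loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
    print(loss)
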
@@ -1433,7 +1433,7 @@ class LlamaForSequenceClassification(LlamaPreTrainedModel):
     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
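
A minimal sketch of the regression-vs-classification dispatch this docstring (and the identical one in the token-classification hunk below) describes; the function name and tensors are invented for illustration:

    import torch
    import torch.nn as nn

    def sketch_loss(logits, labels, num_labels):
        # num_labels == 1 -> regression (Mean-Square loss), otherwise Cross-Entropy.
        if num_labels == 1:
            return nn.MSELoss()(logits.squeeze(-1), labels.float())
        return nn.CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

    logits = torch.randn(4, 3)             # (batch, num_labels)
    labels = torch.tensor([0, 2, 1, 2])    # class indices in [0, num_labels - 1]
    print(sketch_loss(logits, labels, num_labels=3))
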
@@ -1628,7 +1628,7 @@ class LlamaForTokenClassification(LlamaPreTrainedModel):
     ) -> Union[Tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
@@ -1662,4 +1662,4 @@ class LlamaForTokenClassification(LlamaPreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )