Update modeling_reasonir_8b.py
modeling_reasonir_8b.py  +8 -8
@@ -201,8 +201,8 @@ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
 
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
-    x1 = x[
-    x2 = x[
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
 
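The restored rotate_half is the standard RoPE helper consumed by apply_rotary_pos_emb (q * cos + rotate_half(q) * sin). A minimal standalone sanity check, assuming only torch is installed, that the restored slicing behaves as a quarter-turn rotation (applying it twice negates the input):

import torch

def rotate_half(x):
    """Rotates half the hidden dims of the input (as restored above)."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

x = torch.randn(2, 4, 8)  # (batch, seq_len, head_dim) with an even head_dim
assert torch.equal(rotate_half(rotate_half(x)), -x)  # two applications flip the sign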
@@ -1313,9 +1313,9 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
         r"""
         Args:
             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0,
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0,
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
             num_logits_to_keep (`int`, *optional*):
                 Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
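The corrected labels docstring describes the usual causal-LM convention: positions labelled -100 contribute nothing to the loss (the Llama-style forward additionally shifts logits and labels by one position before this reduction). A hedged sketch of that masking in plain torch, independent of the model itself:

import torch
import torch.nn.functional as F

# Toy logits for a 5-token sequence over a 10-token vocabulary.
logits = torch.randn(1, 5, 10)
labels = torch.tensor([[3, 7, -100, -100, 1]])  # -100 marks positions to ignore

# -100 entries are excluded from the cross-entropy reduction.
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)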
@@ -1433,7 +1433,7 @@ class LlamaForSequenceClassification(LlamaPreTrainedModel):
     ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0,
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
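For the sequence-classification head, the docstring distinguishes two regimes by config.num_labels; a small illustration of both losses in plain torch (shapes are arbitrary placeholders, not taken from this model):

import torch
import torch.nn.functional as F

# num_labels > 1: classification, labels in [0, ..., num_labels - 1], Cross-Entropy.
pooled_logits = torch.randn(4, 3)             # (batch_size, num_labels)
class_labels = torch.tensor([0, 2, 1, 2])
clf_loss = F.cross_entropy(pooled_logits, class_labels)

# num_labels == 1: regression against float targets, Mean-Square loss.
reg_logits = torch.randn(4, 1)
reg_targets = torch.randn(4, 1)
reg_loss = F.mse_loss(reg_logits, reg_targets)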
@@ -1500,7 +1500,7 @@ SQuAD (a linear layer on top of the hidden-states output to compute `span start
 class LlamaForQuestionAnswering(LlamaPreTrainedModel):
     base_model_prefix = "transformer"
 
-    # Copied from
+    # Copied from ...models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama
     def __init__(self, config):
         super().__init__(config)
         self.transformer = LlamaModel(config)
@@ -1628,7 +1628,7 @@ class LlamaForTokenClassification(LlamaPreTrainedModel):
     ) -> Union[Tuple, TokenClassifierOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0,
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
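Token-classification heads returning TokenClassifierOutput typically score every position, so the same Cross-Entropy is taken over one label per token rather than one per sequence; a brief sketch of that flattened reduction (plain torch, not this model's own code):

import torch
import torch.nn.functional as F

num_labels = 5
token_logits = torch.randn(2, 6, num_labels)         # (batch_size, seq_len, num_labels)
token_labels = torch.randint(0, num_labels, (2, 6))  # one label per token
loss = F.cross_entropy(token_logits.view(-1, num_labels), token_labels.view(-1))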
@@ -1662,4 +1662,4 @@ class LlamaForTokenClassification(LlamaPreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )