Update modeling_reasonir_8b.py
modeling_reasonir_8b.py  +166 -5
CHANGED
@@ -51,6 +51,10 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.models.llama.configuration_llama import LlamaConfig
+from typing import Dict, List, Union, cast
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoTokenizer

 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func

@@ -428,7 +432,7 @@ class LlamaFlashAttention2(LlamaAttention):
         dropout=0.0,
         softmax_scale=None,
         use_sliding_windows=False,
-        is_causal=
+        is_causal=True,
     ):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token

@@ -529,7 +533,7 @@ class LlamaFlashAttention2(LlamaAttention):
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
-        is_causal: bool =
+        is_causal: bool = True,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if isinstance(past_key_value, StaticCache):
             raise ValueError(

@@ -656,7 +660,7 @@ class LlamaSdpaAttention(LlamaAttention):
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
-        is_causal: bool =
+        is_causal: bool = True,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:

@@ -763,7 +767,7 @@ class LlamaDecoderLayer(nn.Module):
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
-        is_causal: bool =
+        is_causal: bool = True,
         **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """

@@ -948,6 +952,8 @@ LLAMA_INPUTS_DOCSTRING = r"""
     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
     LLAMA_START_DOCSTRING,
 )
+
+
 class LlamaModel(LlamaPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

@@ -991,7 +997,7 @@ class LlamaModel(LlamaPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        is_causal: Optional[bool] =
+        is_causal: Optional[bool] = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
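The hunks above thread a new `is_causal` argument (default `True`) from `LlamaModel.forward` down through `LlamaDecoderLayer` and the attention implementations, so the model keeps causal masking for generation but can be switched to bi-directional attention per call; the final hunk below adds the `ReasonIRModel` encoder that flips this switch. A minimal sketch of the flag in use (illustrative only, not part of the commit; it assumes the repository wires `AutoModel` to this modeling file via `trust_remote_code`):

import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative sketch: load the patched model so LlamaModel.forward accepts `is_causal`.
tokenizer = AutoTokenizer.from_pretrained("reasonir/ReasonIR-8B")
model = AutoModel.from_pretrained("reasonir/ReasonIR-8B", torch_dtype="auto", trust_remote_code=True)

inputs = tokenizer("an example passage", return_tensors="pt")
with torch.inference_mode():
    # is_causal=True (the default) keeps standard causal masking for generation;
    # is_causal=False lets every token attend to the full sequence for embedding.
    outputs = model(**inputs, is_causal=False)
last_hidden_state = outputs[0]  # [batch, seq_len, hidden_size]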
@@ -1663,3 +1669,158 @@ class LlamaForTokenClassification(LlamaPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+
+class ReasonIRModel(LlamaModel):
+    """
+    ReasonIRModel is a wrapper around LlamaModel with bi-directional attention for retrieval tasks
+    """
+
+    def __init__(self, config: LlamaConfig):
+        """
+        Initializes the ReasonIRModel with the given configuration.
+        """
+        super().__init__(config)
+        self.pooling_method = "mean"
+        self.normalized = True
+        self.embed_eos = ""
+        self.reasonir_config = config
+        self.tokenizer = AutoTokenizer.from_pretrained('reasonir/ReasonIR-8B')
+
+    def encode_queries(self, queries: Union[List[str], str], **kwargs) -> np.ndarray:
+        """Used for encoding the queries of retrieval or reranking tasks"""
+        return self.encode(queries, **kwargs)
+
+    def encode_corpus(self, corpus: Union[List[str], str, List[Dict[str, str]]], **kwargs) -> np.ndarray:
+        """Used for encoding the corpus of retrieval tasks"""
+        if isinstance(corpus, dict):
+            corpus = [corpus]
+        if isinstance(corpus, list) and isinstance(corpus[0], dict):
+            corpus = [
+                doc["title"] + " " + doc["text"] if "title" in doc
+                else doc["text"] for doc in corpus
+            ]
+        return self.encode(corpus, **kwargs)
+
+    @torch.inference_mode()
+    def encode(
+        self,
+        sentences: Union[List[str], str],
+        batch_size: int = 256,
+        max_length: int = 512,
+        instruction: str = "",
+        embed_instruction: bool = False,
+        get_cache: bool = False,
+        convert_to_tensor: bool = False,
+        recast: bool = False,
+        add_special_tokens: bool = True,
+        **kwargs,
+    ) -> np.ndarray:
+
+        # get number of gpus
+        num_gpus = torch.cuda.device_count()
+        if num_gpus > 0:
+            batch_size *= num_gpus
+
+        input_was_string = False
+        if isinstance(sentences, str):
+            sentences = [sentences]
+            input_was_string = True
+
+        all_embeddings, all_kv_caches = [], []
+        for start_index in tqdm(range(0, len(sentences), batch_size), desc="Batches", disable=len(sentences)<256):
+            sentences_batch = [
+                instruction + s + self.embed_eos for s in sentences[start_index:start_index + batch_size]
+            ]
+            # This will prepend the bos token if the tokenizer has `add_bos_token=True`
+            inputs = self.tokenizer(
+                sentences_batch,
+                padding=True,
+                truncation=True,
+                return_tensors='pt',
+                max_length=max_length,
+                add_special_tokens=add_special_tokens,
+            ).to(self.device)
+
+            inputs["is_causal"] = False
+            if get_cache:
+                inputs['use_cache'] = True
+            outputs = self(**inputs)
+            last_hidden_state = outputs[0]
+            if get_cache:
+                # Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`
+                assert len(all_kv_caches) == 0, "Can only get cache for one batch at a time"
+                all_kv_caches = outputs[1]
+
+            if (instruction) and (embed_instruction is False) and ("mean" in self.pooling_method):
+                # Remove instruction tokens from the embeddings by masking them
+                instruction_tokens = self.tokenizer(
+                    instruction,
+                    padding=False,
+                    truncation=True,
+                    max_length=max_length,
+                    add_special_tokens=add_special_tokens,
+                )["input_ids"]
+                inputs['attention_mask'][:, :len(instruction_tokens)] = 0
+            embeddings = self.pooling(last_hidden_state, inputs['attention_mask'], recast=recast)
+            # Normalize can change the dtype (https://discuss.pytorch.org/t/tensor-in-float16-is-transformed-into-float32-after-torch-norm/110891)
+            if self.normalized:
+                in_dtype = embeddings.dtype
+                embeddings = torch.nn.functional.normalize(embeddings, dim=-1).to(in_dtype)
+            embeddings = cast(torch.Tensor, embeddings)
+            if convert_to_tensor:
+                all_embeddings.append(embeddings)
+            else:
+                # NumPy does not support bfloat16
+                all_embeddings.append(embeddings.cpu().to(torch.float32).numpy())
+
+        all_embeddings = (
+            torch.cat(all_embeddings, dim=0) if convert_to_tensor else np.concatenate(all_embeddings, axis=0)
+        )
+        if input_was_string:
+            all_embeddings = all_embeddings[0]
+        if get_cache:
+            return all_embeddings, all_kv_caches
+        return all_embeddings
+
+    def pooling(
+        self, hidden_state: torch.Tensor, attention_mask: torch.Tensor = None, recast: bool = False
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_state: [b, n, d]
+            attention_mask: [b, n]
+        """
+        # In case the model is distributed across multiple devices; hidden_state may end up on diff device
+        hidden_state = hidden_state.to(attention_mask.device)
+        if self.pooling_method == 'cls':
+            embedding = hidden_state[:, 0]
+        elif self.pooling_method == 'lasttoken':
+            b, n, d = hidden_state.size()
+            # Get the last `1` in the attention mask of each item
+            # Often it is just `gather_indices = torch.argmin(attention_mask, 1, keepdim=False) - 1`
+            # except when 1) There's all 1's 2) There's 0's before the 1's
+            reversed_mask = torch.flip(attention_mask, dims=(1,))
+            argmax_reverse = torch.argmax(reversed_mask, dim=1, keepdim=False)
+            gather_indices = attention_mask.size(1) - argmax_reverse - 1
+            # If there are empty sequences, where the index would become -1 it will crash so set them to 0
+            gather_indices = torch.clamp(gather_indices, min=0)
+            # Turn indices from shape [b] -> [b, 1, d]
+            gather_indices = gather_indices.unsqueeze(-1).repeat(1, d)
+            gather_indices = gather_indices.unsqueeze(1)
+            assert gather_indices.shape == (b, 1, d)
+            # Gather along the seq len: [b, n, d] -> [b, d]
+            # Actually no need for the attention mask as we gather the last token where attn_mask=1 but
+            # as some indices (which shouldn't be attended to) may be 0 due to clamp, use mask to ignore them again
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand((b, n, d)).float()
+            embedding = torch.gather(hidden_state * input_mask_expanded, 1, gather_indices).squeeze(dim=1)
+        elif self.pooling_method in ['mean', 'weightedmean']:
+            if self.pooling_method == 'weightedmean':
+                attention_mask *= attention_mask.cumsum(dim=1) # [0,1,1,1,0,0] -> [0,1,2,3,0,0]
+            s = torch.sum(hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)
+            d = attention_mask.sum(dim=1, keepdim=True).float()
+            embedding = s / d
+        else: raise NotImplementedError(f"Unknown pooling method: {self.pooling_method}")
+        # Recasting performs slightly worse but saves 50% space
+        if recast: return embedding.to(hidden_state.dtype)
+        return embedding
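For reference, a usage sketch of the encoder API added above (illustrative, not part of the commit; it assumes `AutoModel` with `trust_remote_code=True` resolves to `ReasonIRModel`, and relies on `encode` returning L2-normalized embeddings so that dot products are cosine similarities):

import numpy as np
from transformers import AutoModel

# Hypothetical usage sketch of ReasonIRModel.encode_queries / encode_corpus.
model = AutoModel.from_pretrained("reasonir/ReasonIR-8B", torch_dtype="auto", trust_remote_code=True)
model.eval()

query = "Which attention pattern is used for retrieval embeddings?"
docs = [
    {"title": "Bi-directional attention", "text": "Every token attends to the full passage."},
    {"title": "Causal attention", "text": "Each token only attends to previous tokens."},
]

query_emb = model.encode_queries(query)   # np.ndarray of shape [hidden_size]
doc_emb = model.encode_corpus(docs)       # titles and texts are concatenated, shape [2, hidden_size]

# Embeddings are normalized (self.normalized = True), so the dot product is cosine similarity.
scores = doc_emb @ query_emb
print(np.argsort(-scores))                # document indices ranked by relevance

Passing `instruction=...` to `encode` prepends it to each input, and with the default `embed_instruction=False` the instruction tokens are masked out of the mean pooling.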