nvedant07 committed
Commit f9972a2 · verified · 1 Parent(s): 58634eb

Upload 3 files


Added inference
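For reference, a minimal inference sketch against this upload. This is a hedged example: it assumes the checkpoint is published with an auto_map so it loads via trust_remote_code, that it runs on a CUDA device in bfloat16, and the repo id below is a placeholder. The helpers _prepare_input and generate are defined in model.py.

import torch
from transformers import AutoModelForCausalLM

# Placeholder repo id; substitute the actual model repository.
model = AutoModelForCausalLM.from_pretrained(
    "<org>/<hat-checkpoint>",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).to("cuda")
model.eval()

# _prepare_input applies the Llama chat template and splits the prompt into
# byte-level words; generate then decodes word by word using the HAT caches.
input_ids, cumulative_word_lengths = model._prepare_input("Explain byte-level language models.")
output = model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    cumulative_seq_lengths_per_word=cumulative_word_lengths,
    use_cache=True,
)
print(output.completion_text)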

Files changed (3)
  1. __init__.py +0 -0
  2. model.py +916 -0
  3. splitter.py +45 -0
__init__.py ADDED
File without changes
model.py ADDED
@@ -0,0 +1,916 @@
1
+ import itertools
2
+ from collections.abc import Sequence
3
+ from importlib.metadata import PackageNotFoundError, version
4
+ from typing import Callable
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from einops import rearrange
9
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
10
+ from transformers import PreTrainedModel
11
+ from transformers.cache_utils import Cache, DynamicCache
12
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
13
+ from transformers.models.llama.modeling_llama import (
14
+ LlamaDecoderLayer,
15
+ LlamaRotaryEmbedding,
16
+ )
17
+ from transformers.utils import ModelOutput
18
+
19
+ from .config import (
20
+ CrossAttentionConfig,
21
+ DecoderHATModelConfig,
22
+ EncoderHATModelConfig,
23
+ HATArchitectureConfig,
24
+ TransformerHATModelConfig,
25
+ )
26
+ from .splitter import HATSplitter
27
+
28
+ try:
29
+ transformers_version = version("transformers")
30
+ if transformers_version != "4.46.3":
31
+ print(f"Warning: Expected transformers version 4.46.3, but found {transformers_version}. Outputs might be different.")
32
+ except PackageNotFoundError:
33
+ print("transformers is not installed")
34
+
35
+
36
+ def sample_argmax(logits: torch.Tensor) -> torch.Tensor:
37
+ return torch.argmax(logits, dim=-1)[:, -1]
38
+
39
+
40
+ LLAMA_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
41
+ You are a helpful assistant. You give engaging, well-structured answers to user inquiries.<|eot_id|><|start_header_id|>user<|end_header_id|>
42
+ {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
43
+
44
+
45
+ class HATCache(Cache):
46
+ encoder_cache: DynamicCache
47
+ backbone_cache: DynamicCache
48
+ decoder_cache: DynamicCache
49
+
50
+ def __init__(self, *args, **kwargs):
51
+ super().__init__(*args, **kwargs)
52
+ self.encoder_cache = DynamicCache()
53
+ self.backbone_cache = DynamicCache()
54
+ self.decoder_cache = DynamicCache()
55
+
56
+ def get_backbone_cache(self) -> DynamicCache:
57
+ return self.backbone_cache
58
+
59
+ def get_decoder_cache(self) -> DynamicCache:
60
+ return self.decoder_cache
61
+
62
+ def get_encoder_cache(self) -> DynamicCache:
63
+ return self.encoder_cache
64
+
65
+
66
+ def rotate_half(x):
67
+ """Rotates half the hidden dims of the input."""
68
+ x1 = x[..., : x.shape[-1] // 2]
69
+ x2 = x[..., x.shape[-1] // 2 :]
70
+ return torch.cat((-x2, x1), dim=-1)
71
+
72
+
73
+ def apply_rotary_pos_emb(q, k, q_cos=None, q_sin=None, k_cos=None, k_sin=None, unsqueeze_dim=1):
74
+ """Applies Rotary Position Embedding to the query and key tensors,
75
+ allowing the query and key to have different sequence lengths.
76
+ Args:
77
+ q (`torch.Tensor`): The query tensor.
78
+ k (`torch.Tensor`): The key tensor.
79
+ q_cos (`torch.Tensor`): The cosine part of the rotary embedding.
80
+ q_sin (`torch.Tensor`): The sine part of the rotary embedding.
81
+ k_cos (`torch.Tensor`): The cosine part of the rotary embedding.
82
+ k_sin (`torch.Tensor`): The sine part of the rotary embedding.
83
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
84
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze
85
+ cos[position_ids] and sin[position_ids] so that they can be properly
86
+ broadcasted to the dimensions of q and k. For example, note
87
+ that cos[position_ids] and sin[position_ids] have the shape
88
+ [batch_size, seq_len, head_dim]. Then, if q and
89
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting
90
+ unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids]
91
+ broadcastable to the shapes of q and k. Similarly, if q and k have
92
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
93
+ Returns:
94
+ `tuple(torch.Tensor)` comprising of the query and key
95
+ tensors rotated using the Rotary Position Embedding.
96
+ """
97
+
98
+ q_cos = q_cos.unsqueeze(unsqueeze_dim)
99
+ q_sin = q_sin.unsqueeze(unsqueeze_dim)
100
+ k_cos = k_cos.unsqueeze(unsqueeze_dim)
101
+ k_sin = k_sin.unsqueeze(unsqueeze_dim)
102
+ q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
103
+ k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
104
+
105
+ return q_embed, k_embed
106
+
107
+
108
+ class HATBackbone(nn.Module):
109
+ def __init__(self, config: TransformerHATModelConfig, *args, **kwargs):
110
+ super().__init__(*args, **kwargs)
111
+
112
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
113
+ self.rotary_emb = LlamaRotaryEmbedding(config=config)
114
+
115
+ def forward(
116
+ self,
117
+ hidden_states: torch.Tensor,
118
+ position_ids: torch.Tensor | None = None,
119
+ past_key_values: DynamicCache | None = None,
120
+ use_cache: bool | None = False,
121
+ ) -> BaseModelOutputWithPast:
122
+ if use_cache and past_key_values is None:
123
+ past_key_values = DynamicCache()
124
+
125
+ if position_ids is None:
126
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
127
+ position_ids = torch.arange(
128
+ past_seen_tokens,
129
+ past_seen_tokens + hidden_states.shape[1],
130
+ device=hidden_states.device,
131
+ ).unsqueeze(0)
132
+
133
+ # create position embeddings to be shared across the decoder layers
134
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
135
+
136
+ for backbone_layer in self.layers:
137
+ layer_outputs = backbone_layer(
138
+ hidden_states,
139
+ position_ids=position_ids,
140
+ past_key_value=past_key_values,
141
+ use_cache=use_cache,
142
+ position_embeddings=position_embeddings,
143
+ )
144
+ hidden_states = layer_outputs[0]
145
+
146
+ return CausalLMOutputWithPast(
147
+ hidden_states=hidden_states,
148
+ past_key_values=past_key_values if use_cache else None,
149
+ )
150
+
151
+
152
+ class HATDecoderConnector(nn.Module):
153
+ def __init__(self, backbone_hidden_dim: int, *args, **kwargs):
154
+ super().__init__(*args, **kwargs)
155
+ self.first_word_embedding = torch.nn.Parameter(
156
+ torch.empty(
157
+ 1,
158
+ 1,
159
+ backbone_hidden_dim,
160
+ device="cuda",
161
+ dtype=torch.bfloat16,
162
+ )
163
+ )
164
+
165
+ def forward(
166
+ self,
167
+ backbone_activations: torch.Tensor,
168
+ ):
169
+ # Shift backbone activations right by one word: overwrite the last word slot with
+ # the learned first-word embedding, then roll so the bytes of word i cross-attend
+ # to the backbone output for word i-1.
+ activations = backbone_activations.clone()
170
+ activations[:, -1:, :] = self.first_word_embedding
171
+ activations = torch.roll(activations, shifts=1, dims=1)
172
+ return activations
173
+
174
+
175
+ class RMSNorm(nn.Module):
176
+ def __init__(self, dimensions: int, eps: float, device: torch.device, dtype: torch.dtype = torch.bfloat16, norm_in_fp32: bool = False):
177
+ super().__init__()
178
+ self.eps = eps
179
+ self.weight = torch.nn.Parameter(torch.ones(dimensions, dtype=dtype).to(device))
180
+ self.norm_in_fp32 = norm_in_fp32
181
+
182
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
183
+ original_dtype = x.dtype
184
+ if self.norm_in_fp32:
185
+ x = x.float()
186
+
187
+ out = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
188
+
189
+ if out.dtype != original_dtype:
190
+ out = out.to(original_dtype)
191
+
192
+ return out * self.weight
193
+
194
+
195
+ class HATDecoderBlock(nn.Module):
196
+ def __init__(
197
+ self,
198
+ add_cross_attention: bool,
199
+ config: DecoderHATModelConfig,
200
+ layer_idx: int,
201
+ *args,
202
+ **kwargs,
203
+ ):
204
+ super().__init__(*args, **kwargs)
205
+ self.add_cross_attention = add_cross_attention
206
+ self.config = config
207
+ self.llama_layer = LlamaDecoderLayer(config, layer_idx)
208
+ self.llama_layer.self_attn.sliding_window = config.sliding_window
209
+ if add_cross_attention:
210
+ self.cross_attention = HATCrossAttention(
211
+ hidden_size=config.cross_attention_config.hidden_size,
212
+ hidden_size_kv=config.cross_attention_config.hidden_size_kv,
213
+ hidden_size_q=config.cross_attention_config.hidden_size_q,
214
+ config=config,
215
+ cross_attention_config=config.cross_attention_config,
216
+ )
217
+
218
+ self.query_norm = RMSNorm(
219
+ config.cross_attention_config.hidden_size_q,
220
+ eps=config.rms_norm_eps,
221
+ device=torch.device("cuda"),
222
+ dtype=torch.bfloat16,
223
+ norm_in_fp32=False,
224
+ )
225
+
226
+ self.kv_norm = RMSNorm(
227
+ config.cross_attention_config.hidden_size_kv,
228
+ eps=config.rms_norm_eps,
229
+ device=torch.device("cuda"),
230
+ dtype=torch.bfloat16,
231
+ norm_in_fp32=False,
232
+ )
233
+
234
+ def apply_norm(self, activations):
235
+ return self.query_norm(activations), self.kv_norm(activations)
236
+
237
+ def forward(
238
+ self,
239
+ encoder_activations,
240
+ backbone_activations,
241
+ byte_position_ids,
242
+ word_position_ids,
243
+ cumulative_seq_lengths_per_word,
244
+ position_embeddings,
245
+ past_key_values,
246
+ use_cache,
247
+ ):
248
+ if self.add_cross_attention:
249
+ kv_activations = self.kv_norm(backbone_activations)
250
+ q_activations = self.query_norm(encoder_activations)
251
+
252
+ activations = self.cross_attention.forward(
253
+ q_activations=q_activations,
254
+ kv_activations=kv_activations,
255
+ position_ids_q=byte_position_ids,
256
+ position_ids_kv=word_position_ids,
257
+ cumulative_seq_q=cumulative_seq_lengths_per_word,
258
+ cumulative_seq_kv=torch.arange(0, kv_activations.size(1) + 1, device=encoder_activations.device, dtype=torch.int32),
259
+ causal=False,
260
+ )
261
+ encoder_activations = encoder_activations + activations
262
+
263
+ return self.llama_layer.forward(
264
+ hidden_states=encoder_activations,
265
+ position_ids=byte_position_ids,
266
+ position_embeddings=position_embeddings,
267
+ past_key_value=past_key_values,
268
+ use_cache=use_cache,
269
+ )[0]
270
+
271
+
272
+ class HATDecoder(nn.Module):
273
+ def __init__(self, config: DecoderHATModelConfig, *args, **kwargs):
274
+ super().__init__()
275
+
276
+ self.decoder_layers = nn.Sequential()
277
+ for layer_idx in range(config.num_hidden_layers):
278
+ add_cross_attention = config.cross_attn_every_layer or layer_idx == 0
279
+ self.decoder_layers.add_module(
280
+ str(layer_idx),
281
+ HATDecoderBlock(
282
+ add_cross_attention,
283
+ config,
284
+ layer_idx,
285
+ ),
286
+ )
287
+
288
+ self.rotary_emb = LlamaRotaryEmbedding(config=config)
289
+
290
+ def forward(
291
+ self,
292
+ backbone_activations: torch.Tensor,
293
+ activations: torch.Tensor,
294
+ cumulative_seq_lengths_per_word: torch.Tensor | None = None,
295
+ byte_position_ids: torch.Tensor | None = None,
296
+ word_position_ids: torch.Tensor | None = None,
297
+ past_key_values: DynamicCache | None = None,
298
+ use_cache: bool | None = False,
299
+ ) -> BaseModelOutputWithPast:
300
+ if use_cache and past_key_values is None:
301
+ past_key_values = DynamicCache()
302
+
303
+ if byte_position_ids is None:
304
+ past_seen_bytes = past_key_values.get_seq_length() if past_key_values is not None else 0
305
+ byte_position_ids = torch.arange(
306
+ past_seen_bytes,
307
+ past_seen_bytes + activations.size(1),
308
+ device=activations.device,
309
+ dtype=torch.int32,
310
+ ).unsqueeze(0)
311
+
312
+ if cumulative_seq_lengths_per_word is None:
313
+ cumulative_seq_lengths_per_word = torch.tensor([0, byte_position_ids.size(1)], dtype=byte_position_ids.dtype, device=byte_position_ids.device)
314
+
315
+ if word_position_ids is None:
316
+ raise ValueError("word_position_ids must be provided")  # TODO: derive word_position_ids when not provided
317
+
318
+ position_embeddings = self.rotary_emb(activations, byte_position_ids)
319
+
320
+ for _, layer in enumerate(self.decoder_layers):
321
+ activations = layer(
322
+ encoder_activations=activations,
323
+ backbone_activations=backbone_activations,
324
+ position_embeddings=position_embeddings,
325
+ cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
326
+ byte_position_ids=byte_position_ids,
327
+ word_position_ids=word_position_ids,
328
+ past_key_values=past_key_values,
329
+ use_cache=use_cache,
330
+ )
331
+
332
+ return BaseModelOutputWithPast(
333
+ last_hidden_state=activations,
334
+ past_key_values=past_key_values if use_cache else None,
335
+ )
336
+
337
+
338
+ class HATCrossAttention(nn.Module):
339
+ def __init__(
340
+ self,
341
+ hidden_size: int,
342
+ hidden_size_q: int,
343
+ hidden_size_kv: int,
344
+ config: EncoderHATModelConfig | DecoderHATModelConfig,
345
+ cross_attention_config: CrossAttentionConfig,
346
+ dtype: torch.dtype = torch.bfloat16,
347
+ ):
348
+ super().__init__()
349
+ self.hidden_size = hidden_size
350
+ self.hidden_size_q = hidden_size_q
351
+ self.hidden_size_kv = hidden_size_kv
352
+ self.num_heads = cross_attention_config.num_attention_heads
353
+ self.num_key_value_heads = cross_attention_config.attention_num_kv_heads
354
+ self.num_repeat_kv = cross_attention_config.num_attention_heads // cross_attention_config.attention_num_kv_heads
355
+ self.head_dim = hidden_size // self.num_heads
356
+
357
+ self.q_proj = nn.Linear(
358
+ in_features=hidden_size_q,
359
+ out_features=hidden_size,
360
+ dtype=dtype,
361
+ bias=False,
362
+ )
363
+
364
+ self.k_proj = nn.Linear(
365
+ in_features=hidden_size_kv,
366
+ out_features=hidden_size // self.num_repeat_kv,
367
+ dtype=dtype,
368
+ bias=False,
369
+ )
370
+
371
+ self.v_proj = nn.Linear(
372
+ in_features=hidden_size_kv,
373
+ out_features=hidden_size // self.num_repeat_kv,
374
+ dtype=dtype,
375
+ bias=False,
376
+ )
377
+
378
+ self.o_proj = nn.Linear(in_features=hidden_size, out_features=hidden_size_q, dtype=dtype, bias=False)
379
+
380
+ rope_theta = config.rope_theta
381
+ rope_type = config.rope_scaling["rope_type"]
382
+
383
+ self.rotary_emb = LlamaRotaryEmbedding(dim=self.head_dim, base=rope_theta, rope_type=rope_type)
384
+
385
+ def forward(
386
+ self,
387
+ q_activations: torch.Tensor,
388
+ kv_activations: torch.Tensor,
389
+ position_ids_q: torch.Tensor,
390
+ position_ids_kv: torch.Tensor,
391
+ cumulative_seq_kv: torch.Tensor,
392
+ cumulative_seq_q: torch.Tensor,
393
+ causal: bool = True,
394
+ use_cache: bool = False,
395
+ past_key_value: DynamicCache | None = None,
396
+ ):
397
+ q_len = cumulative_seq_q[-1]
398
+
399
+ bsz, _, _ = kv_activations.size()
400
+ query_states = self.q_proj(q_activations)
401
+ key_states = self.k_proj(kv_activations)
402
+ value_states = self.v_proj(kv_activations)
403
+
404
+ # TODO get rid of the double rearrange, this is just for compatibility with scaling
405
+ query_states = rearrange(query_states, "bsz seq_len (h d) -> bsz h seq_len d", h=self.num_heads)
406
+ key_states = rearrange(
407
+ key_states,
408
+ "bsz seq_len (h d) -> bsz h seq_len d",
409
+ h=self.num_key_value_heads,
410
+ )
411
+ value_states = rearrange(
412
+ value_states,
413
+ "bsz seq_len (h d) -> bsz h seq_len d",
414
+ h=self.num_key_value_heads,
415
+ )
416
+
417
+ # WIP: Should word_position_ids respect document boundaries?
418
+ q_cos, q_sin = self.rotary_emb(query_states, position_ids_q)
419
+ k_cos, k_sin = self.rotary_emb(key_states, position_ids_kv)
420
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, q_cos=q_cos, q_sin=q_sin, k_cos=k_cos, k_sin=k_sin)
421
+
422
+ query_states = rearrange(query_states, "bsz h seq_len d -> (bsz seq_len) h d")
423
+ key_states = rearrange(key_states, "bsz h seq_len d -> (bsz seq_len) h d")
424
+ value_states = rearrange(value_states, "bsz h seq_len d -> (bsz seq_len) h d")
425
+
426
+ attn_output = flash_attn_varlen_func(
427
+ query_states,
428
+ key_states,
429
+ value_states,
430
+ cu_seqlens_q=cumulative_seq_q,
431
+ cu_seqlens_k=cumulative_seq_kv,
432
+ max_seqlen_q=self._get_max_seqlen(cumulative_seq_q),
433
+ max_seqlen_k=self._get_max_seqlen(cumulative_seq_kv),
434
+ causal=False,
435
+ )
436
+
437
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
438
+
439
+ attn_output = self.o_proj(attn_output)
440
+ return attn_output
441
+
442
+ def _get_max_seqlen(self, cumulative_word_lengths: torch.Tensor):
443
+ diffs = cumulative_word_lengths[1:] - cumulative_word_lengths[:-1]
444
+ return int(diffs.max().item())
445
+
446
+
447
+ class HATEncoderConnector(nn.Module):
448
+ def __init__(
449
+ self,
450
+ config: EncoderHATModelConfig,
451
+ backbone_hidden_size: int,
452
+ dtype: torch.dtype = torch.bfloat16,
453
+ *args,
454
+ **kwargs,
455
+ ):
456
+ super().__init__(*args, **kwargs)
457
+ self.latent_query = torch.nn.Parameter(
458
+ torch.empty(
459
+ 1,
460
+ 1,
461
+ backbone_hidden_size,
462
+ device="cuda",
463
+ dtype=dtype,
464
+ )
465
+ )
466
+
467
+ self.cross_attention_encoder_connector = HATCrossAttention(
468
+ hidden_size=config.cross_attention_config.hidden_size,
469
+ hidden_size_q=backbone_hidden_size,
470
+ hidden_size_kv=config.hidden_size,
471
+ config=config,
472
+ cross_attention_config=config.cross_attention_config,
473
+ )
474
+
475
+ def forward(
476
+ self,
477
+ hidden_states: torch.Tensor,
478
+ cumulative_seq_lengths_per_word: torch.Tensor,
479
+ word_position_ids: torch.Tensor,
480
+ byte_position_ids: torch.Tensor,
481
+ ):
482
+ q_len = cumulative_seq_lengths_per_word.shape[0] - 1
483
+ latent_query_repeated = self.latent_query.expand(-1, q_len, -1)
484
+ cumulative_seq_lengths_q = torch.arange(
485
+ start=0,
486
+ end=latent_query_repeated.shape[1] + 1,
487
+ step=1,
488
+ device=self.latent_query.device,
489
+ dtype=torch.int32,
490
+ )
491
+ word_embeddings = self.cross_attention_encoder_connector.forward(
492
+ q_activations=latent_query_repeated,
493
+ kv_activations=hidden_states,
494
+ position_ids_q=word_position_ids,
495
+ position_ids_kv=byte_position_ids,
496
+ cumulative_seq_q=cumulative_seq_lengths_q,
497
+ cumulative_seq_kv=cumulative_seq_lengths_per_word,
498
+ )
499
+ return word_embeddings
500
+
501
+
502
+ class HATEncoder(nn.Module):
503
+ def __init__(
504
+ self,
505
+ config: EncoderHATModelConfig,
506
+ dtype: torch.dtype = torch.bfloat16,
507
+ *args,
508
+ **kwargs,
509
+ ):
510
+ super().__init__(*args, **kwargs)
511
+ self.embedding_layer = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype)
512
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
513
+ for layer in self.layers:
514
+ layer.self_attn.sliding_window = config.sliding_window
515
+
516
+ self.rotary_emb = LlamaRotaryEmbedding(config=config)
517
+
518
+ self.word_window_size = config.cross_attention_config.word_window_size
519
+
520
+ def forward(
521
+ self,
522
+ input_ids: torch.Tensor,
523
+ cumulative_seq_lengths_per_word: torch.Tensor | None = None,
524
+ byte_position_ids: torch.Tensor | None = None,
525
+ word_position_ids: torch.Tensor | None = None, # TODO: Remove
526
+ past_key_values: DynamicCache | None = None,
527
+ use_cache: bool | None = False,
528
+ ):
529
+ input_embeds = self.embedding_layer(input_ids)
530
+
531
+ if cumulative_seq_lengths_per_word is None:
532
+ cumulative_seq_lengths_per_word = torch.tensor([0, input_embeds.shape[1]], dtype=torch.int32, device=input_ids.device)
533
+
534
+ if use_cache and past_key_values is None:
535
+ past_key_values = DynamicCache()
536
+
537
+ if byte_position_ids is None:
538
+ past_seen_bytes = past_key_values.get_seq_length() if past_key_values is not None else 0
539
+ byte_position_ids = torch.arange(
540
+ past_seen_bytes,
541
+ past_seen_bytes + input_embeds.shape[1],
542
+ device=input_embeds.device,
543
+ ).unsqueeze(0)
544
+
545
+ if word_position_ids is None:
546
+ raise ValueError("word_position_ids must be provided")  # TODO: derive word_position_ids when not provided
547
+
548
+ hidden_states = input_embeds
549
+
550
+ # create position embeddings to be shared across the decoder layers
551
+ position_embeddings = self.rotary_emb(hidden_states, byte_position_ids)
552
+
553
+ for layer in self.layers:
554
+ layer_outputs = layer(
555
+ hidden_states,
556
+ position_ids=byte_position_ids,
557
+ past_key_value=past_key_values,
558
+ use_cache=use_cache,
559
+ position_embeddings=position_embeddings,
560
+ )
561
+ hidden_states = layer_outputs[0]
562
+
563
+ return CausalLMOutputWithPast(
564
+ hidden_states=hidden_states,
565
+ past_key_values=past_key_values if use_cache else None,
566
+ )
567
+
568
+
569
+ class HATForCausalLM(PreTrainedModel):
570
+ config_class = HATArchitectureConfig
571
+ _supports_flash_attn_2 = True
572
+ _supports_cache_class = True
573
+
574
+ def __init__(self, config: HATArchitectureConfig, *args, **kwargs):
575
+ super().__init__(config, *args, **kwargs)
576
+ self.config = config
577
+ self.eos_token_id = config.eos_token_id
578
+ self.encoder = HATEncoder(config.encoder_config)
579
+ self.encoder_connector = HATEncoderConnector(config.encoder_config, config.backbone_config.hidden_size)
580
+ self.backbone = HATBackbone(config.backbone_config)
581
+ self.decoder_connector = HATDecoderConnector(config.backbone_config.hidden_size)
582
+ self.decoder = HATDecoder(config.decoder_config)
583
+ self.splitter = HATSplitter(special_token_dict=config.special_token_dict, max_word_size=config.max_word_size)
584
+ self.layer_norm = RMSNorm(config.decoder_config.hidden_size, eps=config.decoder_config.rms_norm_eps, device=torch.device("cuda"), dtype=torch.bfloat16, norm_in_fp32=False)
585
+ self.lm_head = nn.Linear(
586
+ in_features=config.decoder_config.hidden_size,
587
+ out_features=config.decoder_config.vocab_size,
588
+ dtype=torch.bfloat16,
589
+ bias=False,
590
+ )
591
+
592
+ def forward(
593
+ self,
594
+ input_ids: torch.Tensor,
595
+ byte_position_ids: torch.Tensor,
596
+ cumulative_seq_lengths_per_word: torch.Tensor | None = None,
597
+ word_position_ids: torch.Tensor | None = None,
598
+ past_key_values: HATCache | None = None,
599
+ use_cache: bool = False,
600
+ ):
601
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
602
+
603
+ if past_key_values is None and use_cache:
604
+ past_key_values = HATCache()
605
+
606
+ encoder_past_key_values = past_key_values.get_encoder_cache() if past_key_values is not None else None
607
+ backbone_past_key_values = past_key_values.get_backbone_cache() if past_key_values is not None else None
608
+ decoder_past_key_values = past_key_values.get_decoder_cache() if past_key_values is not None else None
609
+
610
+ encoder_output: BaseModelOutputWithPast = self.encoder(
611
+ input_ids=input_ids,
612
+ cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
613
+ byte_position_ids=byte_position_ids,
614
+ word_position_ids=word_position_ids,
615
+ past_key_values=encoder_past_key_values,
616
+ use_cache=use_cache,
617
+ )
618
+ byte_level_activations = encoder_output.hidden_states
619
+
620
+ encoder_connector_output = self.encoder_connector(
621
+ byte_level_activations,
622
+ cumulative_seq_lengths_per_word,
623
+ word_position_ids,
624
+ byte_position_ids,
625
+ )
626
+ backbone_output: CausalLMOutputWithPast = self.backbone(
627
+ hidden_states=encoder_connector_output,
628
+ position_ids=word_position_ids,
629
+ past_key_values=backbone_past_key_values,
630
+ use_cache=use_cache,
631
+ )
632
+
633
+ predictive_word_embeddings = self.decoder_connector.forward(backbone_activations=backbone_output.hidden_states)
634
+
635
+ decoder_output = self.decoder.forward(
636
+ activations=byte_level_activations,
637
+ backbone_activations=predictive_word_embeddings,
638
+ cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
639
+ byte_position_ids=byte_position_ids,
640
+ word_position_ids=word_position_ids,
641
+ past_key_values=decoder_past_key_values,
642
+ use_cache=use_cache,
643
+ )
644
+
645
+ decoder_output = self.layer_norm(decoder_output.last_hidden_state)
646
+ logits = self.lm_head(decoder_output)
647
+
648
+ loss = None
649
+
650
+ return CausalLMOutputWithPast(
651
+ loss=loss,
652
+ logits=logits,
653
+ past_key_values=past_key_values if use_cache else None,
654
+ hidden_states=backbone_output.hidden_states,
655
+ attentions=None,
656
+ )
657
+
658
+ def _append_byte(self, words: list[list[int]], token: int) -> list[list[int]]:
659
+ extended_last_word = words.pop() + [token]
660
+ try:
661
+ text = self.splitter.decode(extended_last_word, skip_special_tokens=False)
662
+ list_of_bytes = self.splitter.encode(text)
663
+ words.extend([list(word_in_bytes) for word_in_bytes in list_of_bytes])
664
+ except UnicodeDecodeError:
665
+ # if decoding fails, the token cannot be part of a new word since it is not a valid
666
+ # utf-8 end byte and we append it to the current word
667
+ words.append(extended_last_word)
668
+ return words
669
+
670
+ def _complete_word(
671
+ self,
672
+ input_ids: torch.Tensor,
673
+ byte_position_ids: torch.Tensor,
674
+ backbone_word_prediction: torch.Tensor,
675
+ word_position_id: torch.Tensor,
676
+ encoder_cache: DynamicCache,
677
+ decoder_cache: DynamicCache,
678
+ sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
679
+ ):
680
+ """Generate byte tokens until we hit the first byte of a new word."""
681
+ words = [input_ids.squeeze(0).tolist()]
682
+ byte_encoder_activations = []
683
+ completion_logits = []
684
+
685
+ while True:
686
+ encoder_output = self.encoder.forward(
687
+ input_ids,
688
+ byte_position_ids=None,
689
+ word_position_ids=word_position_id,
690
+ past_key_values=encoder_cache,
691
+ use_cache=True,
692
+ )
693
+ byte_encoder_activations.append(encoder_output.hidden_states)
694
+ decoder_output = self.decoder.forward(
695
+ backbone_word_prediction,
696
+ encoder_output.hidden_states,
697
+ byte_position_ids=None,
698
+ word_position_ids=word_position_id,
699
+ past_key_values=decoder_cache,
700
+ use_cache=True,
701
+ )
702
+ decoder_output = self.layer_norm(decoder_output.last_hidden_state)
703
+ logits = self.lm_head(decoder_output)
704
+ completion_logits.append(logits[0, -1:, :])
705
+ next_byte = int(sample_fn(logits).item())
706
+ words = self._append_byte(words, next_byte)
707
+ if len(words) > 1 or next_byte == self.eos_token_id:
708
+ break
709
+ input_ids = torch.tensor([[next_byte]], dtype=input_ids.dtype, device=input_ids.device)
710
+
711
+ byte_encoder_activations = torch.cat(byte_encoder_activations, dim=1)
712
+ num_kv = encoder_cache.get_seq_length()
713
+ byte_position_ids = torch.arange(num_kv + 1 - byte_encoder_activations.shape[1], num_kv + 1, device=input_ids.device, dtype=torch.long).unsqueeze(0)
714
+ completed_word_embedding = self.encoder_connector.forward(
715
+ byte_encoder_activations,
716
+ cumulative_seq_lengths_per_word=torch.tensor([0, byte_encoder_activations.size(1)], dtype=torch.int32, device=input_ids.device),
717
+ word_position_ids=word_position_id,
718
+ byte_position_ids=byte_position_ids,
719
+ )
720
+
721
+ completion = sum(words, [])[-len(completion_logits) :]
722
+ first_byte_of_next_word = words[1]
723
+ return completion, completed_word_embedding, first_byte_of_next_word, byte_position_ids[:, -1].item() + 1, completion_logits
724
+
725
+ def generate(
726
+ self,
727
+ input_ids: torch.Tensor,
728
+ max_new_tokens: int,
729
+ cumulative_seq_lengths_per_word: torch.Tensor,
730
+ byte_position_ids: torch.Tensor | None = None,
731
+ word_position_ids: torch.Tensor | None = None,
732
+ sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
733
+ use_cache: bool = True,
734
+ stop_sequences: Sequence[str] | None = None,
735
+ ):
736
+ if use_cache:
737
+ completion_text, completion_logits = self._generate_cached(input_ids, max_new_tokens, cumulative_seq_lengths_per_word, byte_position_ids, word_position_ids, sample_fn, stop_sequences=stop_sequences)
738
+ else:
739
+ completion_text, completion_logits = self._generate_uncached(input_ids, max_new_tokens, cumulative_seq_lengths_per_word, byte_position_ids, word_position_ids, sample_fn, stop_sequences=stop_sequences)
740
+
741
+ # remove the stop sequence if present
742
+ if stop_sequences is not None:
743
+ stop_sequences = sorted(stop_sequences, key=lambda i: len(i), reverse=True)
744
+ for stop_sequence in stop_sequences:
745
+ if stop_sequence in completion_text:
746
+ completion_text_left = completion_text.split(stop_sequence)[0]
747
+ completion_text_removed = completion_text[len(completion_text_left) :]
748
+
749
+ completion_logits = completion_logits[: -len(list(bytes(completion_text_removed.encode("UTF-8"))))]
750
+ completion_text = completion_text_left
751
+ break
752
+
753
+ return ModelOutput(
754
+ completion_text=completion_text,
755
+ input_ids=input_ids,
756
+ completion_logits=completion_logits,
757
+ )
758
+
759
+ @torch.no_grad()
760
+ def _generate_cached(
761
+ self,
762
+ input_ids: torch.Tensor,
763
+ max_new_tokens: int,
764
+ cumulative_seq_lengths_per_word: torch.Tensor,
765
+ byte_position_ids: torch.Tensor | None = None,
766
+ word_position_ids: torch.Tensor | None = None,
767
+ sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
768
+ stop_sequences: Sequence[str] | None = None,
769
+ ):
770
+ max_total_bytes = max_new_tokens + input_ids.shape[1]
771
+ if byte_position_ids is None:
772
+ byte_position_ids = torch.arange(0, cumulative_seq_lengths_per_word[-1].item(), device=input_ids.device, dtype=torch.int32).unsqueeze(0)
773
+
774
+ if word_position_ids is None:
775
+ word_position_ids = torch.arange(0, cumulative_seq_lengths_per_word.shape[0] - 1, device=input_ids.device, dtype=torch.int32).unsqueeze(0)
776
+
777
+ last_word_start, last_word_end = (
778
+ cumulative_seq_lengths_per_word[-2],
779
+ cumulative_seq_lengths_per_word[-1],
780
+ )
781
+ # Populate cache with everything except last word
782
+ initial_forward_output = self.forward(
783
+ input_ids=input_ids[:, :last_word_start],
784
+ cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word[:-1],
785
+ byte_position_ids=byte_position_ids[:, :last_word_start],
786
+ word_position_ids=word_position_ids[:, :-1],
787
+ past_key_values=None,
788
+ use_cache=True,
789
+ )
790
+
791
+ completion_bytes = []
792
+ completion_logits = []
793
+ input_ids = input_ids[:, last_word_start:last_word_end]
794
+ next_byte_id = last_word_end
795
+ byte_position_ids = byte_position_ids[:, last_word_start:last_word_end]
796
+ word_position_id = word_position_ids[:, -1].unsqueeze(-1)
797
+ backbone_last_hidden_state = initial_forward_output.hidden_states[:, -1:, :]
798
+ while next_byte_id < max_total_bytes:
799
+ completion, completed_word_embedding, first_byte_of_next_word, next_byte_id, next_completion_logits = self._complete_word(
800
+ input_ids=input_ids,
801
+ byte_position_ids=byte_position_ids,
802
+ backbone_word_prediction=backbone_last_hidden_state,
803
+ word_position_id=word_position_id,
804
+ encoder_cache=initial_forward_output.past_key_values.get_encoder_cache(),
805
+ decoder_cache=initial_forward_output.past_key_values.get_decoder_cache(),
806
+ sample_fn=sample_fn,
807
+ )
808
+ completion_logits.extend(next_completion_logits)
809
+ completion_bytes.extend(completion)
810
+
811
+ if self.eos_token_id in completion_bytes:
812
+ completion_bytes = completion_bytes[: completion_bytes.index(self.eos_token_id)]
813
+ break
814
+
815
+ if stop_sequences is not None:
816
+ try:
817
+ completion_text_tmp = self.splitter.decode(completion_bytes)
818
+ if any(stop_sequence in completion_text_tmp for stop_sequence in stop_sequences):
819
+ break
820
+ except Exception as e:
821
+ print("Cannot compare stop sequence", e)
822
+
823
+ backbone_output = self.backbone.forward(
824
+ hidden_states=completed_word_embedding,
825
+ position_ids=None,
826
+ past_key_values=initial_forward_output.past_key_values.get_backbone_cache(),
827
+ use_cache=True,
828
+ )
829
+ backbone_last_hidden_state = backbone_output.hidden_states[:, -1, :].unsqueeze(1)
830
+
831
+ input_ids = torch.tensor([first_byte_of_next_word], dtype=input_ids.dtype, device=input_ids.device)
832
+ byte_position_ids = torch.tensor([[next_byte_id]], dtype=input_ids.dtype, device=input_ids.device)
833
+ word_position_id = word_position_id + 1
834
+
835
+ completion_bytes.extend(first_byte_of_next_word)
836
+ completion_bytes = completion_bytes[:max_new_tokens]
837
+ completion_logits = torch.cat(completion_logits[:max_new_tokens], dim=0)
838
+ completion_text = self.splitter.decode(completion_bytes)
839
+
840
+ return completion_text, completion_logits
841
+
842
+ @torch.no_grad()
843
+ def _generate_uncached(
844
+ self,
845
+ input_ids: torch.Tensor,
846
+ max_new_tokens: int,
847
+ cumulative_seq_lengths_per_word: torch.Tensor,
848
+ byte_position_ids: torch.Tensor | None = None,
849
+ word_position_ids: torch.Tensor | None = None,
850
+ sample_fn=sample_argmax,
851
+ stop_sequences: Sequence[str] | None = None,
852
+ ):
853
+ if byte_position_ids is None:
854
+ byte_position_ids = torch.arange(0, cumulative_seq_lengths_per_word[-1].item(), device=input_ids.device, dtype=torch.int32).unsqueeze(0)
855
+
856
+ if word_position_ids is None:
857
+ word_position_ids = torch.arange(0, cumulative_seq_lengths_per_word.shape[0] - 1, device=input_ids.device, dtype=torch.int32).unsqueeze(0)
858
+
859
+ word_list = []
860
+ for i in range(1, cumulative_seq_lengths_per_word.shape[0]):
861
+ start_idx = cumulative_seq_lengths_per_word[i - 1]
862
+ end_idx = cumulative_seq_lengths_per_word[i]
863
+ word_list.append(input_ids[:, start_idx:end_idx].squeeze(0).tolist())
864
+
865
+ completion_bytes = []
866
+ for _ in range(max_new_tokens):
867
+ output = self.forward(
868
+ input_ids=input_ids,
869
+ cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
870
+ byte_position_ids=byte_position_ids,
871
+ word_position_ids=word_position_ids,
872
+ past_key_values=None,
873
+ )
874
+
875
+ next_byte = int(sample_fn(output.logits).item())
876
+ completion_bytes.append(next_byte)
877
+ if next_byte == self.eos_token_id:
878
+ break
879
+ word_list = self._append_byte(word_list, next_byte)
880
+
881
+ input_ids = torch.tensor(sum(word_list, []), dtype=torch.long, device=input_ids.device).unsqueeze(0)
882
+ cumulative_seq_lengths_per_word = torch.tensor([0] + list(itertools.accumulate(len(word) for word in word_list if len(word) > 0)), dtype=torch.int32, device=input_ids.device)
883
+ byte_position_ids = torch.arange(0, input_ids.shape[1], device=input_ids.device, dtype=torch.int32).unsqueeze(0)
884
+ word_position_ids = torch.arange(0, cumulative_seq_lengths_per_word.shape[0] - 1, device=input_ids.device, dtype=torch.int32).unsqueeze(0)
885
+
886
+ if stop_sequences is not None:
887
+ try:
888
+ completion_text_tmp = self.splitter.decode(completion_bytes)
889
+ if any(completion_text_tmp.endswith(stop_sequence) for stop_sequence in stop_sequences):
890
+ break
891
+ except Exception as e:
892
+ print("Cannot compare stop sequence", e)
893
+
894
+ completion_text = self.splitter.decode(completion_bytes)
895
+ completion_logits = output.logits[0, -len(completion_bytes) :, :]
896
+
897
+ return completion_text, completion_logits
898
+
899
+ def _prepare_input(self, input_str: str, add_llama_template: bool = True, device: torch.device | None = None) -> tuple[torch.Tensor, torch.Tensor]:
900
+ if add_llama_template:
901
+ input_str = LLAMA_TEMPLATE.format(input=input_str)
902
+
903
+ if device is None:
904
+ assert torch.cuda.is_available(), "CUDA is not available"
905
+ device = torch.device("cuda")
906
+ input_ids_list = []
907
+ cumulative_per_word_lengths_list = [0]
908
+
909
+ words = self.splitter.encode(input_str)
910
+ for word in words:
911
+ input_ids_list.extend(word)
912
+ word_length = len(word)
913
+ cumulative_per_word_lengths_list.append(cumulative_per_word_lengths_list[-1] + word_length)
914
+ input_ids = torch.tensor(input_ids_list, device=device, dtype=torch.int32).unsqueeze(0)
915
+ cumulative_per_word_lengths = torch.tensor(cumulative_per_word_lengths_list, device=device, dtype=torch.int32)
916
+ return input_ids, cumulative_per_word_lengths
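Note on the sampling hook: sample_argmax above is greedy decoding at the last position, and both generate and _complete_word accept a sample_fn with that contract, so a stochastic sampler can be passed in. Illustrative sketch only (not part of the uploaded files); it reuses model, input_ids, and cumulative_word_lengths from the loading sketch above.

import torch

def sample_temperature(logits: torch.Tensor, temperature: float = 0.8) -> torch.Tensor:
    # Same contract as sample_argmax: take (batch, seq, vocab) logits and return
    # a 1-D tensor with one sampled byte id per batch element for the last position.
    probs = torch.softmax(logits[:, -1, :] / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

output = model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    cumulative_seq_lengths_per_word=cumulative_word_lengths,
    sample_fn=sample_temperature,
)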
splitter.py ADDED
@@ -0,0 +1,45 @@
1
+ import re
2
+ from hat_splitter import HATSplitter as RustHATSplitter
3
+
4
+
5
+ class HATSplitter:
6
+ def __init__(self, special_token_dict: dict | None = None, max_word_size: int = 128):
7
+ self.hat_splitter = RustHATSplitter()
8
+ self.max_word_size = max_word_size
9
+ self.special_token_dict = special_token_dict or {}  # avoid None so the lookups below always work
10
+ self.special_token_replace: dict[int, list[int]] = {
11
+ token: list(text.encode("utf-8")) for text, token in self.special_token_dict.items()
12
+ }
13
+ self.special_token_pattern = (
14
+ re.compile(rf"({'|'.join(map(re.escape, special_token_dict.keys()))})")
15
+ if special_token_dict
16
+ else re.compile(r"(?!)")
17
+ )
18
+
19
+
20
+ def encode(self, text: str) -> list[list[int]]:
21
+ chunks = []
22
+ for str_chunk in self.special_token_pattern.split(text):
23
+ if str_chunk:
24
+ if str_chunk in self.special_token_dict:
25
+ chunks.append([self.special_token_dict[str_chunk]])
26
+ else:
27
+ chunks.extend(list(chunk) for chunk in self.hat_splitter.split_with_limit(str_chunk, self.max_word_size))
28
+ return chunks
29
+
30
+ def decode(self, token_ids: list[int], errors: str = "replace", skip_special_tokens: bool = False) -> str:
31
+ assert isinstance(token_ids, list), "token_ids must be a list"
32
+ assert all(isinstance(token_id, int) for token_id in token_ids), "token_ids must be a list of integers"
33
+
34
+ new_token_ids: list[int]
35
+ if skip_special_tokens:
36
+ new_token_ids = [token_id for token_id in token_ids if token_id not in self.special_token_replace]
37
+ else:
38
+ new_token_ids = []
39
+ for token in token_ids:
40
+ if token in self.special_token_replace:
41
+ new_token_ids.extend(self.special_token_replace[token])
42
+ else:
43
+ new_token_ids.append(token)
44
+
45
+ return bytes(new_token_ids).decode("utf-8", errors=errors)
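A quick round-trip with HATSplitter for orientation. Hedged sketch: it assumes the hat_splitter package is installed, that the script runs from this repo so splitter.py is importable, and that special-token ids sit outside the 0-255 byte range (as the decode replacement table implies); the exact word chunking comes from the Rust splitter.

from splitter import HATSplitter

splitter = HATSplitter(special_token_dict={"<|eot_id|>": 300}, max_word_size=128)

# encode returns one list of byte ids per word (special tokens become single-id words).
words = splitter.encode("Hello world<|eot_id|>")
flat = [byte_id for word in words for byte_id in word]

# decode maps byte ids back to UTF-8 text; skip_special_tokens drops id 300 here.
print(splitter.decode(flat, skip_special_tokens=True))  # -> "Hello world"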