anonamename committed on
Commit 691c3fc · verified · 1 Parent(s): c63a187

Upload turing-motors/Heron-NVILA-Lite-15B

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,162 @@
1
+ ---
2
+ license: apache-2.0
3
+ license_link: https://huggingface.co/Qwen/Qwen2.5-14B-Instruct/blob/main/LICENSE
4
+ language:
5
+ - ja
6
+ - en
7
+ tags:
8
+ - vila
9
+ - nvila
10
+ - conversational
11
+ - multimodal
12
+ base_model:
13
+ - Qwen/Qwen2.5-14B-Instruct
14
+ - Efficient-Large-Model/paligemma-siglip-so400m-patch14-448
15
+ ---
16
+ # Heron NVILA-Lite 15B
17
+
18
+ Heron NVILA-Lite 15B is a vision-language model trained for Japanese, based on the [NVILA](https://arxiv.org/abs/2412.04468)-Lite architecture.
19
+
20
+ ## Model Overview
21
+
22
+ * **Developer**: [Turing Inc.](https://www.turing-motors.com/)
23
+ * **Vision Encoder**: [paligemma-siglip-so400m-patch14-448](https://huggingface.co/Efficient-Large-Model/paligemma-siglip-so400m-patch14-448)
24
+ * **Projector**: mlp_downsample_3x3_fix (a 3×3 spatial-downsampling MLP; see the sketch below)
25
+ * **LLM**: [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)
26
+ * **Supported Languages**: Japanese, English
27
+
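+ For reference, `base_projector.py` in this repository implements the `mlp_downsample_3x3_fix` projector as a 3×3 spatial downsampling step followed by a small MLP that maps SigLIP features into the LLM embedding space. The snippet below is only a shape-level sketch of that module (dimensions taken from `config.json`; the plain `reshape` is a stand-in for the spatially aware `DownSample3x3BlockFix`), not the packaged implementation:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ mm_hidden_size, hidden_size = 1152, 5120  # SigLIP feature dim and Qwen2.5-14B hidden size (from config.json)
+
+ # After 3x3 downsampling, each visual token carries 9 concatenated patch features.
+ projector = nn.Sequential(
+     nn.LayerNorm(mm_hidden_size * 9),
+     nn.Linear(mm_hidden_size * 9, mm_hidden_size * 3),
+     nn.GELU(),
+     nn.LayerNorm(mm_hidden_size * 3),
+     nn.Linear(mm_hidden_size * 3, hidden_size),
+     nn.GELU(),
+     nn.Linear(hidden_size, hidden_size),
+ )
+
+ # Toy shape check: 36 visual tokens -> 4 downsampled tokens in the LLM embedding space.
+ tokens = torch.randn(1, 36, mm_hidden_size)
+ grouped = tokens.reshape(1, 4, mm_hidden_size * 9)  # stand-in for DownSample3x3BlockFix (which groups spatially adjacent patches)
+ print(projector(grouped).shape)  # torch.Size([1, 4, 5120])
+ ```
+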
28
+ ## Setup
29
+
30
+ ```bash
31
+ # Transformers 4.46.0 and 4.49.0 have also been confirmed to work; other versions may work but are untested.
32
+ pip install transformers==4.45.0 accelerate opencv-python torchvision einops pillow
33
+ pip install git+https://github.com/bfshi/scaling_on_scales.git
34
+ ```
35
+
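+ To confirm the installed version matches one of the tested ones (a convenience check only, not part of the model code):
+
+ ```python
+ import transformers
+ print(transformers.__version__)  # 4.45.0 here; 4.46.0 and 4.49.0 are also reported to work
+ ```
+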
36
+ ## Usage
37
+
38
+ ```python
39
+ from transformers import AutoConfig, AutoModel
40
+
41
+ model_path = "turing-motors/Heron-NVILA-Lite-15B"
42
+
43
+ # Option 1: instantiate the model from its config
44
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
45
+ model = AutoModel.from_config(config, trust_remote_code=True, device_map="auto")
46
+
47
+ # Option 2: load it directly with from_pretrained
48
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
49
+
50
+ # show chat_template
51
+ print(model.tokenizer.chat_template)
52
+
53
+ # example: generate from raw text
54
+ response = model.generate_content(["こんにちは"])
55
+ print(response)
56
+ print("---" * 40)
57
+
58
+ # example: generate with text + image
59
+ from PIL import Image
60
+ import requests
61
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
62
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
63
+ response = model.generate_content([image, "画像を説明してください。"])
64
+ print(response)
65
+ print("---" * 40)
66
+
67
+ # example: generate with a custom generation_config
68
+ from PIL import Image
69
+ import requests
70
+ from transformers import GenerationConfig
71
+ generation_config = {
72
+ "max_new_tokens": 512,
73
+ "temperature": 0.5,
74
+ "do_sample": True,
75
+ }
76
+ generation_config = GenerationConfig(**generation_config)
77
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
78
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
79
+ response = model.generate_content(
80
+ [image, "画像を説明してください。"],
81
+ generation_config=generation_config
82
+ )
83
+ print(response)
84
+ print("---" * 40)
85
+
86
+ # example: generate with interleaved text and images (text + image + text + image + text)
87
+ from PIL import Image
88
+ import requests
89
+ url_list = [
90
+ "https://images.unsplash.com/photo-1694831404826-3400c48c188d?q=80&w=2070&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
91
+ "https://images.unsplash.com/photo-1693240876439-473af88b4ed7?q=80&w=1974&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D"
92
+ ]
93
+ images = [
94
+ Image.open(requests.get(url, stream=True).raw).convert("RGB") for url in url_list
95
+ ]
96
+ response = model.generate_content([
97
+ images[0],
98
+ "これは日本の横断歩道の画像です",
99
+ images[1],
100
+ "これはオーストリアの信号機の画像です",
101
+ "各画像に写っている歩行者用信号機の色は何色ですか?"])
102
+ print(response)
103
+ print("---" * 40)
104
+ ```
105
+
106
+ ## Training Summary
107
+
108
+ | Stage | Training | Data Sources | Samples |
109
+ |--------|-------------------------------|-------------------------------|-------------|
110
+ | Stage1 | Projector | [Japanese image text pairs](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-japanese-image-text-pairs), [LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) | 1.1M |
111
+ | Stage2 | Projector, LLM | Filtered MOMIJI 3 snapshots (CC-MAIN-2024-46, CC-MAIN-2024-51, CC-MAIN-2025-05) | 13M |
112
+ | | | [Japanese image text pairs (subset)](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-japanese-image-text-pairs), [Japanese interleaved data (subset)](https://gitlab.llm-jp.nii.ac.jp/datasets/llm-jp-japanese-interleaved-data), [mmc4-core (subset)](https://github.com/allenai/mmc4), [coyo-700m (subset)](https://huggingface.co/datasets/kakaobrain/coyo-700m), [wikipedia_ja](https://huggingface.co/datasets/turing-motors/Wikipedia-Vision-JA), [llava_pretrain_ja](https://huggingface.co/datasets/turing-motors/LLaVA-Pretrain-JA), [stair_captions](http://captions.stair.center/) | 20M |
113
+ | Stage3 | Vision Encoder, Projector, LLM | [llava-instruct-v1_5-en-subset-358k](https://huggingface.co/datasets/llm-jp/llava-instruct-v1_5-en-subset-358k), [llava-instruct-ja](https://huggingface.co/datasets/llm-jp/llava-instruct-ja), [japanese-photos-conv](https://huggingface.co/datasets/llm-jp/japanese-photos-conversation), [ja-vg-vqa](https://huggingface.co/datasets/llm-jp/ja-vg-vqa-conversation), [synthdog-ja (subset)](https://huggingface.co/datasets/naver-clova-ix/synthdog-ja), [ai2d](https://huggingface.co/datasets/lmms-lab/ai2d), [synthdog-en](https://huggingface.co/datasets/naver-clova-ix/synthdog-en), [sherlock](https://github.com/allenai/sherlock) | 1.4M |
114
+
115
+ ## Evaluation
116
+ Evaluation was performed with [llm-jp-eval-mm](https://github.com/llm-jp/llm-jp-eval-mm). Scores for models other than ours are taken from the [llm-jp-eval-mm leaderboard](https://llm-jp.github.io/llm-jp-eval-mm/) and the [Asagi website](https://uehara-mech.github.io/asagi-vlm?v=1).
117
+
118
+ | Model | LLM Size | Heron-Bench overall LLM (%) | JA-VLM-Bench-In-the-Wild LLM (/5.0) | JA-VG-VQA-500 LLM (/5.0) |
119
+ |--------------------------------|----------|------------------------------|-------------------------------------|--------------------------|
120
+ | **Heron NVILA-Lite 2B** | 1.5B | 52.8 | 3.52 | 3.50 |
121
+ | **Heron NVILA-Lite 15B** | 14B | 59.6 | 4.2 | 3.82 |
122
+ | [LLaVA-CALM2-SigLIP](https://huggingface.co/cyberagent/llava-calm2-siglip) | 7B | 43.3 | 3.15 | 3.21 |
123
+ | [Llama-3-EvoVLM-JP-v2](https://huggingface.co/SakanaAI/Llama-3-EvoVLM-JP-v2) | 8B | 39.3 | 2.92 | 2.96 |
124
+ | [VILA-jp](https://huggingface.co/llm-jp/llm-jp-3-vila-14b) | 13B | 57.2 | 3.69 | 3.62 |
125
+ | [Asagi-14B](https://huggingface.co/MIL-UT/Asagi-14B) | 13B | 55.8 | 3.44 | 3.84 |
126
+ | [Qwen2-VL 7B Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) | 7B | 55.5 | 3.61 | 3.6 |
127
+ | GPT-4o | - | 87.6 | 3.85 | 3.58 |
128
+
129
+
130
+ ## Risks and Limitations
131
+
132
+ This model is experimental and has not been thoroughly evaluated for ethical or legal compliance. Exercise caution when using it in sensitive applications.
133
+
134
+ ## License
135
+
136
+ - Model weights are licensed under [Apache License 2.0](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct/blob/main/LICENSE).
137
+ - Users must comply with [OpenAI terms of use](https://openai.com/policies/terms-of-use) due to the inclusion of GPT-4-generated synthetic data.
138
+
139
+ ## How to cite
140
+
141
+ ```bibtex
142
+ @misc{HeronNVILALite15B,
143
+ title = {Heron NVILA-Lite 15B},
144
+ author = {Shingo Yokoi},
145
+ year = {2025},
146
+ url = {https://huggingface.co/turing-motors/Heron-NVILA-Lite-15B},
147
+ }
148
+ ```
149
+
150
+ ## Citations
151
+
152
+ ```bibtex
153
+ @misc{liu2025nvilaefficientfrontiervisual,
154
+ title={NVILA: Efficient Frontier Visual Language Models},
155
+ author={Zhijian Liu and Ligeng Zhu and Baifeng Shi and Zhuoyang Zhang and Yuming Lou and Shang Yang and Haocheng Xi and Shiyi Cao and Yuxian Gu and Dacheng Li and Xiuyu Li and Yunhao Fang and Yukang Chen and Cheng-Yu Hsieh and De-An Huang and An-Chieh Cheng and Vishwesh Nath and Jinyi Hu and Sifei Liu and Ranjay Krishna and Daguang Xu and Xiaolong Wang and Pavlo Molchanov and Jan Kautz and Hongxu Yin and Song Han and Yao Lu},
156
+ year={2025},
157
+ eprint={2412.04468},
158
+ archivePrefix={arXiv},
159
+ primaryClass={cs.CV},
160
+ url={https://arxiv.org/abs/2412.04468},
161
+ }
162
+ ```
__init__.py ADDED
File without changes
auto_processor.py ADDED
@@ -0,0 +1,330 @@
1
+ import copy
2
+ import os
3
+ import os.path as osp
4
+ import warnings
5
+ from collections import defaultdict
6
+ from typing import List, Union
7
+
8
+ import torch
9
+ from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoProcessor, AutoTokenizer
10
+ from transformers.feature_extraction_utils import BatchFeature
11
+ from transformers.image_utils import ImageInput, VideoInput
12
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
13
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
14
+ from transformers.utils import logging
15
+
16
+ from .constants import DEFAULT_IMAGE_TOKEN, MEDIA_TOKENS
17
+ from .media import Image, Video, extract_media
18
+ from .mm_utils import process_image, process_images
19
+ from .tokenizer_utils import tokenize_conversation
20
+
21
+ def fetch_image_url_or_fpath(url_or_fpath):
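+ # Resolve an image reference: download http(s) URLs to a temporary file, strip a file:// prefix, or return an existing local path as-is.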
22
+ if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
23
+ import tempfile
24
+ import requests
25
+
26
+ # Download the image to a temporary file
27
+ temp_dir = tempfile.mkdtemp()
28
+ temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
29
+
30
+ response = requests.get(url_or_fpath, stream=True)
31
+ response.raise_for_status()
32
+
33
+ with open(temp_file, "wb") as f:
34
+ for chunk in response.iter_content(chunk_size=8192):
35
+ f.write(chunk)
36
+
37
+ return temp_file
38
+ elif url_or_fpath.startswith("file://"):
39
+ fpath = url_or_fpath.replace("file://", "")
40
+ assert osp.exists(fpath), f"File {fpath} does not exist"
41
+ return fpath
42
+ elif osp.exists(url_or_fpath):
43
+ assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
44
+ return url_or_fpath
45
+ else:
46
+ raise ValueError(f"Unsupported image path: {url_or_fpath}")
47
+
48
+
49
+ def _pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
50
+ # tensor shape is (batch_size, seq_len)
51
+ max_len = max([ids.shape[1] for ids in input_ids_list])
52
+ if target_len is not None:
53
+ assert target_len >= max_len, "target_len must be greater than or equal to max_len"
54
+ max_len = target_len
55
+
56
+ new_input_ids_list = []
57
+ for i, input_ids in enumerate(input_ids_list):
58
+ pad_tensor = torch.ones_like(input_ids) * padding_value
59
+ curr_len = input_ids.shape[1]
60
+ pad_tensor = pad_tensor[:, : max_len - curr_len]
61
+ if padding_side == "right":
62
+ input_ids = torch.cat((input_ids, pad_tensor), dim=1)
63
+ else:
64
+ input_ids = torch.cat((pad_tensor, input_ids), dim=1)
65
+ new_input_ids_list.append(input_ids)
66
+ return torch.cat(new_input_ids_list, dim=0)
67
+
68
+
69
+ class VILAProcessorKwargs(ProcessingKwargs, total=False):
70
+ _defaults = {
71
+ "text_kwargs": {
72
+ "padding": False,
73
+ },
74
+ }
75
+
76
+
77
+
78
+
79
+ class VILAProcessor(ProcessorMixin):
80
+ # attributes = ["image_processor", "tokenizer"]
81
+ attributes = []
82
+ # valid_kwargs = ["chat_template"]
83
+ valid_kwargs = []
84
+ # image_processor_class = "VILAImageProcessor"
85
+ # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
86
+
87
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
88
+ # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
89
+ # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
90
+ self.image_token = MEDIA_TOKENS["image"]
91
+ self.video_token = MEDIA_TOKENS["video"]
92
+ self.config = config
93
+ self.image_processor = image_processor
94
+ self.tokenizer = tokenizer
95
+
96
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
97
+
98
+ @classmethod
99
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
100
+ if os.path.isdir(pretrained_model_name_or_path):
101
+ pretrained_model_name_or_path = pretrained_model_name_or_path
102
+ else:
103
+ print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
104
+ from huggingface_hub import snapshot_download
105
+
106
+ pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
107
+
108
+ image_processor = AutoImageProcessor.from_pretrained(
109
+ osp.join(pretrained_model_name_or_path, "vision_tower"), trust_remote_code=True
110
+ )
111
+ tokenizer = AutoTokenizer.from_pretrained(
112
+ osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
113
+ )
114
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
115
+ return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
116
+
117
+ def __repr__(self):
118
+ return (
119
+ f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
120
+ )
121
+
122
+ def __call__(
123
+ self,
124
+ conversation,
125
+ images: ImageInput = None,
126
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
127
+ videos: VideoInput = None,
128
+ **kwargs: Unpack[VILAProcessorKwargs],
129
+ ) -> BatchFeature:
130
+ if images is not None:
131
+ warnings.warn("images is not supported in __call__")
132
+
133
+ input_ids = []
134
+ media = defaultdict(list)
135
+ media_config = defaultdict(dict)
136
+ for conv in conversation:
137
+ feat = self.__single_call__(conv, images, text, videos, **kwargs)
138
+ input_ids.append(feat.input_ids)
139
+ for name in feat.media:
140
+ media[name] += feat.media[name]
141
+ for name in feat.media_config:
142
+ media_config[name].update(feat.media_config[name])
143
+
144
+ return BatchFeature(
145
+ data={
146
+ # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
147
+ "input_ids": __pad_fn(
148
+ input_ids,
149
+ padding_value=self.tokenizer.pad_token_id,
150
+ padding_side="left",
151
+ ),
152
+ "media": media,
153
+ "media_config": media_config,
154
+ }
155
+ )
156
+
157
+ def __single_call__(
158
+ self,
159
+ conversation,
160
+ images: ImageInput = None,
161
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
162
+ videos: VideoInput = None,
163
+ **kwargs: Unpack[VILAProcessorKwargs],
164
+ ) -> BatchFeature:
165
+ # TODO: should be merged with llava_arch.py/generate_content()
166
+ # TODO (extract and preprocess should be done together, as the preprocess of image and video can be different, i.e. when dynamic res is used)
167
+ conversation = copy.deepcopy(conversation)
168
+ media = extract_media(conversation, self.config)
169
+ # Process media
170
+ media_config = defaultdict(dict)
171
+ for name in media:
172
+ if name == "image":
173
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
174
+ self.config.image_processor = self.image_processor
175
+ if self.config.image_aspect_ratio == "dynamic":
176
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
177
+ conversation[0]["value"] = conversation[0]["value"].replace(
178
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
179
+ )
180
+ else:
181
+ if type(self.config.s2_scales) is str:
182
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
183
+ images, block_sizes = process_image(
184
+ media["image"][0], self.config, None, enable_dynamic_s2=True
185
+ )
186
+ images = images.half()
187
+ media_config[name]["block_sizes"] = [block_sizes]
188
+ else:
189
+ images = process_images(media["image"], self.image_processor, self.config).half()
190
+ media[name] = [image for image in images]
191
+ elif name == "video":
192
+ media[name] = [
193
+ process_images(images, self.image_processor, self.config).half()
194
+ for images in media[name]
195
+ ]
196
+ else:
197
+ raise ValueError(f"Unsupported media type: {name}")
198
+ input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
199
+ # Set up the generation config
200
+ return BatchFeature(data={"input_ids": input_ids, "media": media, "media_config": media_config})
201
+
202
+ def batch_decode(self, *args, **kwargs):
203
+ """
204
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
205
+ refer to the docstring of this method for more information.
206
+ """
207
+ return self.tokenizer.batch_decode(*args, **kwargs)
208
+
209
+ def decode(self, *args, **kwargs):
210
+ """
211
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
212
+ the docstring of this method for more information.
213
+ """
214
+ return self.tokenizer.decode(*args, **kwargs)
215
+
216
+ def post_process_image_text_to_text(self, generated_outputs):
217
+ """
218
+ Post-process the output of the model to decode the text.
219
+
220
+ Args:
221
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
222
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
223
+ or `(sequence_length,)`.
224
+
225
+ Returns:
226
+ `List[str]`: The decoded text.
227
+ """
228
+ return self.tokenizer.batch_decode(
229
+ generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
230
+ )
231
+
232
+ @property
233
+ def model_input_names(self):
234
+ tokenizer_input_names = self.tokenizer.model_input_names
235
+ image_processor_input_names = self.image_processor.model_input_names
236
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
237
+
238
+ # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
239
+ def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
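+ # Convert an OpenAI/Qwen-style conversation (role/content dicts) into VILA's "from"/"value" format, fetching any referenced image paths or URLs.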
240
+ vila_conv = []
241
+ for chat in conversation:
242
+ vila_chat = {"from": "", "value": []}
243
+ if chat["role"] == "user":
244
+ # user allows to input image and text
245
+ vila_chat["from"] = "human"
246
+ for content in chat["content"]:
247
+ if content["type"] == "image":
248
+ if "path" in content:
249
+ # VILA style
250
+ vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["path"])))
251
+ elif "image" in content:
252
+ # Qwen style
253
+ vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["image"])))
254
+ else:
255
+ raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
256
+ elif content["type"] == "text":
257
+ vila_chat["value"].append(content["text"])
258
+ # NOTE(ligeng): video supports are needed here
259
+ else:
260
+ raise ValueError(f"Unsupported content type: {content['type']}")
261
+ elif chat["role"] == "assistant":
262
+ vila_chat["from"] = "gpt"
263
+ for content in chat["content"]:
264
+ assert content["type"] == "text", f"Unsupported content type: {content['type']}"
265
+ vila_chat["value"].append(content["text"])
266
+ vila_conv.append(vila_chat)
267
+
268
+ return vila_conv
269
+
270
+
271
+ if __name__ == "__main__":
272
+ # gpt style: user, assistant
273
+ # vila style: human, gpt
274
+ gpt_conv = [
275
+ {
276
+ "role": "user",
277
+ "content": [
278
+ {"type": "image", "path": "demo_images/demo_img_1.png"},
279
+ {"type": "text", "text": "Describe this image."},
280
+ ],
281
+ }
282
+ ]
283
+
284
+ llavaconv = [
285
+ {
286
+ "from": "human",
287
+ "value": [
288
+ PIL.Image.open("demo_images/demo_img_1.png"),
289
+ "Describe this image.",
290
+ ],
291
+ }
292
+ ]
293
+
294
+ processor = AutoProcessor.from_pretrained(output_dir, trust_remote_code=True)
295
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
296
+ # model = llava.load("Efficient-Large-Model/qwen25_2B_3x3-sft").cuda()
297
+ # print(model)
298
+ model_path = "NVILA-Lite-2B-hf-preview"
299
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
300
+ # res = model.generate_content(["how are you today?"])
301
+ # print(model.config)
302
+ # print(model.tokenizer)
303
+ # print(res)
304
+ # exit(0)
305
+
306
+ processor = VILAProcessor(
307
+ config=model.config,
308
+ image_processor=model.vision_tower.image_processor,
309
+ tokenizer=model.tokenizer,
310
+ )
311
+
312
+ # TODO: add padding, return_tensors,
313
+ inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
314
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
315
+ print("vila conv pass")
316
+
317
+ inputs = processor.apply_chat_template(conversation=gpt_conv, padding=True, return_tensors="pt")
318
+ print(inputs.keys(), inputs.input_ids.shape, [_.shape for _ in inputs.image])
319
+ print("gpt conv pass")
320
+
321
+ output_ids = model.generate(
322
+ input_ids=inputs.input_ids,
323
+ media={
324
+ "image": inputs.image,
325
+ },
326
+ media_config={"image": {}},
327
+ generation_config=model.generation_config,
328
+ max_new_tokens=100,
329
+ )
330
+ print(output_ids)
base_projector.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import re
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
22
+
23
+
24
+ class IdentityMap(nn.Module):
25
+ def __init__(self):
26
+ super().__init__()
27
+
28
+ def forward(self, x, *args, **kwargs):
29
+ return x
30
+
31
+ @property
32
+ def config(self):
33
+ return {"mm_projector_type": "identity"}
34
+
35
+
36
+ class SimpleResBlock(nn.Module):
37
+ def __init__(self, channels):
38
+ super().__init__()
39
+ self.pre_norm = nn.LayerNorm(channels)
40
+
41
+ self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels))
42
+
43
+ def forward(self, x):
44
+ x = self.pre_norm(x)
45
+ return x + self.proj(x)
46
+
47
+
48
+ class DownSampleBlock(nn.Module):
49
+ def forward(self, x):
50
+ vit_embeds = x
51
+ h = w = int(vit_embeds.shape[1] ** 0.5)
52
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
53
+ vit_embeds = self.flat_square(vit_embeds)
54
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
55
+ return vit_embeds
56
+
57
+ def flat_square(self, x):
58
+ n, w, h, c = x.size()
59
+ if w % 2 == 1:
60
+ x = torch.concat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
61
+ n, w, h, c = x.size()
62
+ if h % 2 == 1:
63
+ x = torch.concat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
64
+ n, w, h, c = x.size()
65
+ x = x.contiguous()
66
+ x = x.view(n, w, int(h / 2), int(c * 2))
67
+ x = x.permute(0, 2, 1, 3).contiguous()
68
+ x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
69
+ x = x.permute(0, 2, 1, 3).contiguous()
70
+ return x
71
+
72
+
73
+ class DownSample2x2BlockFix(nn.Module):
74
+ def forward(self, x):
75
+ vit_embeds = x
76
+ h = w = int(vit_embeds.shape[1] ** 0.5)
77
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
78
+ vit_embeds = flat_square_2x2(vit_embeds)
79
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
80
+ return vit_embeds
81
+
82
+
83
+ def flat_square_2x2(x):
84
+ n, w, h, c = x.size()
85
+ if w % 2 == 1:
86
+ x = torch.concat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
87
+ n, w, h, c = x.size()
88
+ x = x.contiguous()
89
+ if h % 2 == 1:
90
+ x = torch.concat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
91
+ n, w, h, c = x.size()
92
+ x = x.view(n, w, int(h / 2), int(c * 2))
93
+ x = x.permute(0, 2, 1, 3).contiguous()
94
+ x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
95
+ x = x.permute(0, 2, 1, 3).contiguous()
96
+ return x
97
+
98
+
99
+ class DownSample3x3BlockFix(nn.Module):
100
+ def forward(self, x):
101
+ vit_embeds = x
102
+ h = w = int(vit_embeds.shape[1] ** 0.5)
103
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
104
+ vit_embeds = flat_square_3x3(vit_embeds)
105
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
106
+ return vit_embeds
107
+
108
+
109
+ def flat_square_3x3(x):
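+ # Merge each 3x3 neighborhood of patches into the channel dimension (padding H and W up to multiples of 3), reducing the token count 9x while growing channels 9x.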
110
+ n, w, h, c = x.size()
111
+ if w % 3 != 0:
112
+ x = torch.concat([x, torch.zeros((n, 3 - (w % 3), h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
113
+ n, w, h, c = x.size()
114
+ x = x.contiguous()
115
+ if h % 3 != 0:
116
+ x = torch.concat([x, torch.zeros((n, w, 3 - (h % 3), c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
117
+ n, w, h, c = x.size()
118
+ x = x.view(n, w, int(h / 3), int(c * 3))
119
+ x = x.permute(0, 2, 1, 3).contiguous()
120
+ x = x.view(n, int(h / 3), int(w / 3), int(c * 9))
121
+ x = x.permute(0, 2, 1, 3).contiguous()
122
+ return x
123
+
124
+
125
+ class MultimodalProjectorConfig(PretrainedConfig):
126
+ model_type = "v2l_projector"
127
+
128
+ def __init__(self, mm_projector_type: str = None, **kwargs):
129
+ super().__init__()
130
+ self.mm_projector_type = mm_projector_type
131
+
132
+
133
+ class MultimodalProjector(PreTrainedModel):
134
+ config_class = MultimodalProjectorConfig
135
+
136
+ def __init__(self, mm_projector_cfg: MultimodalProjectorConfig, config: PretrainedConfig):
137
+ super().__init__(mm_projector_cfg)
138
+ mm_projector_type = mm_projector_cfg.mm_projector_type
139
+ self.downsample_rate = 1
140
+ if mm_projector_type == "identity":
141
+ self.layers = IdentityMap()
142
+ elif mm_projector_type == "linear":
143
+ self.layers = nn.Linear(config.mm_hidden_size, config.hidden_size)
144
+ elif mm_projector_type == "mlp_downsample":
145
+ self.layers = nn.Sequential(
146
+ DownSampleBlock(),
147
+ nn.LayerNorm(config.mm_hidden_size * 4),
148
+ nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
149
+ nn.GELU(),
150
+ nn.Linear(config.hidden_size, config.hidden_size),
151
+ )
152
+ self.downsample_rate = 2
153
+ elif mm_projector_type == "mlp_downsample_2x2_fix":
154
+ self.layers = nn.Sequential(
155
+ DownSample2x2BlockFix(),
156
+ nn.LayerNorm(config.mm_hidden_size * 4),
157
+ nn.Linear(config.mm_hidden_size * 4, config.hidden_size),
158
+ nn.GELU(),
159
+ nn.Linear(config.hidden_size, config.hidden_size),
160
+ )
161
+ self.downsample_rate = 2
162
+ elif mm_projector_type == "mlp_downsample_3x3_fix":
163
+ self.layers = nn.Sequential(
164
+ DownSample3x3BlockFix(),
165
+ nn.LayerNorm(config.mm_hidden_size * 9),
166
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 3),
167
+ nn.GELU(),
168
+ nn.LayerNorm(config.mm_hidden_size * 3),
169
+ nn.Linear(config.mm_hidden_size * 3, config.hidden_size),
170
+ nn.GELU(),
171
+ nn.Linear(config.hidden_size, config.hidden_size),
172
+ )
173
+ self.downsample_rate = 3
174
+ elif mm_projector_type == "mlp_downsample_3x3_s2":
175
+ self.layers = nn.Sequential(
176
+ DownSample3x3BlockFix(),
177
+ nn.LayerNorm(config.mm_hidden_size * 9),
178
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 3),
179
+ nn.GELU(),
180
+ nn.LayerNorm(config.mm_hidden_size * 3),
181
+ nn.Linear(config.mm_hidden_size * 3, config.mm_hidden_size),
182
+ nn.GELU(),
183
+ nn.LayerNorm(config.mm_hidden_size),
184
+ nn.Linear(config.mm_hidden_size, config.mm_hidden_size // 3),
185
+ nn.GELU(),
186
+ nn.LayerNorm(config.mm_hidden_size // 3),
187
+ nn.Linear(config.mm_hidden_size // 3, config.hidden_size),
188
+ nn.GELU(),
189
+ nn.Linear(config.hidden_size, config.hidden_size),
190
+ )
191
+ elif mm_projector_type == "mlp_downsample_3x3_s2_new":
192
+ self.layers = nn.Sequential(
193
+ DownSample3x3BlockFix(),
194
+ nn.LayerNorm(config.mm_hidden_size * 9),
195
+ nn.Linear(config.mm_hidden_size * 9, config.mm_hidden_size * 4),
196
+ nn.GELU(),
197
+ nn.LayerNorm(config.mm_hidden_size * 4),
198
+ nn.Linear(config.mm_hidden_size * 4, config.mm_hidden_size * 2),
199
+ nn.GELU(),
200
+ nn.LayerNorm(config.mm_hidden_size * 2),
201
+ nn.Linear(config.mm_hidden_size * 2, config.mm_hidden_size),
202
+ nn.GELU(),
203
+ nn.LayerNorm(config.mm_hidden_size),
204
+ nn.Linear(config.mm_hidden_size, config.mm_hidden_size // 3),
205
+ nn.GELU(),
206
+ nn.LayerNorm(config.mm_hidden_size // 3),
207
+ nn.Linear(config.mm_hidden_size // 3, config.hidden_size),
208
+ nn.GELU(),
209
+ nn.Linear(config.hidden_size, config.hidden_size),
210
+ )
211
+ else:
212
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", mm_projector_type)
213
+ if mlp_gelu_match:
214
+ mlp_depth = int(mlp_gelu_match.group(1))
215
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
216
+ for _ in range(1, mlp_depth):
217
+ modules.append(nn.GELU())
218
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
219
+ self.layers = nn.Sequential(*modules)
220
+ else:
221
+ raise ValueError(f"Unknown projector type: {mm_projector_type}")
222
+
223
+ def forward(self, x, *args, **kwargs):
224
+ return self.layers(x)
225
+
226
+
227
+ # AutoConfig.register("v2l_projector", MultimodalProjectorConfig)
228
+ # AutoModel.register(MultimodalProjectorConfig, MultimodalProjector)
builder.py ADDED
@@ -0,0 +1,245 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import math
18
+ import os
19
+ import os.path as osp
20
+ import warnings
21
+ from dataclasses import asdict
22
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
23
+
24
+ import torch
25
+ import transformers
26
+ from huggingface_hub import file_exists, repo_exists
27
+ from huggingface_hub.utils import HFValidationError
28
+ from transformers import (
29
+ AutoConfig,
30
+ AutoModelForCausalLM,
31
+ AutoTokenizer,
32
+ PretrainedConfig,
33
+ PreTrainedModel,
34
+ PreTrainedTokenizer,
35
+ )
36
+
37
+ # from .conversation import *
38
+ from .conversation import SeparatorStyle, default_conversation
39
+
40
+ SENTINEL_TOKEN = "<vila/sentinel>"
41
+ MEDIA_TOKENS = {
42
+ "image": "<image>",
43
+ "video": "<vila/video>",
44
+ }
45
+
46
+ # from llava.model.utils import packing
47
+ # from llava.utils.logging import logger
48
+ # from llava.utils.tokenizer import infer_stop_tokens
49
+
50
+ DUMMY_CONVERSATION = [
51
+ {"from": "human", "value": "question"},
52
+ {"from": "gpt", "value": "answer"},
53
+ ] * 10
54
+
55
+
56
+ def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
57
+ return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
58
+
59
+
60
+ def has_tokenizer(repo_id_or_path: str) -> bool:
61
+ # Check if the tokenizer is in a local directory
62
+ if osp.exists(osp.join(repo_id_or_path, "tokenizer_config.json")):
63
+ return True
64
+
65
+ # Check if the tokenizer is in a Hugging Face Hub repo
66
+ try:
67
+ return repo_exists(repo_id_or_path) and file_exists(repo_id_or_path, "tokenizer_config.json")
68
+ except HFValidationError:
69
+ return False
70
+
71
+
72
+ def _maybe_add_sentinel_token(tokenizer: transformers.PreTrainedTokenizer) -> None:
73
+ if not hasattr(tokenizer, "sentinel_token"):
74
+ tokenizer.add_tokens([SENTINEL_TOKEN], special_tokens=True)
75
+ tokenizer.sentinel_token = SENTINEL_TOKEN
76
+ tokenizer.sentinel_token_id = tokenizer.convert_tokens_to_ids(SENTINEL_TOKEN)
77
+
78
+
79
+ def tokenize_conversation_legacy(
80
+ messages: Sequence[Dict[str, str]],
81
+ tokenizer: transformers.PreTrainedTokenizer,
82
+ add_generation_prompt: bool = False,
83
+ overrides: Optional[Dict[str, str]] = None,
84
+ no_system_prompt: bool = False,
85
+ ) -> torch.Tensor:
86
+ conv = default_conversation.copy()
87
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
88
+
89
+ if no_system_prompt:
90
+ conv.system = ""
91
+
92
+ # Skip the first message if it is not from human
93
+ if messages[0]["from"] != "human":
94
+ messages = messages[1:]
95
+
96
+ # Add a generation prompt if needed
97
+ if add_generation_prompt:
98
+ messages.append({"from": "gpt", "value": None})
99
+
100
+ conv.messages = []
101
+ for turn, message in enumerate(messages):
102
+ role = roles[message["from"]]
103
+ assert role == conv.roles[turn % 2]
104
+ if overrides is not None and message["from"] in overrides:
105
+ conv.append_message(role, overrides[message["from"]])
106
+ else:
107
+ conv.append_message(role, message["value"])
108
+
109
+ return tokenizer_image_token(conv.get_prompt(), tokenizer, return_tensors="pt")
110
+
111
+
112
+ def tokenize_conversation(
113
+ messages: Sequence[Dict[str, str]],
114
+ tokenizer: transformers.PreTrainedTokenizer,
115
+ add_generation_prompt: bool = False,
116
+ overrides: Optional[Dict[str, str]] = None,
117
+ no_system_prompt: bool = False,
118
+ ) -> torch.Tensor:
119
+ # Normalize the conversation before tokenization
120
+ for message in messages:
121
+ message["value"] = message["value"].strip()
122
+
123
+ if default_conversation.sep_style != SeparatorStyle.AUTO:
124
+ return tokenize_conversation_legacy(
125
+ messages,
126
+ tokenizer,
127
+ add_generation_prompt=add_generation_prompt,
128
+ overrides=overrides,
129
+ no_system_prompt=no_system_prompt,
130
+ )
131
+
132
+ conversation = []
133
+ for m in messages:
134
+ message = {}
135
+ if m["from"] == "human":
136
+ message["role"] = "user"
137
+ elif m["from"] == "gpt":
138
+ message["role"] = "assistant"
139
+ else:
140
+ raise ValueError(f"Unexpected sender '{m['from']}' in conversation entry.")
141
+
142
+ message["content"] = m["value"]
143
+ if overrides is not None and m["from"] in overrides:
144
+ message["content"] = overrides[m["from"]]
145
+ conversation.append(message)
146
+
147
+ if no_system_prompt:
148
+ conversation = [{"role": "system", "content": ""}] + conversation
149
+
150
+ text = tokenizer.apply_chat_template(
151
+ conversation,
152
+ add_generation_prompt=add_generation_prompt,
153
+ tokenize=False,
154
+ )
155
+ return tokenizer_image_token(text, tokenizer, return_tensors="pt")
156
+
157
+
158
+ def infer_stop_tokens(tokenizer: transformers.PreTrainedTokenizer) -> List[str]:
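+ # Build a template where every assistant reply is the sentinel token; the token that immediately follows each sentinel (the turn terminator), plus EOS, is collected as a stop token.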
159
+ _maybe_add_sentinel_token(tokenizer)
160
+ template = tokenize_conversation(DUMMY_CONVERSATION, tokenizer, overrides={"gpt": SENTINEL_TOKEN})
161
+
162
+ stop_tokens = {tokenizer.eos_token}
163
+ for k in range(template.size(0) - 1):
164
+ if template[k] == tokenizer.sentinel_token_id:
165
+ stop_token = tokenizer.decode(template[k + 1])
166
+ stop_tokens.add(stop_token)
167
+ return list(stop_tokens)
168
+
169
+
170
+ def context_length_extension(config):
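+ # If model_max_length exceeds the base max_position_embeddings, enable linear RoPE scaling by the rounded-up ratio.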
171
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
172
+ model_max_length = getattr(config, "model_max_length", None)
173
+ if orig_ctx_len and model_max_length > orig_ctx_len:
174
+ print(f"Scaling RoPE from {orig_ctx_len} to {model_max_length}")
175
+ scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))
176
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
177
+ return config
178
+
179
+
180
+ def build_llm_and_tokenizer(
181
+ model_name_or_path: str,
182
+ config: PretrainedConfig,
183
+ attn_implementation=None,
184
+ model_max_length=None,
185
+ *args,
186
+ **kwargs,
187
+ ) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
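+ # Load the causal LM from the (possibly RoPE-scaled) llm_cfg, locate its tokenizer (the model dir or its ./llm subdirectory), attach the configured chat template if any, and register stop and media tokens on the tokenizer.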
188
+ # print(model_name_or_path)
189
+ llm_cfg = AutoConfig.from_pretrained(model_name_or_path)
190
+ llm_cfg._attn_implementation = attn_implementation
191
+ llm_cfg.model_max_length = model_max_length
192
+ if model_max_length is not None:
193
+ context_length_extension(llm_cfg)
194
+
195
+ # Quantization related
196
+ quantization_restore_from_checkpoint = False
197
+
198
+ if quantization_restore_from_checkpoint:
199
+ fp8_model_name_or_path = kwargs.pop("fp8_llm_cfg", None)
200
+
201
+ llm = AutoModelForCausalLM.from_pretrained(
202
+ fp8_model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
203
+ )
204
+ else:
205
+ llm = AutoModelForCausalLM.from_pretrained(
206
+ model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
207
+ )
208
+ # NOTE(ligeng): not sure whether it affects the training
209
+ # packing.patch(llm)
210
+
211
+ # Locate the tokenizer.
212
+ llm_path = model_name_or_path
213
+ if not has_tokenizer(llm_path):
214
+ llm_path = osp.join(llm_path, "llm")
215
+ if not has_tokenizer(llm_path):
216
+ raise ValueError(f"Cannot find tokenizer in {llm_path}.")
217
+
218
+ tokenizer = AutoTokenizer.from_pretrained(llm_path, padding_side="right", use_fast=True, legacy=False)
219
+ if model_max_length is not None:
220
+ tokenizer.model_max_length = model_max_length
221
+
222
+ # Load chat template if specified.
223
+ if getattr(config, "chat_template", None) is not None:
224
+ print(f"Using chat template: {config.chat_template}")
225
+ fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
226
+ if not os.path.exists(fpath):
227
+ fpath = os.path.join(os.path.dirname(model_name_or_path), f"{config.chat_template}.jinja")
228
+ with open(fpath) as fd:
229
+ chat_template = fd.read()
230
+ tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
231
+
232
+ # Set stop tokens for the tokenizer
233
+ tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
234
+ tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
235
+
236
+ # Add media tokens to the tokenizer
237
+ tokenizer.media_tokens = MEDIA_TOKENS
238
+ tokenizer.media_token_ids = {}
239
+ for name, token in MEDIA_TOKENS.items():
240
+ tokenizer.add_tokens([token], special_tokens=True)
241
+ tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
242
+
243
+ # TODO(ligeng): is this necessary for llava?
244
+ config.hidden_size = llm.config.hidden_size
245
+ return llm, tokenizer
config.json ADDED
@@ -0,0 +1,322 @@
1
+ {
2
+ "Ubit": 100,
3
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model",
4
+ "architectures": [
5
+ "VILAForCasualLM"
6
+ ],
7
+ "babit": "E5M2",
8
+ "bobit": "E5M2",
9
+ "bwbit": "E5M2",
10
+ "chat_template": null,
11
+ "col_blocksize": -1,
12
+ "col_blocksize_optimizer": 128,
13
+ "draw_distribution_backward": false,
14
+ "draw_distribution_forward": false,
15
+ "drop_path_rate": 0.0,
16
+ "dynamic_s2": false,
17
+ "epsilon": 1e-10,
18
+ "epsilon_optimizer": 1e-15,
19
+ "fabit": "E4M3",
20
+ "first_order_bit": null,
21
+ "first_order_quant_type": null,
22
+ "fobit": "E4M3",
23
+ "fps": 0.0,
24
+ "fwbit": "E4M3",
25
+ "group_size": -1,
26
+ "hidden_size": 5120,
27
+ "image_aspect_ratio": "dynamic",
28
+ "image_encoder": {
29
+ "_target_": "llava.model.encoders.BasicImageEncoder"
30
+ },
31
+ "interpolate_mode": "linear",
32
+ "llm_cfg": {
33
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/llm",
34
+ "add_cross_attention": false,
35
+ "architectures": [
36
+ "Qwen2ForCausalLM"
37
+ ],
38
+ "attention_dropout": 0.0,
39
+ "bad_words_ids": null,
40
+ "begin_suppress_tokens": null,
41
+ "bos_token_id": 151643,
42
+ "chunk_size_feed_forward": 0,
43
+ "cross_attention_hidden_size": null,
44
+ "decoder_start_token_id": null,
45
+ "diversity_penalty": 0.0,
46
+ "do_sample": false,
47
+ "early_stopping": false,
48
+ "encoder_no_repeat_ngram_size": 0,
49
+ "eos_token_id": 151645,
50
+ "exponential_decay_length_penalty": null,
51
+ "finetuning_task": null,
52
+ "forced_bos_token_id": null,
53
+ "forced_eos_token_id": null,
54
+ "hidden_act": "silu",
55
+ "hidden_size": 5120,
56
+ "id2label": {
57
+ "0": "LABEL_0",
58
+ "1": "LABEL_1"
59
+ },
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 13824,
62
+ "is_decoder": false,
63
+ "is_encoder_decoder": false,
64
+ "label2id": {
65
+ "LABEL_0": 0,
66
+ "LABEL_1": 1
67
+ },
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_position_embeddings": 32768,
71
+ "max_window_layers": 70,
72
+ "min_length": 0,
73
+ "model_max_length": 4096,
74
+ "model_type": "qwen2",
75
+ "no_repeat_ngram_size": 0,
76
+ "num_attention_heads": 40,
77
+ "num_beam_groups": 1,
78
+ "num_beams": 1,
79
+ "num_hidden_layers": 48,
80
+ "num_key_value_heads": 8,
81
+ "num_return_sequences": 1,
82
+ "output_attentions": false,
83
+ "output_hidden_states": false,
84
+ "output_scores": false,
85
+ "pad_token_id": null,
86
+ "prefix": null,
87
+ "problem_type": null,
88
+ "pruned_heads": {},
89
+ "remove_invalid_values": false,
90
+ "repetition_penalty": 1.0,
91
+ "return_dict": true,
92
+ "return_dict_in_generate": false,
93
+ "rms_norm_eps": 1e-06,
94
+ "rope_scaling": null,
95
+ "rope_theta": 1000000.0,
96
+ "sep_token_id": null,
97
+ "sliding_window": null,
98
+ "suppress_tokens": null,
99
+ "task_specific_params": null,
100
+ "temperature": 1.0,
101
+ "tf_legacy_loss": false,
102
+ "tie_encoder_decoder": false,
103
+ "tie_word_embeddings": false,
104
+ "tokenizer_class": null,
105
+ "top_k": 50,
106
+ "top_p": 1.0,
107
+ "torch_dtype": "bfloat16",
108
+ "torchscript": false,
109
+ "typical_p": 1.0,
110
+ "use_bfloat16": false,
111
+ "use_cache": true,
112
+ "use_sliding_window": false,
113
+ "vocab_size": 151670
114
+ },
115
+ "max_tiles": 12,
116
+ "min_blockunit_col": 4,
117
+ "min_blockunit_row": 4,
118
+ "min_tiles": 1,
119
+ "mlp_path": null,
120
+ "mm_hidden_size": 1152,
121
+ "mm_projector": "mlp_downsample_3x3_fix",
122
+ "mm_projector_cfg": {
123
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/mm_projector",
124
+ "add_cross_attention": false,
125
+ "architectures": [
126
+ "MultimodalProjector"
127
+ ],
128
+ "bad_words_ids": null,
129
+ "begin_suppress_tokens": null,
130
+ "bos_token_id": null,
131
+ "chunk_size_feed_forward": 0,
132
+ "cross_attention_hidden_size": null,
133
+ "decoder_start_token_id": null,
134
+ "diversity_penalty": 0.0,
135
+ "do_sample": false,
136
+ "early_stopping": false,
137
+ "encoder_no_repeat_ngram_size": 0,
138
+ "eos_token_id": null,
139
+ "exponential_decay_length_penalty": null,
140
+ "finetuning_task": null,
141
+ "forced_bos_token_id": null,
142
+ "forced_eos_token_id": null,
143
+ "id2label": {
144
+ "0": "LABEL_0",
145
+ "1": "LABEL_1"
146
+ },
147
+ "is_decoder": false,
148
+ "is_encoder_decoder": false,
149
+ "label2id": {
150
+ "LABEL_0": 0,
151
+ "LABEL_1": 1
152
+ },
153
+ "length_penalty": 1.0,
154
+ "max_length": 20,
155
+ "min_length": 0,
156
+ "mm_projector_type": "mlp_downsample_3x3_fix",
157
+ "model_type": "v2l_projector",
158
+ "no_repeat_ngram_size": 0,
159
+ "num_beam_groups": 1,
160
+ "num_beams": 1,
161
+ "num_return_sequences": 1,
162
+ "output_attentions": false,
163
+ "output_hidden_states": false,
164
+ "output_scores": false,
165
+ "pad_token_id": null,
166
+ "prefix": null,
167
+ "problem_type": null,
168
+ "pruned_heads": {},
169
+ "remove_invalid_values": false,
170
+ "repetition_penalty": 1.0,
171
+ "return_dict": true,
172
+ "return_dict_in_generate": false,
173
+ "sep_token_id": null,
174
+ "suppress_tokens": null,
175
+ "task_specific_params": null,
176
+ "temperature": 1.0,
177
+ "tf_legacy_loss": false,
178
+ "tie_encoder_decoder": false,
179
+ "tie_word_embeddings": true,
180
+ "tokenizer_class": null,
181
+ "top_k": 50,
182
+ "top_p": 1.0,
183
+ "torch_dtype": "bfloat16",
184
+ "torchscript": false,
185
+ "typical_p": 1.0,
186
+ "use_bfloat16": false
187
+ },
188
+ "mm_projector_lr": null,
189
+ "mm_use_im_patch_token": false,
190
+ "mm_use_im_start_end": false,
191
+ "mm_vision_select_feature": "cls_patch",
192
+ "mm_vision_select_layer": -2,
193
+ "model_dtype": "torch.bfloat16",
194
+ "model_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/pretrain_14b/model",
195
+ "model_type": "vila",
196
+ "num_time_tokens": 0,
197
+ "num_video_frames": 8,
198
+ "pad_block": false,
199
+ "pad_to_multiple_of": 0,
200
+ "qchoice": "none",
201
+ "quantize_model": false,
202
+ "refine_attn_blocksize": false,
203
+ "refine_col_blocksize": 4,
204
+ "refine_ln_blocksize": false,
205
+ "refine_ln_blocksize_but_only_backward": false,
206
+ "refine_ln_blocksize_but_only_forward": false,
207
+ "refine_ln_pertoken": false,
208
+ "refine_mlp_blocksize": false,
209
+ "refine_residual_fp": false,
210
+ "refine_row_blocksize": 4,
211
+ "resume_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model",
212
+ "row_blocksize": -1,
213
+ "row_blocksize_optimizer": 1,
214
+ "s2": false,
215
+ "s2_max_split_size": 336,
216
+ "s2_resize_output_to_scale_idx": 0,
217
+ "s2_scales": "336,672,1008",
218
+ "second_order_bit": null,
219
+ "second_order_quant_type": null,
220
+ "soft_ce_std": 1.0,
221
+ "symm": true,
222
+ "time_token_format": "<t{t}>",
223
+ "time_token_ids": [],
224
+ "transformers_version": "4.45.0",
225
+ "tune_language_model": true,
226
+ "tune_mm_projector": true,
227
+ "tune_vision_tower": true,
228
+ "use_quantize_optimizer": false,
229
+ "version": "2.0",
230
+ "video_encoder": {
231
+ "_target_": "llava.model.encoders.BasicVideoEncoder"
232
+ },
233
+ "vision_resolution": -1,
234
+ "vision_tower": "/data/models/Efficient-Large-Model/paligemma-siglip-so400m-patch14-448",
235
+ "vision_tower_cfg": {
236
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/vision_tower",
237
+ "add_cross_attention": false,
238
+ "architectures": [
239
+ "SiglipVisionModel"
240
+ ],
241
+ "attention_dropout": 0.0,
242
+ "bad_words_ids": null,
243
+ "begin_suppress_tokens": null,
244
+ "bos_token_id": null,
245
+ "chunk_size_feed_forward": 0,
246
+ "cross_attention_hidden_size": null,
247
+ "decoder_start_token_id": null,
248
+ "diversity_penalty": 0.0,
249
+ "do_sample": false,
250
+ "early_stopping": false,
251
+ "encoder_no_repeat_ngram_size": 0,
252
+ "eos_token_id": null,
253
+ "exponential_decay_length_penalty": null,
254
+ "finetuning_task": null,
255
+ "forced_bos_token_id": null,
256
+ "forced_eos_token_id": null,
257
+ "hidden_act": "gelu_pytorch_tanh",
258
+ "hidden_size": 1152,
259
+ "id2label": {
260
+ "0": "LABEL_0",
261
+ "1": "LABEL_1"
262
+ },
263
+ "image_size": 448,
264
+ "intermediate_size": 4304,
265
+ "is_decoder": false,
266
+ "is_encoder_decoder": false,
267
+ "label2id": {
268
+ "LABEL_0": 0,
269
+ "LABEL_1": 1
270
+ },
271
+ "layer_norm_eps": 1e-06,
272
+ "length_penalty": 1.0,
273
+ "max_length": 20,
274
+ "min_length": 0,
275
+ "model_type": "siglip_vision_model",
276
+ "no_repeat_ngram_size": 0,
277
+ "num_attention_heads": 16,
278
+ "num_beam_groups": 1,
279
+ "num_beams": 1,
280
+ "num_channels": 3,
281
+ "num_hidden_layers": 27,
282
+ "num_image_tokens": 256,
283
+ "num_return_sequences": 1,
284
+ "output_attentions": false,
285
+ "output_hidden_states": false,
286
+ "output_scores": false,
287
+ "pad_token_id": null,
288
+ "patch_size": 14,
289
+ "prefix": null,
290
+ "problem_type": null,
291
+ "projection_dim": 2048,
292
+ "projector_hidden_act": "gelu_fast",
293
+ "pruned_heads": {},
294
+ "remove_invalid_values": false,
295
+ "repetition_penalty": 1.0,
296
+ "return_dict": true,
297
+ "return_dict_in_generate": false,
298
+ "sep_token_id": null,
299
+ "suppress_tokens": null,
300
+ "task_specific_params": null,
301
+ "temperature": 1.0,
302
+ "tf_legacy_loss": false,
303
+ "tie_encoder_decoder": false,
304
+ "tie_word_embeddings": true,
305
+ "tokenizer_class": null,
306
+ "top_k": 50,
307
+ "top_p": 1.0,
308
+ "torch_dtype": "bfloat16",
309
+ "torchscript": false,
310
+ "typical_p": 1.0,
311
+ "use_bfloat16": false,
312
+ "vision_use_head": false
313
+ },
314
+ "vision_tower_lr": null,
315
+ "weight_memory_efficient": true,
316
+ "auto_map": {
317
+ "AutoProcessor": "auto_processor.VILAProcessor",
318
+ "AutoConfig": "modeling_vila.VILAConfig",
319
+ "AutoModel": "modeling_vila.VILAForCasualLM",
320
+ "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM"
321
+ }
322
+ }
configuration_vila.py ADDED
@@ -0,0 +1,93 @@
1
+ import json
2
+ import math
3
+ import os
4
+ import os.path as osp
5
+ from copy import deepcopy
6
+ from threading import Thread
7
+ from typing import List, Optional
8
+
9
+ import torch
10
+ import torchvision
11
+ from PIL import Image
12
+ from transformers import (
13
+ AutoProcessor,
14
+ PretrainedConfig,
15
+ PreTrainedModel,
16
+ Qwen2Config,
17
+ Qwen2ForCausalLM,
18
+ Qwen2PreTrainedModel,
19
+ TextIteratorStreamer,
20
+ )
21
+
22
+
23
+ class VILAConfig(PretrainedConfig):
24
+ model_type = "vila"
25
+ keys_to_ignore_at_inference = ["past_key_values"]
26
+
27
+ def __init__(
28
+ self,
29
+ llm_cfg=None,
30
+ vision_tower_cfg=None,
31
+ mm_projector_cfg=None,
32
+ architectures=None,
33
+ resume_path=None,
34
+ hidden_size=None,
35
+ mm_hidden_size=None,
36
+ image_aspect_ratio=None,
37
+ num_video_frames=None,
38
+ fps=None,
39
+ mm_vision_select_layer=None,
40
+ mm_vision_select_feature=None,
41
+ mm_use_im_start_end=False,
42
+ mm_use_im_patch_token=False,
43
+ mm_projector_lr=None,
44
+ vision_tower_lr=None,
45
+ vision_resolution=None,
46
+ interpolate_mode=None,
47
+ s2=None,
48
+ dynamic_s2=None,
49
+ s2_scales=None,
50
+ s2_max_split_size=None,
51
+ s2_resize_output_to_scale_idx=0,
52
+ min_tiles: Optional[int] = 1,
53
+ max_tiles: Optional[int] = 12,
54
+ num_time_tokens=None,
55
+ time_token_format=None,
56
+ image_encoder: str = '{"_target_": "llava.model.encoders.BasicImageEncoder"}',
57
+ video_encoder: str = '{"_target_": "llava.model.encoders.BasicVideoEncoder"}',
58
+ **kwargs,
59
+ ):
60
+ super().__init__()
61
+ self.architectures = architectures
62
+ self.llm_cfg = llm_cfg
63
+ self.vision_tower_cfg = vision_tower_cfg
64
+ self.mm_projector_cfg = mm_projector_cfg
65
+ self.resume_path = resume_path
66
+
67
+ self.hidden_size = hidden_size
68
+ self.mm_hidden_size = mm_hidden_size
69
+ self.image_aspect_ratio = image_aspect_ratio
70
+ self.num_video_frames = num_video_frames
71
+ self.fps = fps
72
+ self.mm_vision_select_layer = mm_vision_select_layer
73
+ self.mm_vision_select_feature = mm_vision_select_feature
74
+ self.mm_use_im_start_end = mm_use_im_start_end
75
+ self.mm_use_im_patch_token = mm_use_im_patch_token
76
+ self.mm_projector_lr = mm_projector_lr
77
+ self.vision_tower_lr = vision_tower_lr
78
+ self.vision_resolution = vision_resolution
79
+ self.interpolate_mode = interpolate_mode
80
+ self.s2 = s2
81
+ self.dynamic_s2 = dynamic_s2
82
+ self.s2_scales = s2_scales
83
+ self.s2_max_split_size = s2_max_split_size
84
+ self.s2_resize_output_to_scale_idx = s2_resize_output_to_scale_idx
85
+ self.min_tiles = min_tiles
86
+ self.max_tiles = max_tiles
87
+ self.num_time_tokens = num_time_tokens
88
+ self.time_token_format = time_token_format
89
+
90
+ self.image_encoder = image_encoder
91
+ self.video_encoder = video_encoder
92
+
93
+ super().__init__(**kwargs)
constants.py ADDED
@@ -0,0 +1,43 @@
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # SPDX-License-Identifier: Apache-2.0
+
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "."
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ DEFAULT_IMAGE_TOKEN = "<image>"
+
+ SENTINEL_TOKEN = "<vila/sentinel>"
+ MEDIA_TOKENS = {
+     "image": "<image>",
+     "video": "<vila/video>",
+ }
+ # <image> <vila/video> <vila/sentinel>
+ # TODO(ligeng): need to discuss with Zhijian for the following tokens for different models.
+ """
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151646: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151647: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151648: AddedToken("<vila/sentinel>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151649: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ 151650: AddedToken("<vila/video>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+ """
+ NUM_EXTRA_TOKENS = 8
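A minimal sketch of how the media tokens above are typically used to splice images into a text prompt; the splitting logic is illustrative only, not the repository's actual preprocessing code:

```python
# Mirrors the constants defined in constants.py above so the snippet is self-contained.
DEFAULT_IMAGE_TOKEN = "<image>"
MEDIA_TOKENS = {"image": "<image>", "video": "<vila/video>"}

prompt = DEFAULT_IMAGE_TOKEN + "\nDescribe the picture."
# Each occurrence of the image token marks where one image's features get inserted.
text_chunks = prompt.split(MEDIA_TOKENS["image"])
print(len(text_chunks) - 1)  # 1 image placeholder in this prompt
```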
conversation.py ADDED
@@ -0,0 +1,191 @@
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ # SPDX-License-Identifier: Apache-2.0
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
+
+ import dataclasses
+ from enum import Enum, auto
+ from typing import List
+
+ # from llava.utils.logging import logger
+
+
+ class SeparatorStyle(Enum):
+     """Different separator style."""
+
+     AUTO = auto()
+     TWO = auto()
+     MPT = auto()
+     PLAIN = auto()
+     LLAMA_3 = auto()
+
+
+ @dataclasses.dataclass
+ class Conversation:
+     """A class that keeps all conversation history."""
+
+     system: str
+     roles: List[str]
+     messages: List[List[str]]
+     sep_style: SeparatorStyle = SeparatorStyle.AUTO
+     sep: str = "###"
+     sep2: str = None
+     version: str = "Unknown"
+
+     def get_prompt(self):
+         messages = self.messages
+         if len(messages) > 0 and type(messages[0][1]) is tuple:
+             messages = self.messages.copy()
+             init_role, init_msg = messages[0].copy()
+             init_msg = init_msg[0].replace("<image>", "").strip()
+             messages[0] = (init_role, "<image>\n" + init_msg)
+
+         if self.sep_style == SeparatorStyle.TWO:
+             seps = [self.sep, self.sep2]
+             ret = self.system + seps[0]
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + ": " + message + seps[i % 2]
+                 else:
+                     ret += role + ":"
+         elif self.sep_style == SeparatorStyle.LLAMA_3:
+             ret = self.system + self.sep
+             for rid, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message = message[0]
+                     sep = self.sep if rid < len(messages) - 1 else self.sep2
+                     ret += role + message + sep
+                 else:
+                     ret += role
+         elif self.sep_style == SeparatorStyle.MPT:
+             ret = self.system + self.sep
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + message + self.sep
+                 else:
+                     ret += role
+         elif self.sep_style == SeparatorStyle.PLAIN:
+             seps = [self.sep, self.sep2]
+             ret = self.system
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += message + seps[i % 2]
+                 else:
+                     ret += ""
+         else:
+             raise ValueError(f"Invalid style: {self.sep_style}")
+
+         return ret
+
+     def append_message(self, role, message):
+         self.messages.append([role, message])
+
+     def copy(self):
+         return Conversation(
+             system=self.system,
+             roles=self.roles,
+             messages=[[x, y] for x, y in self.messages],
+             sep_style=self.sep_style,
+             sep=self.sep,
+             sep2=self.sep2,
+             version=self.version,
+         )
+
+
+ conv_auto = Conversation(
+     system="",
+     roles=("", ""),
+     messages=(),
+     sep_style=SeparatorStyle.AUTO,
+     sep="\n",
+ )
+
+ conv_vicuna_v1 = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+     "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+     roles=("USER", "ASSISTANT"),
+     version="v1",
+     messages=(),
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+ )
+
+ conv_llava_plain = Conversation(
+     system="",
+     roles=("", ""),
+     messages=(),
+     sep_style=SeparatorStyle.PLAIN,
+     sep="\n",
+ )
+
+ hermes_2 = Conversation(
+     system="<|im_start|>system\nAnswer the questions.",
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>",
+     messages=(),
+     version="hermes-2",
+ )
+
+ # Template added by Yukang. Note (kentang-mit@): sep is <|eot_id|> for official template.
+ llama_3_chat = Conversation(
+     system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. "
+     "You are able to understand the visual content that the user provides, "
+     "and assist the user with a variety of tasks using natural language.",
+     roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+     version="llama_v3",
+     messages=(),
+     sep_style=SeparatorStyle.LLAMA_3,
+     sep="<|eot_id|>",
+     sep2="<|end_of_text|>",
+ )
+
+
+ default_conversation = conv_auto
+ conv_templates = {
+     "auto": conv_auto,
+     "hermes-2": hermes_2,
+     "llama_3": llama_3_chat,
+     "v1": conv_vicuna_v1,
+     "vicuna_v1": conv_vicuna_v1,
+     "plain": conv_llava_plain,
+ }
+
+
+ CONVERSATION_MODE_MAPPING = {
+     "vila1.5-3b": "vicuna_v1",
+     "vila1.5-8b": "llama_3",
+     "vila1.5-13b": "vicuna_v1",
+     "vila1.5-40b": "hermes-2",
+     "llama-3": "llama_3",
+     "llama3": "llama_3",
+ }
+
+
+ def auto_set_conversation_mode(model_name_or_path: str) -> str:
+     global default_conversation
+     for k, v in CONVERSATION_MODE_MAPPING.items():
+         if k in model_name_or_path.lower():
+             print(f"Setting conversation mode to `{v}` based on model name/path `{model_name_or_path}`.")
+             default_conversation = conv_templates[v]
+             return
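A minimal usage sketch of the templates defined above, assuming `conversation.py` is importable from the working directory (the import path is an assumption, not part of the diff):

```python
from conversation import conv_templates  # assumes conversation.py is on the Python path

# Build a hermes-2 style prompt; passing None for the assistant turn leaves the
# prompt open so generation continues from the assistant role tag.
conv = conv_templates["hermes-2"].copy()
conv.append_message(conv.roles[0], "<image>\nDescribe this image.")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
# <|im_start|>system
# Answer the questions.<|im_end|><|im_start|>user
# <image>
# Describe this image.<|im_end|><|im_start|>assistant
```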
distributed.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import warnings
+ from typing import Any, List, Optional
+
+ from torch import distributed as dist
+
+ __all__ = [
+     "init",
+     "is_initialized",
+     "size",
+     "rank",
+     "local_size",
+     "local_rank",
+     "is_main",
+     "barrier",
+     "gather",
+     "all_gather",
+ ]
+
+
+ def init() -> None:
+     if "RANK" not in os.environ:
+         warnings.warn("Environment variable `RANK` is not set. Skipping distributed initialization.")
+         return
+     dist.init_process_group(backend="nccl", init_method="env://")
+
+
+ def is_initialized() -> bool:
+     return dist.is_initialized()
+
+
+ def size() -> int:
+     return int(os.environ.get("WORLD_SIZE", 1))
+
+
+ def rank() -> int:
+     return int(os.environ.get("RANK", 0))
+
+
+ def local_size() -> int:
+     return int(os.environ.get("LOCAL_WORLD_SIZE", 1))
+
+
+ def local_rank() -> int:
+     return int(os.environ.get("LOCAL_RANK", 0))
+
+
+ def is_main() -> bool:
+     return rank() == 0
+
+
+ def barrier() -> None:
+     dist.barrier()
+
+
+ def gather(obj: Any, dst: int = 0) -> Optional[List[Any]]:
+     if not is_initialized():
+         return [obj]
+     if is_main():
+         objs = [None for _ in range(size())]
+         dist.gather_object(obj, objs, dst=dst)
+         return objs
+     else:
+         dist.gather_object(obj, dst=dst)
+         return None
+
+
+ def all_gather(obj: Any) -> List[Any]:
+     if not is_initialized():
+         return [obj]
+     objs = [None for _ in range(size())]
+     dist.all_gather_object(objs, obj)
+     return objs
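The helpers above fall back gracefully when no process group has been initialized, so the same code path works in a single-process run. A minimal sketch, assuming the file is importable as a module named `distributed`:

```python
import distributed as dist_utils  # assumes distributed.py is on the Python path

dist_utils.init()                  # warns and returns early when RANK is unset
print(dist_utils.size())           # 1 when WORLD_SIZE is unset
print(dist_utils.is_main())        # True, since rank defaults to 0
# Without an initialized process group, all_gather simply wraps the local object.
print(dist_utils.all_gather({"rank": dist_utils.rank()}))  # [{'rank': 0}]
```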
llm/added_tokens.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "</tool_call>": 151658,
+   "<image>": 151666,
+   "<tool_call>": 151657,
+   "<vila/sentinel>": 151665,
+   "<vila/video>": 151667,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652,
+   "[BOS]": 151668,
+   "[PAD]": 151669
+ }
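The VILA-specific additions above (`<vila/sentinel>` 151665, `<image>` 151666, `<vila/video>` 151667, `[BOS]` 151668, `[PAD]` 151669) sit on top of Qwen2.5's stock special tokens. A quick consistency check (a sketch, assuming the tokenizer in the `llm/` subfolder loads directly with `AutoTokenizer`):

```python
from transformers import AutoTokenizer

# Load the tokenizer shipped in the llm/ subfolder of this repository.
tok = AutoTokenizer.from_pretrained("turing-motors/Heron-NVILA-Lite-15B", subfolder="llm")
print(tok.convert_tokens_to_ids("<image>"))          # expected: 151666
print(tok.convert_tokens_to_ids("<vila/sentinel>"))  # expected: 151665
```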
llm/config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/llm",
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 5120,
+   "initializer_range": 0.02,
+   "intermediate_size": 13824,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 70,
+   "model_max_length": 4096,
+   "model_type": "qwen2",
+   "num_attention_heads": 40,
+   "num_hidden_layers": 48,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.45.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151670
+ }
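The attention geometry follows from the values above: 40 query heads over a hidden size of 5120 give a head dimension of 128, and 8 key/value heads mean each KV head is shared by 5 query heads (grouped-query attention). A small arithmetic check on the values copied from the config:

```python
# Values copied from llm/config.json above.
hidden_size = 5120
num_attention_heads = 40
num_key_value_heads = 8

head_dim = hidden_size // num_attention_heads                      # 128
queries_per_kv_head = num_attention_heads // num_key_value_heads   # 5
print(head_dim, queries_per_kv_head)
```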
llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.45.0"
+ }
llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llm/model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee395da79dd9faebfc541bbdfe73fb1132db12e346e314f9868b409a2c217e26
+ size 4982176720
llm/model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a80b8643a787f40bb0908614f36194ebfb1d8415718f67372a39844781a45630
+ size 4954847344
llm/model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70217c6ce2d4f73e162b81efe72b3498b1a315ff1517d6e97913194752ffaf68
+ size 4954847392
llm/model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ecab4ac8e07a9e0eb391f72cdc7b6cc895b5fbaf0809917d04669d3d9f8e2e7
+ size 4954847392
llm/model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a861359cf100d5879e2c27d4f840444645cb2efe616f449d1ccc43671a5a3365
+ size 4954847392
llm/model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cae8fcc5d91e703f05bb4eb6498e49838badc865fc41bba9a5b1650742c049d
+ size 4730498600
llm/model.safetensors.index.json ADDED
@@ -0,0 +1,586 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 29531998208
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00006-of-00006.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00006.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00006.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00006.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00006.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00006.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00006.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00006.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00006.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00006.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00006.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00006.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00004-of-00006.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00006.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00006.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00006.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00006.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00006.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00006.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00006.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00006.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00005-of-00006.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00005-of-00006.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00005-of-00006.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
368
+ "model.layers.36.input_layernorm.weight": "model-00005-of-00006.safetensors",
369
+ "model.layers.36.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
370
+ "model.layers.36.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
371
+ "model.layers.36.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
372
+ "model.layers.36.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
373
+ "model.layers.36.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
374
+ "model.layers.36.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
375
+ "model.layers.36.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
376
+ "model.layers.36.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
377
+ "model.layers.36.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
378
+ "model.layers.36.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
379
+ "model.layers.36.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
380
+ "model.layers.37.input_layernorm.weight": "model-00005-of-00006.safetensors",
381
+ "model.layers.37.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
382
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
383
+ "model.layers.37.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
384
+ "model.layers.37.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
385
+ "model.layers.37.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
386
+ "model.layers.37.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
387
+ "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
388
+ "model.layers.37.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
389
+ "model.layers.37.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
390
+ "model.layers.37.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
391
+ "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
392
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00006.safetensors",
393
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
394
+ "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
395
+ "model.layers.38.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
396
+ "model.layers.38.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
397
+ "model.layers.38.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
398
+ "model.layers.38.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
399
+ "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
400
+ "model.layers.38.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
401
+ "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
402
+ "model.layers.38.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
403
+ "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
404
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00006.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
406
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
407
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
408
+ "model.layers.39.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
409
+ "model.layers.39.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
410
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
411
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
412
+ "model.layers.39.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
413
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
414
+ "model.layers.39.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
415
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
421
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
422
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
423
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
424
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
425
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
426
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
427
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
428
+ "model.layers.40.input_layernorm.weight": "model-00005-of-00006.safetensors",
429
+ "model.layers.40.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
430
+ "model.layers.40.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
431
+ "model.layers.40.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
432
+ "model.layers.40.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
433
+ "model.layers.40.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
434
+ "model.layers.40.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
435
+ "model.layers.40.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
436
+ "model.layers.40.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
437
+ "model.layers.40.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
438
+ "model.layers.40.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
439
+ "model.layers.40.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
440
+ "model.layers.41.input_layernorm.weight": "model-00005-of-00006.safetensors",
441
+ "model.layers.41.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
442
+ "model.layers.41.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
443
+ "model.layers.41.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
444
+ "model.layers.41.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
445
+ "model.layers.41.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
446
+ "model.layers.41.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
447
+ "model.layers.41.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
448
+ "model.layers.41.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
449
+ "model.layers.41.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
450
+ "model.layers.41.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
451
+ "model.layers.41.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
452
+ "model.layers.42.input_layernorm.weight": "model-00006-of-00006.safetensors",
453
+ "model.layers.42.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
454
+ "model.layers.42.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
455
+ "model.layers.42.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
456
+ "model.layers.42.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
457
+ "model.layers.42.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
458
+ "model.layers.42.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
459
+ "model.layers.42.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
460
+ "model.layers.42.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
461
+ "model.layers.42.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
462
+ "model.layers.42.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
463
+ "model.layers.42.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
464
+ "model.layers.43.input_layernorm.weight": "model-00006-of-00006.safetensors",
465
+ "model.layers.43.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
466
+ "model.layers.43.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
467
+ "model.layers.43.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
468
+ "model.layers.43.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
469
+ "model.layers.43.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
470
+ "model.layers.43.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
471
+ "model.layers.43.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
472
+ "model.layers.43.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
473
+ "model.layers.43.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
474
+ "model.layers.43.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
475
+ "model.layers.43.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
476
+ "model.layers.44.input_layernorm.weight": "model-00006-of-00006.safetensors",
477
+ "model.layers.44.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
478
+ "model.layers.44.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
479
+ "model.layers.44.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
480
+ "model.layers.44.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
481
+ "model.layers.44.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
482
+ "model.layers.44.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
483
+ "model.layers.44.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
484
+ "model.layers.44.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
485
+ "model.layers.44.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
486
+ "model.layers.44.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
487
+ "model.layers.44.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
488
+ "model.layers.45.input_layernorm.weight": "model-00006-of-00006.safetensors",
489
+ "model.layers.45.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
490
+ "model.layers.45.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
491
+ "model.layers.45.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
492
+ "model.layers.45.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
493
+ "model.layers.45.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
494
+ "model.layers.45.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
495
+ "model.layers.45.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
496
+ "model.layers.45.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
497
+ "model.layers.45.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
498
+ "model.layers.45.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
499
+ "model.layers.45.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
500
+ "model.layers.46.input_layernorm.weight": "model-00006-of-00006.safetensors",
501
+ "model.layers.46.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
502
+ "model.layers.46.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
503
+ "model.layers.46.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
504
+ "model.layers.46.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
505
+ "model.layers.46.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
506
+ "model.layers.46.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
507
+ "model.layers.46.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
508
+ "model.layers.46.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
509
+ "model.layers.46.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
510
+ "model.layers.46.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
511
+ "model.layers.46.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
512
+ "model.layers.47.input_layernorm.weight": "model-00006-of-00006.safetensors",
513
+ "model.layers.47.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
514
+ "model.layers.47.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
515
+ "model.layers.47.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
516
+ "model.layers.47.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
517
+ "model.layers.47.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
518
+ "model.layers.47.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
519
+ "model.layers.47.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
520
+ "model.layers.47.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
521
+ "model.layers.47.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
522
+ "model.layers.47.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
523
+ "model.layers.47.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
524
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00006.safetensors",
525
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
526
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
527
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
528
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
529
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
530
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
531
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
532
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
533
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
534
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
535
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
536
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
537
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
538
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
539
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
540
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
541
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
542
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
543
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
544
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
545
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
546
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
547
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
548
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
549
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
550
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
551
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
552
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
553
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
554
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
555
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
556
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
557
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
558
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
559
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
560
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors",
561
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
562
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
563
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
564
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
565
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
566
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
567
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
568
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
569
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
570
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
571
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
572
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
573
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
574
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
575
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
576
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
577
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
578
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
579
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
580
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
581
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
582
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
583
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
584
+ "model.norm.weight": "model-00006-of-00006.safetensors"
585
+ }
586
+ }
llm/special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>",
+ "<vila/sentinel>",
+ "<image>",
+ "<vila/video>"
+ ],
+ "bos_token": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2adb5255020285bad13f10e6c896570ffe9c35c1b5c0ea587e6ec9662b84f6ea
+ size 11422819
llm/tokenizer_config.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<vila/sentinel>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<image>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<vila/video>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "[BOS]",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "[PAD]",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ }
221
+ },
222
+ "additional_special_tokens": [
223
+ "<|im_start|>",
224
+ "<|im_end|>",
225
+ "<|object_ref_start|>",
226
+ "<|object_ref_end|>",
227
+ "<|box_start|>",
228
+ "<|box_end|>",
229
+ "<|quad_start|>",
230
+ "<|quad_end|>",
231
+ "<|vision_start|>",
232
+ "<|vision_end|>",
233
+ "<|vision_pad|>",
234
+ "<|image_pad|>",
235
+ "<|video_pad|>",
236
+ "<vila/sentinel>",
237
+ "<image>",
238
+ "<vila/video>"
239
+ ],
240
+ "bos_token": "[BOS]",
241
+ "chat_template": "{% if messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\n以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。<|im_end|>\\n' }}{% endif %}{% for message in messages if message['content'] is not none %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
242
+ "clean_up_tokenization_spaces": false,
243
+ "eos_token": "<|im_end|>",
244
+ "errors": "replace",
245
+ "legacy": false,
246
+ "model_max_length": 4096,
247
+ "pad_token": "[PAD]",
248
+ "padding_side": "right",
249
+ "split_special_tokens": false,
250
+ "tokenizer_class": "Qwen2Tokenizer",
251
+ "unk_token": null
252
+ }
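The `chat_template` above is ChatML-style with a Japanese default system prompt ("以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。"). A rough sketch of how it expands, assuming the tokenizer under `llm/` loads as a standard Qwen2 tokenizer (the `subfolder` argument and the example message are illustrative, not part of this diff):

```python
from transformers import AutoTokenizer

# Hypothetical load path; the tokenizer files shown in this diff live under llm/.
tokenizer = AutoTokenizer.from_pretrained("turing-motors/Heron-NVILA-Lite-15B", subfolder="llm")

messages = [{"role": "user", "content": "こんにちは"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected form, per the template above:
# <|im_start|>system
# 以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。<|im_end|>
# <|im_start|>user
# こんにちは<|im_end|>
# <|im_start|>assistant
```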
llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
loss.py ADDED
@@ -0,0 +1,48 @@
+ from typing import List, Union
+
+ import torch
+ from torch.nn.functional import cross_entropy
+
+ from .constants import IGNORE_INDEX
+
+ __all__ = ["soft_cross_entropy"]
+
+
+ def soft_cross_entropy(
+     outputs: torch.Tensor,
+     targets: torch.Tensor,
+     soft_tokens: Union[torch.Tensor, List[int]],
+     std: float = 1,
+     ignore_index: int = IGNORE_INDEX,
+ ) -> torch.Tensor:
+     # Remove last token from outputs and first token from targets
+     outputs = outputs[..., :-1, :].contiguous()
+     targets = targets[..., 1:].contiguous()
+
+     # Flatten outputs and targets
+     targets = targets.view(-1)
+     outputs = outputs.view(targets.size(0), -1)
+
+     # Remove outputs and targets with ignore_index
+     indices = targets != ignore_index
+     outputs = outputs[indices]
+     targets = targets[indices]
+
+     # Convert soft token IDs to tensor
+     if isinstance(soft_tokens, list):
+         soft_tokens = torch.tensor(soft_tokens).to(targets)
+
+     # Calculate loss for non-soft tokens
+     indices = torch.isin(targets, soft_tokens, invert=True)
+     loss = cross_entropy(outputs[indices], targets[indices], reduction="sum")
+
+     # Calculate loss for soft tokens
+     indices = torch.isin(targets, soft_tokens)
+     targets_indices = torch.zeros_like(outputs[indices])
+     for k, target in enumerate(targets[indices]):
+         dist = torch.exp(-((target - soft_tokens) ** 2) / (2 * std**2))
+         targets_indices[k][soft_tokens] = dist / dist.sum()
+     loss += cross_entropy(outputs[indices], targets_indices, reduction="sum")
+
+     # Return average loss
+     return loss / targets.size(0)
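A toy invocation of `soft_cross_entropy` for orientation (the shapes, token IDs, and the assumption that `IGNORE_INDEX` is `-100` are illustrative only, not taken from this repository):

```python
import torch

vocab_size, seq_len = 32, 6
logits = torch.randn(1, seq_len, vocab_size)        # [batch, seq, vocab]
labels = torch.tensor([[5, 7, 20, 21, -100, 22]])   # -100 marks ignored positions
soft_tokens = list(range(20, 30))                   # hypothetical IDs that receive soft (Gaussian) targets

loss = soft_cross_entropy(logits, labels, soft_tokens, std=1.0, ignore_index=-100)
print(loss)  # scalar; after the internal shift, 7 is a hard target and 20/21/22 get soft targets
```

Tokens listed in `soft_tokens` are trained against a Gaussian distribution over neighbouring soft-token IDs instead of a one-hot target, which helps when adjacent IDs encode adjacent numeric values.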
main.py ADDED
File without changes
media.py ADDED
@@ -0,0 +1,129 @@
1
+ import glob
2
+ import os
3
+ from collections import defaultdict
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import PIL
9
+ import PIL.Image
10
+ import requests
11
+ from transformers import PretrainedConfig
12
+
13
+ # from llava.constants import MEDIA_TOKENS
14
+ # from llava.media import Image, Video
15
+ # from llava.utils import make_list
16
+ # from llava.utils.logging import logger
17
+
18
+ MEDIA_TOKENS = {
19
+ "image": "<image>",
20
+ "video": "<vila/video>",
21
+ }
22
+
23
+
24
+ class Media:
25
+ pass
26
+
27
+
28
+ class File(Media):
29
+ def __init__(self, path: str) -> None:
30
+ self.path = path
31
+
32
+
33
+ class Image(File):
34
+ pass
35
+
36
+
37
+ class Video(File):
38
+ pass
39
+
40
+
41
+ def make_list(obj: Any) -> List:
42
+ return obj if isinstance(obj, list) else [obj]
43
+
44
+
45
+ def _extract_image(image: Union[Image, PIL.Image.Image]) -> PIL.Image.Image:
46
+ if isinstance(image, Image):
47
+ if image.path.startswith("http://") or image.path.startswith("https://"):
48
+ image = PIL.Image.open(requests.get(image.path, stream=True).raw)
49
+ else:
50
+ image = PIL.Image.open(image.path)
51
+ return image
52
+
53
+
54
+ def _load_video(video_path: str, *, num_frames: int) -> List[PIL.Image.Image]:
55
+ # Load video frames from a directory
56
+ if os.path.isdir(video_path):
57
+ frame_paths = sorted(glob.glob(os.path.join(video_path, "*")))
58
+ indices = np.round(np.linspace(0, len(frame_paths) - 1, num_frames)).astype(int)
59
+ return [PIL.Image.open(frame_paths[index]) for index in indices]
60
+
61
+ # Load video frames from a video file
62
+ vidcap = cv2.VideoCapture(video_path)
63
+
64
+ # Find the last frame as frame count might not be accurate
65
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
66
+ while frame_count > 0:
67
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
68
+ if vidcap.grab():
69
+ break
70
+ frame_count -= 1
71
+ else:
72
+ raise ValueError(f"Video '{video_path}' has no frames.")
73
+
74
+ # Extract frames uniformly
75
+ indices = np.round(np.linspace(0, frame_count - 1, num_frames)).astype(int)
76
+ frames = {}
77
+ for index in indices:
78
+ if index in frames:
79
+ continue
80
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
81
+ success, frame = vidcap.read()
82
+ if not success:
83
+ print(f"Failed to read frame {index} from video '{video_path}'. Skipped.")
84
+ continue
85
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
86
+ frames[index] = PIL.Image.fromarray(frame)
87
+ return [frames[index] for index in indices if index in frames]
88
+
89
+
90
+ def _extract_video(video: Video, config: PretrainedConfig) -> List[PIL.Image.Image]:
91
+ num_frames = config.num_video_frames
92
+ if getattr(config, "fps") != 0:
93
+ print("Extracting frames from video with specified FPS is not supported yet. Ignored.")
94
+
95
+ frames = _load_video(video.path, num_frames=num_frames)
96
+ return frames
97
+
98
+
99
+ def extract_media(
100
+ messages: List[Dict[str, Any]],
101
+ config: Optional[PretrainedConfig] = None,
102
+ draft: bool = False,
103
+ ) -> Dict[str, List[Any]]:
104
+ media = defaultdict(list)
105
+ for message in messages:
106
+ text = ""
107
+ for part in make_list(message["value"]):
108
+ if isinstance(part, str):
109
+ for token in MEDIA_TOKENS.values():
110
+ if token in part:
111
+ print(f"Media token '{token}' found in text: '{part}'. Removed.")
112
+ part = part.replace(token, "").strip()
113
+ text += part
114
+ elif isinstance(part, (Image, PIL.Image.Image)):
115
+ if draft:
116
+ media["image"].append(part)
117
+ else:
118
+ media["image"].append(_extract_image(part))
119
+ text += MEDIA_TOKENS["image"]
120
+ elif isinstance(part, Video):
121
+ if draft:
122
+ media["video"].append(part)
123
+ else:
124
+ media["video"].append(_extract_video(part, config))
125
+ text += MEDIA_TOKENS["video"]
126
+ else:
127
+ raise ValueError(f"Unsupported prompt part type: {type(part)}")
128
+ message["value"] = text
129
+ return media
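A rough usage sketch of `extract_media` (the message layout mirrors the `{"value": [...]}` structure the function iterates over; the image URL is just an example):

```python
import PIL.Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
pil_image = PIL.Image.open(requests.get(url, stream=True).raw)

messages = [{"value": [pil_image, "画像を説明してください。"]}]
media = extract_media(messages)

print(media["image"])        # [<PIL.Image.Image ...>]
print(messages[0]["value"])  # "<image>画像を説明してください。" -- text flattened, media token inserted
```

Note that the function mutates `messages` in place, replacing each `value` with a flat string in which every image or video part has been swapped for its media token.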
media_encoder.py ADDED
@@ -0,0 +1,101 @@
1
+ from functools import partial
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+
8
+ class BaseEncoder(nn.Module):
9
+ def __init__(self, parent: nn.Module) -> None:
10
+ super().__init__()
11
+ self._parent = [parent]
12
+
13
+ @property
14
+ def parent(self) -> nn.Module:
15
+ return self._parent[0]
16
+
17
+
18
+ class BasicImageEncoder(BaseEncoder):
19
+ def __init__(
20
+ self,
21
+ parent: torch.nn.Module,
22
+ start_tokens: Optional[str] = None,
23
+ end_tokens: Optional[str] = "\n",
24
+ ) -> None:
25
+ super().__init__(parent)
26
+ self.start_tokens = start_tokens
27
+ self.end_tokens = end_tokens
28
+
29
+ def embed_tokens(self, tokens: Optional[str]) -> Optional[torch.Tensor]:
30
+ if tokens is None:
31
+ return None
32
+ token_ids = self.parent.tokenizer(tokens).input_ids
33
+ token_ids = torch.tensor(token_ids, device=self.parent.device)
34
+ return self.parent.llm.model.embed_tokens(token_ids)
35
+
36
+ def _process_features(
37
+ self,
38
+ features: torch.Tensor,
39
+ start_token_embeds: Optional[torch.Tensor],
40
+ end_token_embeds: Optional[torch.Tensor],
41
+ ) -> torch.Tensor:
42
+ if start_token_embeds is not None:
43
+ features = torch.cat([start_token_embeds, features], dim=0)
44
+ if end_token_embeds is not None:
45
+ features = torch.cat([features, end_token_embeds], dim=0)
46
+ return features
47
+
48
+ def forward(self, images: List[torch.Tensor], config: Dict[str, Any]) -> List[torch.Tensor]:
49
+ images = torch.stack(images, dim=0)
50
+ features = self.parent.encode_images(images, block_sizes=config.get("block_sizes"))
51
+ process_features = partial(
52
+ self._process_features,
53
+ start_token_embeds=self.embed_tokens(self.start_tokens),
54
+ end_token_embeds=self.embed_tokens(self.end_tokens),
55
+ )
56
+ return [process_features(f) for f in features]
57
+
58
+
59
+ class BasicVideoEncoder(BaseEncoder):
60
+ def __init__(
61
+ self,
62
+ parent: torch.nn.Module,
63
+ start_tokens: Optional[str] = None,
64
+ end_tokens: Optional[str] = "\n",
65
+ ) -> None:
66
+ super().__init__(parent)
67
+ self.start_tokens = start_tokens
68
+ self.end_tokens = end_tokens
69
+
70
+ def embed_tokens(self, tokens: Optional[str]) -> Optional[torch.Tensor]:
71
+ if tokens is None:
72
+ return None
73
+ token_ids = self.parent.tokenizer(tokens).input_ids
74
+ token_ids = torch.tensor(token_ids, device=self.parent.device)
75
+ return self.parent.llm.model.embed_tokens(token_ids)
76
+
77
+ def _process_features(
78
+ self,
79
+ features: torch.Tensor,
80
+ start_token_embeds: Optional[torch.Tensor],
81
+ end_token_embeds: Optional[torch.Tensor],
82
+ ) -> torch.Tensor:
83
+ if start_token_embeds is not None:
84
+ start_embeds = torch.stack([start_token_embeds] * features.shape[0], dim=0)
85
+ features = torch.cat([start_embeds, features], dim=1)
86
+ if end_token_embeds is not None:
87
+ end_embeds = torch.stack([end_token_embeds] * features.shape[0], dim=0)
88
+ features = torch.cat([features, end_embeds], dim=1)
89
+ return features.flatten(0, 1)
90
+
91
+ def forward(self, videos: List[torch.Tensor], config: Dict[str, Any]) -> List[torch.Tensor]:
92
+ num_frames = [video.shape[0] for video in videos]
93
+ images = torch.cat(videos, dim=0)
94
+ features = self.parent.encode_images(images)
95
+ features = torch.split(features, num_frames)
96
+ process_features = partial(
97
+ self._process_features,
98
+ start_token_embeds=self.embed_tokens(self.start_tokens),
99
+ end_token_embeds=self.embed_tokens(self.end_tokens),
100
+ )
101
+ return [process_features(f) for f in features]
mm_projector/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/mm_projector",
+ "architectures": [
+ "MultimodalProjector"
+ ],
+ "mm_projector_type": "mlp_downsample_3x3_fix",
+ "model_type": "v2l_projector",
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.45.0"
+ }
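The `mm_projector_type` of `mlp_downsample_3x3_fix` names an NVILA-Lite style projector that spatially downsamples the vision tokens before projecting them into the LLM embedding space. The snippet below is only a conceptual sketch of that idea, not the `MultimodalProjector` implementation shipped in `base_projector.py`, and the dimensions are placeholders:

```python
import torch
import torch.nn as nn
from einops import rearrange

class Downsample3x3MLPSketch(nn.Module):
    """Concatenate each 3x3 neighbourhood of vision tokens (~9x fewer tokens), then project with an MLP."""

    def __init__(self, vision_dim: int = 1152, llm_dim: int = 5120):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(vision_dim * 9, llm_dim), nn.GELU(), nn.Linear(llm_dim, llm_dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [batch, h*w, vision_dim], h and w divisible by 3
        h = w = int(x.shape[1] ** 0.5)
        x = rearrange(x, "b (h w) d -> b h w d", h=h, w=w)
        x = rearrange(x, "b (h p1) (w p2) d -> b (h w) (p1 p2 d)", p1=3, p2=3)
        return self.mlp(x)
```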
mm_projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:117299ba5a53f595969b56b45068c3974b5ce68214bbb12b91518f87448252e1
+ size 159565424
mm_utils.py ADDED
@@ -0,0 +1,572 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ # dynamic_preprocess and find_closest_aspect_ratio are referenced from https://github.com/OpenGVLab/InternVL
18
+
19
+ import base64
20
+ import os
21
+ import tempfile
22
+ from io import BytesIO
23
+
24
+ import numpy as np
25
+ import torch
26
+ from PIL import Image
27
+ from transformers import StoppingCriteria
28
+
29
+ from .constants import DEFAULT_IMAGE_TOKEN
30
+
31
+
32
+ def get_frame_from_vcap(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
33
+ import cv2
34
+
35
+ if fps == None or frame_count == None:
36
+ # if one of fps or frame_count is None, still recompute
37
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
38
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
39
+ if fps == 0 or frame_count == 0:
40
+ print(f"Video file not found. return empty images. {video_file_name}")
41
+ return [
42
+ Image.new("RGB", (720, 720)),
43
+ ] * num_frames, 0
44
+
45
+ duration = frame_count / fps
46
+ frame_interval = frame_count // num_frames
47
+ if frame_interval == 0 and frame_count <= 1:
48
+ print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
49
+ return [
50
+ Image.new("RGB", (720, 720)),
51
+ ] * num_frames, 0
52
+ # print("duration:", duration, "frames:", frame_count, "intervals:", frame_interval)
53
+
54
+ images = []
55
+ count = 0
56
+ success = True
57
+ frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
58
+ while success:
59
+ # print("frame_count:", frame_count, "count:", count, "num_frames:", num_frames, "frame_interval:", frame_interval)
60
+ if frame_count >= num_frames:
61
+ success, frame = vidcap.read()
62
+ if count in frame_indices:
63
+ try:
64
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
65
+ im_pil = Image.fromarray(img)
66
+ images.append(im_pil)
67
+ except BaseException:
68
+ continue
69
+ if len(images) >= num_frames:
70
+ return images, num_frames
71
+ count += 1
72
+ else:
73
+ # Left padding frames if the video is not long enough
74
+ success, frame = vidcap.read()
75
+ if success:
76
+ try:
77
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
78
+ im_pil = Image.fromarray(img)
79
+ images.append(im_pil)
80
+ except BaseException:
81
+ continue
82
+ count += 1
83
+ else:
84
+ break
85
+ if len(images) == 0:
86
+ raise ValueError("Did not find enough frames in the video. return empty image.")
87
+
88
+ return images, len(images)
89
+
90
+
91
+ def get_frame_from_vcap_with_fps(vidcap, num_frames=10, max_fps=0.0, fps=None, frame_count=None, video_file_name=None):
92
+ """
93
+ num_frames is the max number of frames the model can support.
94
+ frame_count is the number of frames in the input video.
95
+ max_fps is the max FPS of the model can support.
96
+ fps is the fps of the input video.
97
+ """
98
+
99
+ import random
100
+
101
+ import cv2
102
+
103
+ if fps == None or frame_count == None:
104
+ # if one of fps or frame_count is None, still recompute
105
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
106
+ frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
107
+
108
+ if fps == 0 or frame_count == 0:
109
+ print(f"Video file not found. return empty images. {video_file_name}")
110
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
111
+ return [
112
+ Image.new("RGB", (720, 720)),
113
+ ] * empty_video_frames, 0
114
+
115
+ duration = frame_count / fps
116
+ # print("duration:", duration, "frames:", frame_count, "fps:", fps, "num_frames:", num_frames, "max_fps:", max_fps)
117
+ # If the video is too long (longer than max_fps and num_frames can support),
118
+ # we will use lower fps to sample frames.
119
+ if duration >= num_frames / max_fps:
120
+ frame_interval = frame_count // num_frames
121
+
122
+ # If the video is too short, we will skip the video if there is only one frame.
123
+ if frame_interval == 0 and frame_count <= 1:
124
+ print(f"frame_interval is equal to 0. return empty image. {video_file_name}")
125
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
126
+ return [
127
+ Image.new("RGB", (720, 720)),
128
+ ] * empty_video_frames, 0
129
+
130
+ images = []
131
+ count = 0
132
+ success = True
133
+ frame_indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
134
+
135
+ while success:
136
+ if frame_count >= num_frames:
137
+ # success, frame = vidcap.read()
138
+ if count in frame_indices:
139
+ success, frame = vidcap.read()
140
+ try:
141
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
142
+ im_pil = Image.fromarray(img)
143
+ images.append(im_pil)
144
+ except:
145
+ # print("Failed to read frame:", count)
146
+ continue
147
+ if len(images) >= num_frames:
148
+ return images, num_frames
149
+ else:
150
+ success = vidcap.grab()
151
+ count += 1
152
+ else:
153
+ # Left padding frames if the video is not long enough
154
+ success, frame = vidcap.read()
155
+ if success:
156
+ try:
157
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
158
+ im_pil = Image.fromarray(img)
159
+ images.append(im_pil)
160
+ except:
161
+ # print("Failed to read frame:", count)
162
+ continue
163
+ count += 1
164
+ else:
165
+ break
166
+ else:
167
+ frames_required = int(duration * max_fps)
168
+ frame_indices = np.linspace(0, frame_count - 1, frames_required, dtype=int)
169
+ if frames_required == 0:
170
+ print(f"frames_required is fewer than 2. Duration {duration}, return empty image.")
171
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
172
+ return [
173
+ Image.new("RGB", (720, 720)),
174
+ ] * empty_video_frames, 0
175
+ elif frames_required == 1:
176
+ frame_indices = np.linspace(0, frame_count - 1, 2, dtype=int)
177
+ images = []
178
+ count = 0
179
+ looked = 0
180
+ success = True
181
+
182
+ while success:
183
+ success, frame = vidcap.read()
184
+ if success and (looked in frame_indices):
185
+ try:
186
+ img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
187
+ im_pil = Image.fromarray(img)
188
+ images.append(im_pil)
189
+ except:
190
+ continue
191
+ count += 1
192
+ looked += 1
193
+
194
+ if len(images) == 0:
195
+ empty_video_frames = int(random.uniform(2, 8 * max_fps))
196
+ return [
197
+ Image.new("RGB", (720, 720)),
198
+ ] * empty_video_frames, 0
199
+ else:
200
+ return images, len(images)
201
+
202
+
203
+ def opencv_extract_frames(vpath_or_bytesio, frames=6, max_fps=0.0, fps=None, frame_count=None):
204
+ """
205
+ Extract frames from a video using OpenCV.
206
+
207
+ Args:
208
+ vpath_or_bytesio (str or BytesIO): Path to the video file or BytesIO object containing the video.
209
+ frames (int): Number of frames to extract from the video.
210
+ fps (float): Frames per second of the video. If 0.0, the function will extract frames at equal intervals.
211
+
212
+ Returns:
213
+ list: List of PIL Images extracted from the video.
214
+
215
+ Raises:
216
+ NotImplementedError: If the type of `vpath_or_bytesio` is not supported.
217
+ """
218
+ import cv2
219
+
220
+ if isinstance(vpath_or_bytesio, str):
221
+ vidcap = cv2.VideoCapture(vpath_or_bytesio)
222
+ if max_fps > 0.0:
223
+ return get_frame_from_vcap_with_fps(
224
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
225
+ )
226
+ return get_frame_from_vcap(
227
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=vpath_or_bytesio
228
+ )
229
+ elif isinstance(vpath_or_bytesio, (BytesIO,)):
230
+ # assuming mp4
231
+ with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as temp_video:
232
+ temp_video.write(vpath_or_bytesio.read())
233
+ temp_video_name = temp_video.name
234
+ vidcap = cv2.VideoCapture(temp_video_name)
235
+ if max_fps > 0.0:
236
+ return get_frame_from_vcap_with_fps(
237
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
238
+ )
239
+ return get_frame_from_vcap(
240
+ vidcap, frames, max_fps, fps=fps, frame_count=frame_count, video_file_name=temp_video_name
241
+ )
242
+ else:
243
+ raise NotImplementedError(type(vpath_or_bytesio))
244
+
245
+
246
+ def load_image_from_base64(image):
247
+ return Image.open(BytesIO(base64.b64decode(image)))
248
+
249
+
250
+ def expand2square(pil_img, background_color):
251
+ """
252
+ Expand the given PIL image to a square shape by adding padding.
253
+
254
+ Parameters:
255
+ - pil_img: The PIL image to be expanded.
256
+ - background_color: The color of the padding to be added.
257
+
258
+ Returns:
259
+ - The expanded PIL image.
260
+
261
+ If the image is already square, it is returned as is.
262
+ If the image is wider than it is tall, padding is added to the top and bottom.
263
+ If the image is taller than it is wide, padding is added to the left and right.
264
+ """
265
+ width, height = pil_img.size
266
+ if pil_img.mode == "L":
267
+ background_color = background_color[0]
268
+ if width == height:
269
+ return pil_img
270
+ elif width > height:
271
+ result = Image.new(pil_img.mode, (width, width), background_color)
272
+ result.paste(pil_img, (0, (width - height) // 2))
273
+ return result
274
+ else:
275
+ result = Image.new(pil_img.mode, (height, height), background_color)
276
+ result.paste(pil_img, ((height - width) // 2, 0))
277
+ return result
278
+
279
+
280
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
281
+ best_ratio_diff = float("inf")
282
+ best_ratio = (1, 1)
283
+ area = width * height
284
+ for ratio in target_ratios:
285
+ target_aspect_ratio = ratio[0] / ratio[1]
286
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
287
+ if ratio_diff < best_ratio_diff:
288
+ best_ratio_diff = ratio_diff
289
+ best_ratio = ratio
290
+ elif ratio_diff == best_ratio_diff:
291
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
292
+ best_ratio = ratio
293
+ return best_ratio
294
+
295
+
296
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=384, use_thumbnail=True):
297
+ orig_width, orig_height = image.size
298
+ aspect_ratio = orig_width / orig_height
299
+
300
+ # calculate the existing image aspect ratio
301
+ target_ratios = {
302
+ (i, j)
303
+ for n in range(min_num, max_num + 1)
304
+ for i in range(1, n + 1)
305
+ for j in range(1, n + 1)
306
+ if i * j <= max_num and i * j >= min_num
307
+ }
308
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
309
+
310
+ # find the closest aspect ratio to the target
311
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
312
+
313
+ # calculate the target width and height
314
+ target_width = image_size * target_aspect_ratio[0]
315
+ target_height = image_size * target_aspect_ratio[1]
316
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
317
+
318
+ # resize the image
319
+ resized_img = image.resize((target_width, target_height))
320
+ processed_images = []
321
+ for i in range(blocks):
322
+ box = (
323
+ (i % (target_width // image_size)) * image_size,
324
+ (i // (target_width // image_size)) * image_size,
325
+ ((i % (target_width // image_size)) + 1) * image_size,
326
+ ((i // (target_width // image_size)) + 1) * image_size,
327
+ )
328
+ # split the image
329
+ split_img = resized_img.crop(box)
330
+ processed_images.append(split_img)
331
+ assert len(processed_images) == blocks
332
+ if use_thumbnail and len(processed_images) != 1:
333
+ thumbnail_img = image.resize((image_size, image_size))
334
+ processed_images.append(thumbnail_img)
335
+ return processed_images
336
+
337
+
338
+ def dynamic_s2_preprocess(image, s2_scales=[384, 768, 1152], max_num=12, image_size=384):
339
+ orig_width, orig_height = image.size
340
+ aspect_ratio = orig_width / orig_height
341
+ min_num = (s2_scales[-1] // s2_scales[0]) ** 2 # at least use number of tiles as the largest scale
342
+
343
+ processed_images = []
344
+
345
+ ##########################################################################################
346
+ ############# Add tiles for all but the last scale using fixed squre ratio ###############
347
+ ##########################################################################################
348
+
349
+ for scale in s2_scales[:-1]:
350
+ target_width = image_size * (scale // s2_scales[0])
351
+ target_height = image_size * (scale // s2_scales[0])
352
+ blocks = (scale // s2_scales[0]) ** 2
353
+
354
+ # resize the image
355
+ resized_img = image.resize((target_width, target_height))
356
+ for i in range(blocks):
357
+ box = (
358
+ (i % (target_width // image_size)) * image_size,
359
+ (i // (target_width // image_size)) * image_size,
360
+ ((i % (target_width // image_size)) + 1) * image_size,
361
+ ((i // (target_width // image_size)) + 1) * image_size,
362
+ )
363
+ # split the image
364
+ split_img = resized_img.crop(box)
365
+ processed_images.append(split_img)
366
+
367
+ ##########################################################################################
368
+ ################ Add tiles for the last scale using dynamic aspect ratio #################
369
+ ##########################################################################################
370
+
371
+ # calculate the existing image aspect ratio
372
+ target_ratios = {
373
+ (i, j)
374
+ for n in range(min_num, max_num + 1)
375
+ for i in range(1, n + 1)
376
+ for j in range(1, n + 1)
377
+ if i * j <= max_num and i * j >= min_num
378
+ }
379
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
380
+
381
+ # find the closest aspect ratio to the target
382
+ target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
383
+
384
+ # calculate the target width and height
385
+ target_width = image_size * target_aspect_ratio[0]
386
+ target_height = image_size * target_aspect_ratio[1]
387
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
388
+
389
+ # resize the image
390
+ resized_img = image.resize((target_width, target_height))
391
+ for i in range(blocks):
392
+ box = (
393
+ (i % (target_width // image_size)) * image_size,
394
+ (i // (target_width // image_size)) * image_size,
395
+ ((i % (target_width // image_size)) + 1) * image_size,
396
+ ((i // (target_width // image_size)) + 1) * image_size,
397
+ )
398
+ # split the image
399
+ split_img = resized_img.crop(box)
400
+ processed_images.append(split_img)
401
+
402
+ return processed_images, (target_aspect_ratio[1], target_aspect_ratio[0])
403
+
404
+
405
+ def dynamic_process_images_and_prompt(images, prompt, data_args, image_folder=None, max_tiles=None):
406
+ prompt = prompt.split(DEFAULT_IMAGE_TOKEN)
407
+ idx = 0
408
+ all_images = []
409
+ for img in images:
410
+ processed_images = process_image(img, data_args, image_folder, enable_dynamic_res=True, max_tiles=max_tiles)
411
+ all_images.append(processed_images)
412
+ prompt.insert(idx + 1, f"{DEFAULT_IMAGE_TOKEN}\n" * processed_images.shape[0])
413
+ idx += 2
414
+ prompt = "".join(prompt)
415
+ if all_images:
416
+ all_images = torch.cat(all_images)
417
+ else:
418
+ all_images = None
419
+ prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, "")
420
+ return all_images, prompt
421
+
422
+
423
+ def dynamic_s2_process_images_and_prompt(images, prompt, data_args, image_folder=None):
424
+ idx = 0
425
+ all_images = []
426
+ all_block_size = []
427
+ for img in images:
428
+ processed_images, block_size = process_image(img, data_args, image_folder, enable_dynamic_s2=True)
429
+ all_images.append(processed_images)
430
+ all_block_size.append(block_size)
431
+ idx += 2
432
+ if all_images:
433
+ all_images = torch.cat(all_images)
434
+ else:
435
+ all_images = None
436
+ return all_images, all_block_size
437
+
438
+
439
+ def process_image(
440
+ image_file, data_args, image_folder, enable_dynamic_res=False, enable_dynamic_s2=False, max_tiles=None
441
+ ):
442
+ processor = data_args.image_processor
443
+ if isinstance(image_file, str):
444
+ if image_folder is not None:
445
+ image = Image.open(os.path.join(image_folder, image_file)).convert("RGB")
446
+ else:
447
+ image = Image.open(image_file).convert("RGB")
448
+ else:
449
+ # image is stored in bytearray
450
+ image = image_file
451
+ image = image.convert("RGB")
452
+ if hasattr(data_args.image_processor, "crop_size"):
453
+ # CLIP vision tower
454
+ crop_size = data_args.image_processor.crop_size
455
+ else:
456
+ # SIGLIP vision tower
457
+ assert hasattr(data_args.image_processor, "size")
458
+ crop_size = data_args.image_processor.size
459
+ if "dynamic_s2" in data_args.image_aspect_ratio and enable_dynamic_s2:
460
+ assert crop_size["height"] == crop_size["width"]
461
+ images, block_size = dynamic_s2_preprocess(
462
+ image, s2_scales=data_args.s2_scales, max_num=data_args.max_tiles, image_size=crop_size["height"]
463
+ )
464
+ images = [processor.preprocess(image, return_tensors="pt")["pixel_values"][0] for image in images]
465
+ return torch.stack(images), block_size
466
+ if "dynamic" in data_args.image_aspect_ratio and enable_dynamic_res:
467
+ assert crop_size["height"] == crop_size["width"]
468
+ if max_tiles is not None:
469
+ max_num = max_tiles
470
+ else:
471
+ max_num = data_args.max_tiles
472
+ images = dynamic_preprocess(image, min_num=data_args.min_tiles, max_num=max_num, image_size=crop_size["height"])
473
+ images = [processor.preprocess(image, return_tensors="pt")["pixel_values"][0] for image in images]
474
+ return torch.stack(images)
475
+
476
+ if data_args.image_aspect_ratio == "resize":
477
+ image = image.resize((crop_size["width"], crop_size["height"]))
478
+ if data_args.image_aspect_ratio == "pad":
479
+
480
+ def expand2square(pil_img, background_color):
481
+ width, height = pil_img.size
482
+ if width == height:
483
+ return pil_img
484
+ elif width > height:
485
+ result = Image.new(pil_img.mode, (width, width), background_color)
486
+ result.paste(pil_img, (0, (width - height) // 2))
487
+ return result
488
+ else:
489
+ result = Image.new(pil_img.mode, (height, height), background_color)
490
+ result.paste(pil_img, ((height - width) // 2, 0))
491
+ return result
492
+
493
+ image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
494
+ image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
495
+ else:
496
+ # Using default behavior of the vision encoder
497
+ # For CLIP, default is central crop
498
+ # For Radio, default is central crop
499
+ # For Siglip, default is resize
500
+ # For InternVIT, default is resize
501
+ image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
502
+ return image
503
+
504
+
505
+ def process_images(images, image_processor, model_cfg, enable_dynamic_res=False, max_tiles=None):
506
+ model_cfg.image_processor = image_processor
507
+ new_images = [
508
+ process_image(image, model_cfg, None, enable_dynamic_res=enable_dynamic_res, max_tiles=max_tiles)
509
+ for image in images
510
+ ]
511
+
512
+ if all(x.shape == new_images[0].shape for x in new_images):
513
+ if len(new_images[0].shape) == 4:
514
+ new_images = torch.cat(new_images, dim=0)
515
+ elif len(new_images[0].shape) == 3:
516
+ new_images = torch.stack(new_images, dim=0)
517
+ else:
518
+ raise ValueError(f"new_images rank does not equal to 4, rank: {len(new_images[0].shape)}")
519
+ else:
520
+ raise ValueError("The shape of images in new_images is different!")
521
+ return new_images
522
+
523
+
524
+ def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
525
+ return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
526
+
527
+
528
+ def is_gemma_tokenizer(tokenizer):
529
+ return "gemma" in tokenizer.__class__.__name__.lower()
530
+
531
+
532
+ def get_model_name_from_path(model_path):
533
+ model_path = model_path.strip("/")
534
+ model_paths = model_path.split("/")
535
+ if model_paths[-1].startswith("checkpoint-"):
536
+ return model_paths[-2] + "_" + model_paths[-1]
537
+ else:
538
+ return model_paths[-1]
539
+
540
+
541
+ class KeywordsStoppingCriteria(StoppingCriteria):
542
+ def __init__(self, keywords, tokenizer, input_ids):
543
+ self.keywords = keywords
544
+ self.keyword_ids = []
545
+ self.max_keyword_len = 0
546
+ for keyword in keywords:
547
+ cur_keyword_ids = tokenizer(keyword).input_ids
548
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
549
+ cur_keyword_ids = cur_keyword_ids[1:]
550
+ if len(cur_keyword_ids) > self.max_keyword_len:
551
+ self.max_keyword_len = len(cur_keyword_ids)
552
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
553
+ self.tokenizer = tokenizer
554
+ self.start_len = input_ids.shape[1]
555
+
556
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
557
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
558
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
559
+ for keyword_id in self.keyword_ids:
560
+ if (output_ids[0, -keyword_id.shape[0] :] == keyword_id).all():
561
+ return True
562
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
563
+ for keyword in self.keywords:
564
+ if keyword in outputs:
565
+ return True
566
+ return False
567
+
568
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
569
+ outputs = []
570
+ for i in range(output_ids.shape[0]):
571
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
572
+ return all(outputs)
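For orientation, a self-contained toy use of `KeywordsStoppingCriteria` with `generate()` (the tiny public GPT-2 checkpoint and the `"###"` keyword are purely illustrative; in this repository the criteria would be built from the VILA tokenizer and prompt):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
lm = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

input_ids = tok("Hello", return_tensors="pt").input_ids
criteria = KeywordsStoppingCriteria(["###"], tok, input_ids)

out = lm.generate(
    input_ids,
    max_new_tokens=16,
    stopping_criteria=StoppingCriteriaList([criteria]),
)
print(tok.decode(out[0]))
```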
model_utils_packing.py ADDED
@@ -0,0 +1,35 @@
+ from importlib import import_module
+ from typing import Tuple
+
+ import torch
+ import transformers
+ from torch import nn
+ from torch.nn import functional as F
+
+ __all__ = ["patch"]
+
+
+ def _get_unpad_data(attention_mask: torch.Tensor, *args, **kwargs) -> Tuple[torch.Tensor, torch.Tensor, int]:
+     if hasattr(_get_unpad_data, "seqlens_in_batch"):
+         seqlens_in_batch = _get_unpad_data.seqlens_in_batch
+     else:
+         seqlens_in_batch = torch.sum(attention_mask, dim=1)
+
+     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+     max_seqlen_in_batch = seqlens_in_batch.max().item()
+     cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+     return indices, cu_seqlens, max_seqlen_in_batch
+
+
+ def set_seqlens_in_batch(seqlens_in_batch: torch.Tensor) -> None:
+     _get_unpad_data.seqlens_in_batch = seqlens_in_batch
+
+
+ def patch(model: nn.Module) -> None:
+     if transformers.__version__ < "4.43.0":
+         m = import_module(model.__module__)
+         if not hasattr(m, "_get_unpad_data"):
+             raise ValueError(f"Module {m} does not have function '_get_unpad_data' for packing")
+         m._get_unpad_data = _get_unpad_data
+     else:
+         transformers.modeling_flash_attention_utils._get_unpad_data = _get_unpad_data
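A sketch of how this packing patch is meant to be wired up (the `model` object and the sequence lengths below are placeholders; `patch` reroutes transformers' `_get_unpad_data` so that attention unpadding follows the per-sequence lengths registered via `set_seqlens_in_batch`):

```python
import torch

# `model` is assumed to be an already-loaded, flash-attention-enabled Hugging Face model.
patch(model)

# Register the lengths of the sequences packed into the current batch before the forward pass.
set_seqlens_in_batch(torch.tensor([128, 256, 64], dtype=torch.int32))
```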
modeling_vila.py ADDED
@@ -0,0 +1,1228 @@
1
+ import copy
2
+ import json
3
+ import logging
4
+ import math
5
+ import os
6
+ import os.path
7
+ import os.path as osp
8
+ import shutil
9
+ import warnings
10
+ from abc import ABC
11
+ from collections import OrderedDict, defaultdict, deque
12
+ from copy import deepcopy
13
+ from itertools import chain
14
+ from threading import Thread
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ import torchvision
22
+ from einops import rearrange
23
+ from PIL import Image
24
+ from transformers import (
25
+ AutoConfig,
26
+ AutoModel,
27
+ AutoProcessor,
28
+ AutoTokenizer,
29
+ GenerationConfig,
30
+ LogitsProcessor,
31
+ PretrainedConfig,
32
+ PreTrainedModel,
33
+ Qwen2Config,
34
+ Qwen2ForCausalLM,
35
+ Qwen2PreTrainedModel,
36
+ TextIteratorStreamer,
37
+ )
38
+ from transformers.modeling_outputs import CausalLMOutputWithPast
39
+ from transformers.modeling_utils import ContextManagers, no_init_weights
40
+
41
+ from .auto_processor import VILAProcessor
42
+ from .base_projector import MultimodalProjector, MultimodalProjectorConfig
43
+ from .builder import build_llm_and_tokenizer
44
+ from .configuration_vila import VILAConfig
45
+ from .constants import *
46
+ from .conversation import SeparatorStyle, default_conversation
47
+ from .distributed import all_gather as vila_all_gather
48
+ from .loss import soft_cross_entropy
49
+ from .media import extract_media
50
+ from .media_encoder import BasicImageEncoder, BasicVideoEncoder
51
+ from .mm_utils import process_image, process_images
52
+ from .model_utils_packing import set_seqlens_in_batch
53
+ from .siglip_encoder import SiglipVisionTower, SiglipVisionTowerDynamicS2, SiglipVisionTowerS2
54
+ from .tokenizer_utils import tokenize_conversation
55
+ from .utils import get_model_config, load_tokenizer_then_handle_media_tokens_and_chat_template
56
+
57
+ # from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, NUM_EXTRA_TOKENS
58
+
59
+ # ease debugging
60
+ python_input = input
61
+
62
+ # quick hack for remote code
63
+ def get_pg_manager():
64
+ return None
65
+
66
+
67
+ def get_model_weights_dtype(model: nn.Module):
68
+ pass
69
+
70
+
71
+ def build_mm_projector(model_type_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
72
+ if model_type_or_path is None:
73
+ return None
74
+ ## load from pretrained model
75
+ if config.resume_path:
76
+ assert os.path.exists(model_type_or_path), f"Resume mm projector path {model_type_or_path} does not exist!"
77
+ return MultimodalProjector.from_pretrained(model_type_or_path, config)
78
+ ## build from scratch
79
+ else:
80
+ mm_projector_cfg = MultimodalProjectorConfig(model_type_or_path)
81
+ mm_projector = MultimodalProjector(mm_projector_cfg, config)
82
+ return mm_projector
83
+
84
+
85
+ def check_dot_in_model_path(model_path: str):
86
+ """Check if the model path contains dot, which will affect the remote code loading."""
87
+ if osp.isdir(model_path): # local model
88
+ if "." in osp.abspath(model_path):
89
+ return True
90
+ else: # remote model
91
+ if "." in model_path:
92
+ return True
93
+ return False
94
+
95
+
96
+ def get_vila_version(model_path: str) -> str:
97
+ VERSIONS = ["vila1.5", "vila-u", "longvila", "nvila", "vila-m3"]
98
+ for version in VERSIONS:
99
+ if version in model_path.lower():
100
+ return version
101
+ return None
102
+
103
+
104
+ def generate_jinja_template(conv_mode: str) -> str:
105
+ if conv_mode == "vicuna_v1":
106
+ return """{% set system_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. " %}
107
+ {% set roles = ["user", "assistant"] %}
108
+ {% set sep = " " %}
109
+
110
+ {{ system_prompt }}
111
+
112
+ {% for message in messages %}
113
+ {% if message['role'] == roles[0] %}
114
+ {{ "USER: " }}{{ sep }}{{ message['content'] }}{{ sep }}
115
+ {% else %}
116
+ {{ "ASSISTANT: " }}{{ sep }}{{ message['content'] }}{{ sep }}
117
+ {% endif %}
118
+ {% endfor %}
119
+ {% if messages[-1]['role'] == 'user' %}
120
+ {{ "ASSISTANT:" }}
121
+ {% endif %}
122
+ """
123
+ elif conv_mode == "llama_3":
124
+ return """{% set system_prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|>" %}
125
+ {% set roles = ["<|start_header_id|>user<|end_header_id|>\\n\\n", "<|start_header_id|>assistant<|end_header_id|>\\n\\n"]%}
126
+ {% set sep = "<|eot_id|>" %}
127
+
128
+ {{ system_prompt }}
129
+ {% for message in messages %}
130
+ {% if message['role'] == 'user' %}
131
+ {{ roles[0] }}{{ message['content'] }}{{ sep }}
132
+ {% else %}
133
+ {{ roles[1] }}{{ message['content'] }}{{ sep }}
134
+ {% endif %}
135
+ {% endfor %}
136
+ {% if messages[-1]['role'] == 'user' %}
137
+ {{ roles[1] }}
138
+ {% endif %}
139
+ """
140
+ elif conv_mode == "hermes_2":
141
+ return """{% set system_prompt = "<|im_start|>system\nAnswer the questions." %}
142
+ {% set roles = ["<|im_start|>user\n", "<|im_start|>assistant\n"] %}
143
+ {% set sep = "<|im_end|>" %}
144
+
145
+ {{ system_prompt }}{{ sep }}
146
+
147
+ {% for message in messages %}
148
+ {% if message['role'] == 'user' %}
149
+ {{ roles[0] }}{{ message['content'] }}{{ sep }}
150
+ {% else %}
151
+ {{ roles[1] }}{{ message['content'] }}{{ sep }}
152
+ {% endif %}
153
+ {% endfor %}"""
154
+ else:
155
+ raise NotImplementedError(f"Jinja template generation is not implemented for {conv_mode}.")
156
+
157
+
158
+ def build_vision_tower(model_name_or_path: str, config: PretrainedConfig) -> PreTrainedModel:
159
+ ## skip vision tower instantiation
160
+ if model_name_or_path is None:
161
+ return None
162
+
163
+ vision_tower_arch = None
164
+ if config.resume_path and "radio" not in model_name_or_path:
165
+ assert os.path.exists(model_name_or_path), f"Resume vision tower path {model_name_or_path} does not exist!"
166
+ vision_tower_cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
167
+ vision_tower_arch = vision_tower_cfg.architectures[0].lower()
168
+ vision_tower_name = vision_tower_arch if vision_tower_arch is not None else model_name_or_path
169
+
170
+ use_s2 = getattr(config, "s2", False)
171
+ use_dynamic_s2 = getattr(config, "dynamic_s2", False)
172
+
173
+ if "siglip" in vision_tower_name:
174
+ if use_dynamic_s2:
175
+ vision_tower = SiglipVisionTowerDynamicS2(model_name_or_path, config)
176
+ elif use_s2:
177
+ vision_tower = SiglipVisionTowerS2(model_name_or_path, config)
178
+ else:
179
+ vision_tower = SiglipVisionTower(model_name_or_path, config)
180
+ else:
181
+ raise NotImplementedError(f"Unknown vision tower: {model_name_or_path}")
182
+
183
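+ ## S2 / dynamic-S2 towers concatenate features from several scales, so use the tower's own hidden_size (per-scale size x number of scales) instead of the backbone config value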
+ config.mm_hidden_size = (
184
+ vision_tower.config.hidden_size if not (use_s2 or use_dynamic_s2) else vision_tower.hidden_size
185
+ )
186
+ return vision_tower
187
+
188
+
189
+ class VILAPretrainedModel(PreTrainedModel):
190
+ config_class = VILAConfig
191
+ main_input_name = "input_embeds"
192
+ supports_gradient_checkpointing = True
193
+ _supports_flash_attn_2 = True
194
+
195
+ def __init__(self, config: VILAConfig, *args, **kwargs):
196
+ super().__init__(config)
197
+ self.config = config
198
+ cfgs = get_model_config(config)
199
+ if len(cfgs) == 3:
200
+ llm_cfg, vision_tower_cfg, mm_projector_cfg = cfgs
201
+ else:
202
+ raise ValueError("`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config.")
203
+
204
+ # loading on cpu by default
205
+ device_map = kwargs.get("device_map", "cpu")
206
+ self.mm_projector = build_mm_projector(mm_projector_cfg, config)
207
+ self.vision_tower = build_vision_tower(vision_tower_cfg, config)
208
+ if "auto" in device_map or "cuda" in device_map:
209
+ self.mm_projector = self.mm_projector.cuda()
210
+ self.vision_tower = self.vision_tower.cuda()
211
+ # setting device_map="auto" automatically shards the LLM across available devices
212
+ self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
213
+
214
+ # NOTE(ligeng): need to add other decoders from config
215
+ self.encoders = {"image": BasicImageEncoder(self), "video": BasicVideoEncoder(self)}
216
+
217
+ self.post_config()
218
+ self.is_loaded = True
219
+
220
+ assert (
221
+ self.llm is not None or self.vision_tower is not None or self.mm_projector is not None
222
+ ), "At least one of the components must be instantiated."
223
+
224
+ @classmethod
225
+ def convert_vila_dev_ckpt_to_remote(
226
+ self,
227
+ model_path: str,
228
+ output_dir: str = None,
229
+ vila_version: str | None = None,
230
+ conv_mode: str | None = None,
231
+ copy: bool = False,
232
+ copy_weights: bool = True,
233
+ copy_code: bool = True,
234
+ *model_args,
235
+ **kwargs,
236
+ ):
237
+ # assert type(self) == VILAForCasualLM, "This method is only available for VILAForCasualLM."
238
+ assert model_path != output_dir, "model_path and output_dir cannot be the same"
239
+ if not os.path.isdir(model_path):
240
+ from huggingface_hub import snapshot_download
243
+
244
+ model_path = snapshot_download(model_path)
245
+ print("downloading HF model to", model_path)
246
+
247
+ if check_dot_in_model_path(model_path) and output_dir is None:
248
+ raise ValueError(
249
+ f"Model path {model_path} contains a dot, which will affect the remote code loading. Please specify the output directory without dot in the path to fix this issue."
250
+ )
251
+ if output_dir is not None and "." in output_dir:
252
+ raise ValueError(
253
+ f"Output directory {output_dir} contains a dot, which will affect the remote code loading. Please specify a valid output directory without dots."
254
+ )
255
+
256
+ if copy:
257
+ print("copy is set to True, copying weights and code to output_dir")
258
+ copy_weights = copy_code = True
259
+ # copy weights and code to output_dir
260
+ self.copy_or_symlink_directory(model_path, output_dir, copy=copy_weights)
261
+ self.copy_remote_py_files(output_dir, copy=copy_code)
262
+
263
+ if vila_version is None:
264
+ vila_version = get_vila_version(output_dir)
265
+
266
+ cfg_path = os.path.join(output_dir, "config.json")
267
+ config = json.load(open(cfg_path))
268
+ config["version"] = "2.0" # nvila tag
269
+ config["architectures"] = ["VILAForCasualLM"]
270
+ config["auto_map"] = {
271
+ "AutoProcessor": "auto_processor.VILAProcessor",
272
+ "AutoConfig": "modeling_vila.VILAConfig",
273
+ "AutoModel": "modeling_vila.VILAForCasualLM",
274
+ "AutoModelForCausalLM": "modeling_vila.VILAForCasualLM",
275
+ }
276
+ # vila1.5 legacy support
277
+ config["model_type"] = "vila"
278
+ if vila_version in ["vila1.5", "vila-m3"]:
279
+ if conv_mode is None:
280
+ raise ValueError(f"Please specify the conversation mode for {output_dir}.")
281
+ config["chat_template"] = conv_mode
282
+ jinja_template = generate_jinja_template(conv_mode)
283
+ jinja_path = os.path.join(output_dir, f"{conv_mode}.jinja")
284
+ with open(jinja_path, "w") as f:
285
+ f.write(jinja_template)
286
+ json.dump(config, open(cfg_path, "w"), indent=2)
287
+
288
+ ##########################################################################################
289
+ config = AutoConfig.from_pretrained(output_dir, trust_remote_code=True)
290
+ tokenizer = load_tokenizer_then_handle_media_tokens_and_chat_template(output_dir, config)
291
+ tokenizer.save_pretrained(osp.join(output_dir, "llm"))
292
+ ##########################################################################################
293
+
294
+ @classmethod
295
+ def copy_or_symlink_directory(cls, model_path, output_dir, copy=True):
296
+ # Create output directory if it doesn't exist
297
+ os.makedirs(output_dir, exist_ok=True)
298
+ # Copy or symlink every file in model_path into output_dir
299
+ for item in os.listdir(model_path):
300
+ src_path = os.path.join(model_path, item)
301
+ dst_path = os.path.join(output_dir, item)
302
+
303
+ # Remove existing file/directory at destination if it exists
304
+ if os.path.exists(dst_path):
305
+ if os.path.islink(dst_path):
306
+ os.unlink(dst_path)
307
+ elif os.path.isdir(dst_path):
308
+ shutil.rmtree(dst_path)
309
+ else:
310
+ os.remove(dst_path)
311
+
312
+ # Copy or symlink depending on the `copy` flag
313
+ if copy:
314
+ if os.path.isdir(src_path):
315
+ shutil.copytree(src_path, dst_path)
316
+ else:
317
+ shutil.copy2(src_path, dst_path)
318
+ print(f"Copied {src_path} to {dst_path}")
319
+ else:
320
+ os.symlink(src_path, dst_path)
321
+ print(f"Created symlink from {src_path} to {dst_path}")
322
+
323
+ @classmethod
324
+ def copy_remote_py_files(cls, output_dir, copy=True):
325
+ ## copy .py files and README so the remote code can be loaded next time
326
+ current_file_path = os.path.abspath(__file__)
327
+ current_folder = os.path.dirname(current_file_path)
328
+ for file_name in os.listdir(current_folder):
329
+ if file_name == "INSTRUCTIONS.md":
330
+ src_fname = os.path.join(current_folder, file_name)
331
+ dst_fname = os.path.join(output_dir, "README.md")
332
+ if os.path.exists(dst_fname):
333
+ old_readme = open(dst_fname).read()
334
+ else:
335
+ old_readme = ""
336
+ with open(src_fname) as src, open(dst_fname, "w") as dst:
337
+ dst.write(src.read())
338
+ dst.write(old_readme)
339
+ print("[HF remote code] README", src_fname, "to", dst_fname)
340
+ if file_name.endswith(".py") or file_name.endswith(".jinja"):
341
+ full_file_name = os.path.join(current_folder, file_name)
342
+ if os.path.isfile(full_file_name):
343
+ if copy:
344
+ shutil.copy(full_file_name, output_dir)
345
+ print("[HF remote code] copying", full_file_name, "to", output_dir)
346
+ else:
347
+ # symlink to ease development
348
+ if os.path.exists(os.path.join(output_dir, file_name)):
349
+ os.remove(os.path.join(output_dir, file_name))
350
+ os.symlink(full_file_name, os.path.join(output_dir, file_name))
351
+ print("[HF remote code] linking", full_file_name, "to", output_dir)
352
+
353
+ def save_pretrained(self, output_dir, state_dict=None, **kwargs):
354
+ if state_dict is None:
355
+ # otherwise fetch from deepspeed
356
+ # state_dict = accelerator.get_state_dict(is_deepspeed_enabled)
357
+ state_dict = self.state_dict()
358
+
359
+ if getattr(self, "tokenizer", None):
360
+ self.tokenizer.save_pretrained(osp.join(output_dir, "llm"))
361
+
362
+ if self.get_llm():
363
+ print(f"saving llm to {osp.join(output_dir, 'llm')}")
364
+ self.llm.config._name_or_path = osp.join(output_dir, "llm")
365
+ llm_state_dict = OrderedDict({k.split("llm.")[-1]: v for k, v in state_dict.items() if "llm" in k})
366
+ self.llm.save_pretrained(os.path.join(output_dir, "llm"), state_dict=llm_state_dict)
367
+ self.config.llm_cfg = self.llm.config
368
+
369
+ if self.get_vision_tower():
370
+ print(f"saving vision_tower to {osp.join(output_dir, 'vision_tower')}")
371
+ self.vision_tower.config._name_or_path = osp.join(output_dir, "vision_tower")
372
+ vision_tower_state_dict = OrderedDict(
373
+ {k.split("vision_tower.vision_tower.")[-1]: v for k, v in state_dict.items() if "vision_tower" in k}
374
+ )
375
+ self.vision_tower.vision_tower.save_pretrained(
376
+ os.path.join(output_dir, "vision_tower"),
377
+ state_dict=vision_tower_state_dict,
378
+ )
379
+ self.vision_tower.image_processor.save_pretrained(os.path.join(output_dir, "vision_tower"))
380
+ self.config.vision_tower_cfg = self.vision_tower.config
381
+ if hasattr(self.config.vision_tower_cfg, "auto_map"):
382
+ if "radio" not in self.get_vision_tower().__class__.__name__.lower():
383
+ delattr(self.config.vision_tower_cfg, "auto_map")
384
+
385
+ if self.get_mm_projector():
386
+ print(f"saving mm_projector to {osp.join(output_dir, 'mm_projector')}")
387
+ self.mm_projector.config._name_or_path = osp.join(output_dir, "mm_projector")
388
+ mm_projector_state_dict = OrderedDict(
389
+ {k.split("mm_projector.")[-1]: v for k, v in state_dict.items() if "mm_projector" in k}
390
+ )
391
+ self.mm_projector.save_pretrained(
392
+ os.path.join(output_dir, "mm_projector"),
393
+ state_dict=mm_projector_state_dict,
394
+ )
395
+ self.config.mm_projector_cfg = self.mm_projector.config
396
+
397
+ ## update and save top-level config
398
+ self.config._name_or_path = output_dir
399
+ self.config.architectures = [self.__class__.__name__]
400
+ self.config.save_pretrained(output_dir)
401
+
402
+ ## copy .py files and README so the remote code can be loaded next time
403
+ self.copy_remote_py_files(output_dir)
404
+
405
+ @classmethod
406
+ def from_pretrained(
407
+ cls,
408
+ pretrained_model_name_or_path: Optional[str] = None,
409
+ *model_args,
410
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
411
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
412
+ ignore_mismatched_sizes: bool = False,
413
+ force_download: bool = False,
414
+ local_files_only: bool = False,
415
+ token: Optional[Union[str, bool]] = None,
416
+ revision: str = "main",
417
+ use_safetensors: Optional[bool] = None,
418
+ weights_only: bool = True,
419
+ **kwargs,
420
+ ):
421
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
422
+ return cls._from_config(config, **kwargs)
423
+
424
+ def init_llm(self, llm_config, config, *args, **kwargs):
425
+ self.llm, self.tokenizer = build_llm_and_tokenizer(llm_config, config, *args, **kwargs)
426
+ # hard coded for NVILA
427
+ # variables for XGrammar
428
+ # print("DEBUG", len(self.tokenizer.added_tokens_encoder.keys()), self.tokenizer.added_tokens_encoder.keys())
429
+ NUM_EXTRA_TOKENS = len(self.tokenizer.added_tokens_encoder.keys())
430
+
431
+ # TODO: SENTINEL_TOKEN is not added, need to check with Zhijian
432
+ self.vocab_size = self.tokenizer.vocab_size + NUM_EXTRA_TOKENS
433
+ # XGrammar tokenizer and grammar compiler
434
+ # lazily initialized only when JSON output is requested at inference time
435
+ self.grammar_compiler = None
436
+ self.llm.resize_token_embeddings(len(self.tokenizer))
437
+ return self.llm, self.tokenizer
438
+
439
+ def post_config(self):
440
+ ######################################################################
441
+ # TODO: need to check dtype with jason
442
+ self.llm = self.llm.to(torch.float16)
443
+ self.mm_projector = self.mm_projector.to(torch.float16)
444
+ self.vision_tower = self.vision_tower.to(torch.float16)
445
+ ######################################################################
446
+ self.training = self.llm.training
447
+ ## configuration
448
+ if getattr(self.config, "llm_cfg", None) is None:
449
+ self.config.llm_cfg = self.llm.config
450
+ if getattr(self.config, "vision_tower_cfg", None) is None:
451
+ self.config.vision_tower_cfg = self.vision_tower.config
452
+ if getattr(self.config, "mm_projector_cfg", None) is None:
453
+ self.config.mm_projector_cfg = self.mm_projector.config
454
+
455
+ def get_llm(self):
456
+ llm = getattr(self, "llm", None)
457
+ if type(llm) is list:
458
+ llm = llm[0]
459
+ return llm
460
+
461
+ def get_lm_head(self):
462
+ lm_head = getattr(self.get_llm(), "lm_head", None)
463
+ return lm_head
464
+
465
+ def get_vision_tower(self):
466
+ vision_tower = getattr(self, "vision_tower", None)
467
+ if type(vision_tower) is list:
468
+ vision_tower = vision_tower[0]
469
+ return vision_tower
470
+
471
+ def get_mm_projector(self):
472
+ mm_projector = getattr(self, "mm_projector", None)
473
+ if type(mm_projector) is list:
474
+ mm_projector = mm_projector[0]
475
+ return mm_projector
476
+
477
+ def freezed_module_patch(self):
478
+ """
479
+ Hugging Face calls model.train() at every training step. To keep the expected behavior of modules such as dropout and batchnorm, we call model.eval() on the frozen modules.
480
+ """
481
+ if self.training:
482
+ if self.get_llm() and not getattr(self.config, "tune_language_model", False):
483
+ pass
484
+ # logging.warning("Caution: Your LLM is currently in training mode, ensuring accurate gradient computation. Please be vigilant, particularly regarding BatchNorm and Dropout operations.")
485
+ if self.get_vision_tower() and not getattr(self.config, "tune_vision_tower", False):
486
+ self.get_vision_tower().eval()
487
+ if self.get_mm_projector() and not getattr(self.config, "tune_mm_projector", False):
488
+ self.get_mm_projector().eval()
489
+
490
+
491
+ class VILAForCasualLM(VILAPretrainedModel):
492
+ def __init__(self, config: VILAConfig, *args, **kwargs):
493
+ super().__init__(config, *args, **kwargs)
494
+
495
+ def merge_features_for_dynamic_s2(self, image_features, block_sizes):
496
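+ # Reassemble per-tile features into one multi-scale map per image: tiles of each scale are merged back into a grid, every scale is resized ("area" interpolation) to a common spatial size, and the scales are concatenated along the channel dimension.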
+ scales = self.get_vision_tower().scales
497
+ resize_output_to_scale_idx = self.get_vision_tower().resize_output_to_scale_idx
498
+
499
+ image_features_each_image = []
500
+ new_block_sizes = []
501
+ block_cnt = 0
502
+ for block_size_each_image in block_sizes:
503
+ if block_size_each_image is None:
504
+ cur_features = image_features[block_cnt : block_cnt + 1]
505
+ cur_features = rearrange(cur_features, "1 (h w) c -> 1 c h w", h=int(cur_features.shape[1] ** 0.5))
506
+ cur_features = cur_features.repeat(1, len(scales), 1, 1)
507
+ image_features_each_image.append(cur_features)
508
+ new_block_sizes.append((1, 1))
509
+ block_cnt += 1
510
+ else:
511
+ cur_features_each_scale = []
512
+ for scale in scales[:-1]:
513
+ num_blocks_this_scale = (scale // scales[0]) ** 2
514
+ cur_features_each_scale.append(
515
+ self.merge_chessboard(
516
+ image_features[block_cnt : block_cnt + num_blocks_this_scale],
517
+ num_split_h=scale // scales[0],
518
+ num_split_w=scale // scales[0],
519
+ )
520
+ ) # 1 * C * H * W
521
+ block_cnt += num_blocks_this_scale
522
+ num_blocks_last_scale = block_size_each_image[0] * block_size_each_image[1]
523
+ cur_features_each_scale.append(
524
+ self.merge_chessboard(
525
+ image_features[block_cnt : block_cnt + num_blocks_last_scale],
526
+ num_split_h=block_size_each_image[0],
527
+ num_split_w=block_size_each_image[1],
528
+ )
529
+ ) # 1 * C * H * W
530
+ block_cnt += num_blocks_last_scale
531
+
532
+ # resize and concat features from different scales
533
+ output_size = cur_features_each_scale[resize_output_to_scale_idx].shape[-2:]
534
+ cur_features = torch.cat(
535
+ [
536
+ F.interpolate(cur_features_each_scale[i].to(torch.float32), size=output_size, mode="area").to(
537
+ cur_features_each_scale[i].dtype
538
+ )
539
+ for i in range(len(cur_features_each_scale))
540
+ ],
541
+ dim=1,
542
+ )
543
+ # cur_features = rearrange(cur_features, "1 c h w -> (h w) c")
544
+
545
+ image_features_each_image.append(cur_features)
546
+
547
+ if resize_output_to_scale_idx == len(scales) - 1 or resize_output_to_scale_idx == -1:
548
+ new_block_sizes.append(block_size_each_image)
549
+ else:
550
+ new_block_sizes.append(
551
+ (
552
+ scales[resize_output_to_scale_idx] // scales[0],
553
+ scales[resize_output_to_scale_idx] // scales[0],
554
+ )
555
+ )
556
+
557
+ assert block_cnt == len(image_features)
558
+
559
+ return image_features_each_image, new_block_sizes
560
+
561
+ def encode_images(self, images, block_sizes: Optional[Optional[Tuple[int, ...]]] = None):
562
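+ # block_sizes gives the (rows, cols) tile layout of each image for dynamic S2; a None entry means the image was encoded as a single tile.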
+ if block_sizes is None:
563
+ block_sizes = [None] * len(images)
564
+ if getattr(self.config, "dynamic_s2", False):
565
+ image_features = self.get_vision_tower()(images)
566
+ image_features, new_block_sizes = self.merge_features_for_dynamic_s2(image_features, block_sizes)
567
+
568
+ image_features = [
569
+ self.split_chessboard(x, block_size[0], block_size[1])
570
+ for x, block_size in zip(image_features, new_block_sizes)
571
+ ] # list of B * C * H * W tensors
572
+ image_features = torch.cat(
573
+ [rearrange(x, "b c h w -> b (h w) c") for x in image_features], dim=0
574
+ ) # B * N * C
575
+ image_features = self.get_mm_projector()(image_features)
576
+ image_features = list(
577
+ image_features.split([block_size[0] * block_size[1] for block_size in new_block_sizes], dim=0)
578
+ )
579
+ image_features = [
580
+ self.merge_chessboard(x, block_size[0], block_size[1])
581
+ for x, block_size in zip(image_features, new_block_sizes)
582
+ ] # list of 1 * C * H * W tensors
583
+ image_features = [rearrange(x, "1 c h w -> (h w) c") for x in image_features] # list of N * C tensors
584
+ if all([feature.shape[0] == image_features[0].shape[0] for feature in image_features]):
585
+ image_features = torch.stack(image_features, dim=0)
586
+ else:
587
+ image_features = self.get_vision_tower()(images)
588
+ image_features = self.get_mm_projector()(image_features)
589
+ return image_features
590
+
591
+ def train(self, mode: bool = True):
592
+ if mode:
593
+ self.tokenizer.padding_side = "right"
594
+ else:
595
+ self.tokenizer.padding_side = "left"
596
+ super().train(mode)
597
+ return self
598
+
599
+ def _embed(
600
+ self,
601
+ input_ids: torch.Tensor,
602
+ media: Dict[str, List[torch.Tensor]],
603
+ media_config: Dict[str, Dict[str, Any]],
604
+ labels: Optional[torch.Tensor],
605
+ attention_mask: Optional[torch.Tensor],
606
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
607
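+ # Fuse text and media embeddings: media placeholder tokens in input_ids are replaced by the corresponding encoder outputs, PAD/EOS positions are dropped, and the result is truncated and re-padded into a batch.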
+ # NOTE(ligeng): deep copy to avoid modifying the original media and media_config
608
+ media = copy.deepcopy(media)
609
+ media_config = copy.deepcopy(media_config)
610
+
611
+ labels = labels if labels is not None else torch.full_like(input_ids, IGNORE_INDEX)
612
+ attention_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids, dtype=torch.bool)
613
+
614
+ PROCESS_GROUP_MANAGER = get_pg_manager()
615
+ if PROCESS_GROUP_MANAGER is not None:
616
+ for name in media:
617
+ self.encoders[name].end_tokens = None
618
+
619
+ # Extract text and media embeddings
620
+ text_embeds = self.llm.model.embed_tokens(input_ids)
621
+ if media is not None:
622
+ media_embeds = self.__embed_media_tokens(media, media_config)
623
+ else:
624
+ # no media was provided, so fall back to an empty dict
625
+ media_embeds = {}
626
+
627
+ # This is a workaround to make sure the dummy embeddings are consumed
628
+ while media_embeds.get("dummy"):
629
+ dummy_embed = media_embeds["dummy"].popleft()
630
+ text_embeds += torch.sum(dummy_embed) * 0
631
+
632
+ # Remove padding
633
+ batch_size = labels.shape[0]
634
+ text_embeds = [text_embeds[k][attention_mask[k]] for k in range(batch_size)]
635
+ labels = [labels[k][attention_mask[k]] for k in range(batch_size)]
636
+
637
+ # Build inverse mapping from token ID to media name
638
+ media_tokens = {}
639
+ for name, token_id in self.tokenizer.media_token_ids.items():
640
+ media_tokens[token_id] = name
641
+
642
+ # Fuse text and media embeddings
643
+ inputs_m, labels_m = [], []
644
+ for k in range(batch_size):
645
+ inputs_mk, labels_mk = [], []
646
+ pos = 0
647
+ while pos < len(labels[k]):
648
+ if input_ids[k][pos].item() in media_tokens:
649
+ end = pos + 1
650
+ name = media_tokens[input_ids[k][pos].item()]
651
+ input = media_embeds[name].popleft()
652
+ label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
653
+ # print(f"{self.tokenizer.padding_side} [media] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:pos+1])}"); python_input()
654
+ elif input_ids[k][pos].item() in (self.tokenizer.pad_token_id, self.tokenizer.eos_token_id):
655
+ end = pos + 1
656
+ pos = end
657
+ # print(f"[skip PAD/EOS] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
658
+ continue
659
+ else:
660
+ end = pos
661
+ while end < len(labels[k]) and input_ids[k][end].item() not in media_tokens:
662
+ end += 1
663
+ input = text_embeds[k][pos:end]
664
+ label = labels[k][pos:end]
665
+ # print(f"[text] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
666
+
667
+ inputs_mk.append(input)
668
+ labels_mk.append(label)
669
+ pos = end
670
+ inputs_m.append(torch.cat(inputs_mk, dim=0))
671
+ labels_m.append(torch.cat(labels_mk, dim=0))
672
+ inputs, labels = inputs_m, labels_m
673
+
674
+ # Check if all media embeddings are consumed
675
+ for name in media_embeds:
676
+ if media_embeds[name]:
677
+ raise ValueError(f"Not all {name} embeddings are consumed! Still {len(media_embeds[name])} left.")
678
+
679
+ # Truncate sequences to `model_max_length` as media embeddings are inserted
680
+ inputs, labels = self.__truncate_sequence(inputs, labels)
681
+
682
+ # Pad sequences to the longest one in the batch
683
+ return self.__batchify_sequence(inputs, labels)
684
+
685
+ def __embed_media_tokens(
686
+ self,
687
+ media: Dict[str, List[torch.Tensor]],
688
+ media_config: Dict[str, Dict[str, Any]],
689
+ ) -> Dict[str, List[torch.Tensor]]:
690
+ embeds = defaultdict(deque)
691
+ for name in media:
692
+ if self.training:
693
+ # Gather metainfo of media objects from all ranks
694
+ info = [{"shape": tensor.shape, "dtype": tensor.dtype} for tensor in media.get(name, [])]
695
+ infos = list(chain(vila_all_gather(info)))
696
+
697
+ # The entire batch does not contain any media objects of this type.
698
+ if not infos:
699
+ continue
700
+
701
+ # Create a dummy tensor to ensure the encoder is called, otherwise the training will hang.
702
+ if media.get(name) is None or len(media[name]) == 0:
703
+ dummy = torch.zeros(infos[0]["shape"], dtype=infos[0]["dtype"], device=self.device)
704
+ embeds["dummy"].extend(self.encoders[name]([dummy], media_config[name]))
705
+ continue
706
+ embeds[name] = deque(self.encoders[name](media[name], media_config[name]))
707
+ return embeds
708
+
709
+ def __truncate_sequence(
710
+ self, inputs: List[torch.Tensor], labels: List[torch.Tensor]
711
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
712
+ if self.training and any(len(input) > self.tokenizer.model_max_length for input in inputs):
713
+ warnings.warn(f"Truncating sequences to `model_max_length` ({self.tokenizer.model_max_length}).")
714
+ inputs = [input[: self.tokenizer.model_max_length] for input in inputs]
715
+ labels = [label[: self.tokenizer.model_max_length] for label in labels]
716
+ return inputs, labels
717
+
718
+ def __batchify_sequence(
719
+ self, inputs: List[torch.Tensor], labels: List[torch.Tensor]
720
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
721
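+ # Pad the variable-length fused sequences to the longest one in the batch, honoring the tokenizer's padding side, and build the matching attention mask.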
+ batch_size = len(inputs)
722
+ device = inputs[0].device
723
+ hidden_size = inputs[0].shape[1]
724
+ max_length = max(inputs[k].shape[0] for k in range(batch_size))
725
+ attention_mask = torch.ones((batch_size, max_length), dtype=torch.bool, device=device)
726
+
727
+ inputs_p, labels_p = [], []
728
+ for k in range(batch_size):
729
+ size_pk = max_length - inputs[k].shape[0]
730
+ inputs_pk = torch.zeros((size_pk, hidden_size), dtype=inputs[k].dtype, device=device)
731
+ labels_pk = torch.full((size_pk,), IGNORE_INDEX, dtype=labels[k].dtype, device=device)
732
+ if self.tokenizer.padding_side == "right":
733
+ attention_mask[k, inputs[k].shape[0] :] = False
734
+ inputs_pk = torch.cat([inputs[k], inputs_pk], dim=0)
735
+ labels_pk = torch.cat([labels[k], labels_pk], dim=0)
736
+ else:
737
+ attention_mask[k, : -inputs[k].shape[0]] = False
738
+ inputs_pk = torch.cat([inputs_pk, inputs[k]], dim=0)
739
+ labels_pk = torch.cat([labels_pk, labels[k]], dim=0)
740
+ inputs_p.append(inputs_pk)
741
+ labels_p.append(labels_pk)
742
+
743
+ inputs = torch.stack(inputs_p, dim=0)
744
+ labels = torch.stack(labels_p, dim=0)
745
+ return inputs, labels, attention_mask
746
+
747
+ def repack_multimodal_data(self, inputs_embeds, attention_mask, position_ids, labels):
748
+ # Handle sequence parallelism
749
+ PROCESS_GROUP_MANAGER = get_pg_manager()
750
+
751
+ # We do re-sharding instead of packing here to ensure the sequence length is the same across all ranks.
752
+ if PROCESS_GROUP_MANAGER is not None:
753
+ sp_degree = PROCESS_GROUP_MANAGER.sp_degree
754
+ sp_rank = PROCESS_GROUP_MANAGER.sp_rank
755
+ sp_group = PROCESS_GROUP_MANAGER.sp_pg
756
+ ring_degree = PROCESS_GROUP_MANAGER.ring_degree
757
+ ring_rank = PROCESS_GROUP_MANAGER.ring_rank
758
+ ring_type = PROCESS_GROUP_MANAGER.ring_type
759
+ ulysses_degree = PROCESS_GROUP_MANAGER.ulysses_degree
760
+ ulysses_rank = PROCESS_GROUP_MANAGER.ulysses_rank
761
+
762
+ bs, shard_seqlen = position_ids.shape
763
+ sp_seq_len = [torch.zeros(1, dtype=torch.int64, device=position_ids.device) for _ in range(sp_degree)]
764
+ dist.all_gather(sp_seq_len, torch.tensor(shard_seqlen, device=position_ids.device), group=sp_group)
765
+ sp_seq_len_cat = torch.cat(sp_seq_len, dim=0)
766
+
767
+ if sp_rank == 0:
768
+ original_start_id = 0
769
+ else:
770
+ original_start_id = torch.sum(sp_seq_len_cat[:sp_rank]).item()
771
+ original_end_id = torch.sum(sp_seq_len_cat[: sp_rank + 1]).item()
772
+
773
+ # Gather attention_mask, position_ids, labels and input_embeds
774
+ all_inputs_embeds = torch.zeros(
775
+ bs,
776
+ torch.sum(sp_seq_len_cat),
777
+ inputs_embeds.shape[-1],
778
+ dtype=inputs_embeds.dtype,
779
+ device=inputs_embeds.device,
780
+ ).contiguous()
781
+ all_inputs_embeds[:, original_start_id:original_end_id, :] += inputs_embeds
782
+ dist.barrier(group=sp_group)
783
+ dist.all_reduce(all_inputs_embeds, group=sp_group)
784
+ dist.barrier(group=sp_group)
785
+
786
+ attention_mask_list = [
787
+ torch.zeros((bs, sp_seq_len[i]), dtype=attention_mask.dtype, device=attention_mask.device)
788
+ for i in range(sp_degree)
789
+ ]
790
+ position_ids_list = [
791
+ torch.zeros((bs, sp_seq_len[i]), dtype=position_ids.dtype, device=position_ids.device)
792
+ for i in range(sp_degree)
793
+ ]
794
+ labels_list = [
795
+ torch.zeros((bs, sp_seq_len[i]), dtype=labels.dtype, device=labels.device) for i in range(sp_degree)
796
+ ]
797
+
798
+ dist.all_gather(attention_mask_list, attention_mask, group=sp_group)
799
+ dist.all_gather(position_ids_list, position_ids, group=sp_group)
800
+ dist.all_gather(labels_list, labels, group=sp_group)
801
+
802
+ effective_seqlen_list = [attention_mask_list[i].sum(dim=-1) for i in range(sp_degree)]
803
+ effective_seqlen = torch.stack(effective_seqlen_list, dim=-1)
804
+ effective_seqlen_batch_list = torch.unbind(effective_seqlen, dim=0)
805
+
806
+ global_attention_mask_list = []
807
+ global_position_ids_list = []
808
+ global_labels_list = []
809
+ global_inputs_embeds_list = []
810
+ for i in range(bs):
811
+ global_attention_mask_batch_list = []
812
+ global_position_ids_batch_list = []
813
+ global_labels_batch_list = []
814
+ global_inputs_embeds_batch_list = []
815
+ for j in range(sp_degree):
816
+ eff_len = effective_seqlen_batch_list[i][j]
817
+ prev_len = torch.sum(sp_seq_len_cat[:j]).item() if j > 0 else 0
818
+
819
+ global_attention_mask_batch_list.append(attention_mask_list[j][i, :eff_len])
820
+ global_position_ids_batch_list.append(position_ids_list[j][i, :eff_len])
821
+ global_labels_batch_list.append(labels_list[j][i, :eff_len])
822
+ global_inputs_embeds_batch_list.append(all_inputs_embeds[i, prev_len : prev_len + eff_len, :])
823
+ global_attention_mask_list.append(torch.cat(global_attention_mask_batch_list, dim=0))
824
+ global_position_ids_list.append(torch.cat(global_position_ids_batch_list, dim=0))
825
+ global_labels_list.append(torch.cat(global_labels_batch_list, dim=0))
826
+ global_inputs_embeds_list.append(torch.cat(global_inputs_embeds_batch_list, dim=0))
827
+
828
+ global_attention_mask = torch.nn.utils.rnn.pad_sequence(
829
+ global_attention_mask_list, batch_first=True, padding_value=False
830
+ )
831
+ global_position_ids = torch.nn.utils.rnn.pad_sequence(
832
+ global_position_ids_list, batch_first=True, padding_value=-1
833
+ )
834
+ global_labels = torch.nn.utils.rnn.pad_sequence(
835
+ global_labels_list, batch_first=True, padding_value=IGNORE_INDEX
836
+ )
837
+ global_inputs_embeds = torch.nn.utils.rnn.pad_sequence(
838
+ global_inputs_embeds_list, batch_first=True, padding_value=0
839
+ )
840
+
841
+ # Re-shard the inputs
842
+ if ring_degree > 1:
843
+ total_effective_seqlen = torch.sum(effective_seqlen, dim=1)
844
+ new_seqlen_per_rank = total_effective_seqlen // sp_degree
845
+ assert torch.all(
846
+ total_effective_seqlen % sp_degree == 0
847
+ ), "total_effective_seqlen must be divisible by sp_degree"
848
+
849
+ max_new_seqlen = torch.max(new_seqlen_per_rank).item()
850
+
851
+ new_attention_mask = torch.zeros(
852
+ (bs, max_new_seqlen), dtype=global_attention_mask.dtype, device=global_attention_mask.device
853
+ )
854
+ new_position_ids = torch.zeros(
855
+ (bs, max_new_seqlen), dtype=global_position_ids.dtype, device=global_position_ids.device
856
+ )
857
+ new_labels = torch.full(
858
+ (bs, max_new_seqlen), IGNORE_INDEX, dtype=global_labels.dtype, device=global_labels.device
859
+ )
860
+ new_inputs_embeds = torch.zeros(
861
+ (bs, max_new_seqlen, global_inputs_embeds.shape[-1]),
862
+ dtype=global_inputs_embeds.dtype,
863
+ device=global_inputs_embeds.device,
864
+ )
865
+
866
+ if ring_type == "ring_varlen":
867
+ for i in range(bs):
868
+ start_idx = new_seqlen_per_rank[i] * sp_rank
869
+ end_idx = start_idx + new_seqlen_per_rank[i]
870
+ new_attention_mask[i, : new_seqlen_per_rank[i]] = global_attention_mask[i, start_idx:end_idx]
871
+ new_position_ids[i, : new_seqlen_per_rank[i]] = global_position_ids[i, start_idx:end_idx]
872
+ new_labels[i, : new_seqlen_per_rank[i]] = global_labels[i, start_idx:end_idx]
873
+ new_inputs_embeds[i, : new_seqlen_per_rank[i], :] = global_inputs_embeds[
874
+ i, start_idx:end_idx, :
875
+ ]
876
+ elif ring_type == "zigzag_ring_varlen":
877
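+ # Zigzag sharding: each rank takes one chunk from the front half of the sequence and the mirrored chunk from the back half, which roughly balances the causal-attention workload across ranks.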
+ chunk_size = total_effective_seqlen // (2 * sp_degree)
878
+ for i in range(bs):
879
+ # Zigzag pattern indices
880
+ if sp_degree == ring_degree:
881
+ forward_rank_idx = sp_rank
882
+ backward_rank_idx = 2 * sp_degree - sp_rank - 1
883
+ else:
884
+ ulysses_offset = ulysses_rank * ring_degree * 2
885
+ forward_rank_idx = ring_rank + ulysses_offset
886
+ backward_rank_idx = sp_degree - ring_rank - 1 + ulysses_offset
887
+
888
+ # Calculate start and end indices for the forward and backward zigzag
889
+ start_idx_fwd = forward_rank_idx * chunk_size[i]
890
+ end_idx_fwd = start_idx_fwd + chunk_size[i]
891
+
892
+ start_idx_bwd = backward_rank_idx * chunk_size[i]
893
+ end_idx_bwd = start_idx_bwd + chunk_size[i]
894
+
895
+ # Fill new tensors with zigzag data
896
+ new_attention_mask[i, : chunk_size[i]] = global_attention_mask[i, start_idx_fwd:end_idx_fwd]
897
+ new_attention_mask[i, chunk_size[i] : 2 * chunk_size[i]] = global_attention_mask[
898
+ i, start_idx_bwd:end_idx_bwd
899
+ ]
900
+
901
+ new_position_ids[i, : chunk_size[i]] = global_position_ids[i, start_idx_fwd:end_idx_fwd]
902
+ new_position_ids[i, chunk_size[i] : 2 * chunk_size[i]] = global_position_ids[
903
+ i, start_idx_bwd:end_idx_bwd
904
+ ]
905
+
906
+ new_labels[i, : chunk_size[i]] = global_labels[i, start_idx_fwd:end_idx_fwd]
907
+ new_labels[i, chunk_size[i] : 2 * chunk_size[i]] = global_labels[i, start_idx_bwd:end_idx_bwd]
908
+
909
+ new_inputs_embeds[i, : chunk_size[i], :] = global_inputs_embeds[i, start_idx_fwd:end_idx_fwd, :]
910
+ new_inputs_embeds[i, chunk_size[i] : 2 * chunk_size[i], :] = global_inputs_embeds[
911
+ i, start_idx_bwd:end_idx_bwd, :
912
+ ]
913
+ else:
914
+ raise ValueError(f"Invalid ring_type: {ring_type}")
915
+ else:
916
+ global_seq_len = global_attention_mask.shape[-1]
917
+ seq_len_sharded = global_seq_len // sp_degree
918
+ start_idx_reshard = seq_len_sharded * sp_rank
919
+ end_idx_reshard = start_idx_reshard + seq_len_sharded if sp_rank < sp_degree - 1 else global_seq_len
920
+
921
+ new_attention_mask = torch.narrow(
922
+ global_attention_mask, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
923
+ )
924
+ new_position_ids = torch.narrow(
925
+ global_position_ids, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
926
+ )
927
+ new_labels = torch.narrow(global_labels, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard)
928
+ new_inputs_embeds = torch.narrow(
929
+ global_inputs_embeds, 1, start_idx_reshard, end_idx_reshard - start_idx_reshard
930
+ )
931
+
932
+ return new_inputs_embeds, new_attention_mask, new_position_ids, new_labels
933
+
934
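+ # No sequence parallelism: pack every sequence in the batch into a single batch-size-1 sequence to avoid padding waste; per-sequence position_ids restart at 0, and the boundaries are passed via set_seqlens_in_batch in forward().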
+ device = inputs_embeds.device
935
+ batch_size = inputs_embeds.shape[0]
936
+ seqlens = [attention_mask[k].sum().item() for k in range(batch_size)]
937
+
938
+ # Pack all sequences together
939
+ inputs_embeds_p = [inputs_embeds[k][attention_mask[k]] for k in range(batch_size)]
940
+ attention_mask_p = [torch.ones(seqlens[k], dtype=torch.int, device=device) for k in range(batch_size)]
941
+ position_ids_p = [torch.arange(seqlens[k], dtype=torch.int, device=device) for k in range(batch_size)]
942
+ labels_p = [labels[k][attention_mask[k]] for k in range(batch_size)]
943
+
944
+ # Add one dummy token at the end of the packed sequence to ensure that `_get_unpacked_data` will be called
945
+ inputs_embeds_p.append(torch.zeros(1, inputs_embeds.shape[-1], dtype=inputs_embeds.dtype, device=device))
946
+ attention_mask_p.append(torch.tensor([0], dtype=torch.int, device=device))
947
+ position_ids_p.append(torch.tensor([0], dtype=torch.int, device=device))
948
+ labels_p.append(torch.tensor([IGNORE_INDEX], dtype=torch.int, device=device))
949
+
950
+ # Mask the first token of each sequence to avoid contamination
951
+ for label in labels_p:
952
+ label[0] = IGNORE_INDEX
953
+
954
+ # Batch the data
955
+ inputs_embeds_p = torch.cat(inputs_embeds_p, dim=0).unsqueeze(0)
956
+ attention_mask_p = torch.cat(attention_mask_p, dim=0).unsqueeze(0)
957
+ position_ids_p = torch.cat(position_ids_p, dim=0).unsqueeze(0)
958
+ labels_p = torch.cat(labels_p, dim=0).unsqueeze(0)
959
+
960
+ if hasattr(
961
+ self, "pad_to_multiple_of"
962
+ ): # related to quantization, please refer to ModelArguments for more information.
963
+ assert len(labels_p.shape) == 2
964
+ batch_size, max_length, cur_length = labels_p.shape[0], labels_p.shape[1], labels_p.shape[1]
965
+ hidden_size = inputs_embeds_p.shape[-1]
966
+
967
+ if max_length % self.pad_to_multiple_of != 0:
968
+ max_length = ((max_length // self.pad_to_multiple_of) + 1) * self.pad_to_multiple_of
969
+ difference = max_length - cur_length
970
+
971
+ inputs_embeds_p = torch.cat(
972
+ (
973
+ inputs_embeds_p,
974
+ torch.full((batch_size, difference, hidden_size), self.llm.pad_token_id).to(inputs_embeds_p),
975
+ ),
976
+ dim=1,
977
+ )
978
+ labels_p = torch.cat((labels_p, torch.full((batch_size, difference), IGNORE_INDEX).to(labels_p)), dim=1)
979
+ attention_mask_p = torch.cat(
980
+ (
981
+ attention_mask_p,
982
+ torch.zeros((batch_size, difference), dtype=torch.bool).to(attention_mask_p),
983
+ ),
984
+ dim=1,
985
+ )
986
+ position_ids_p = torch.cat(
987
+ (position_ids_p, torch.full((batch_size, difference), -1).to(position_ids_p)), dim=1
988
+ )
989
+
990
+ return inputs_embeds_p, attention_mask_p, position_ids_p, labels_p
991
+
992
+ def get_xgr_logits_processor(self, response_format) -> List[LogitsProcessor]:
993
+ raise NotImplementedError("This method is not implemented for VILA model.")
994
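+ # NOTE: the code below is unreachable because of the raise above; it is kept as a reference implementation of XGrammar-constrained decoding.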
+ # Convert response format to logits processor
995
+ import xgrammar as xgr
996
+
997
+ logging.info("[XGrammar] Compiling grammar for contrained output")
998
+
999
+ if self.grammar_compiler is None:
1000
+ # logging.info(f"[XGrammar] {self.tokenizer}, {self.tokenizer.vocab_size}, {self.vocab_size}")
1001
+ self.grammar_compiler = xgr.GrammarCompiler(
1002
+ xgr.TokenizerInfo.from_huggingface(self.tokenizer, vocab_size=self.vocab_size)
1003
+ )
1004
+
1005
+ if response_format.type == "json_schema":
1006
+ compiled_grammar = self.grammar_compiler.compile_json_schema(
1007
+ response_format.json_schema.schema_,
1008
+ indent=2,
1009
+ )
1010
+ else:
1011
+ compiled_grammar = self.grammar_compiler.compile_builtin_json_grammar()
1012
+
1013
+ return [xgr.contrib.hf.LogitsProcessor(compiled_grammar)]
1014
+
1015
+ def forward(
1016
+ self,
1017
+ input_ids: torch.LongTensor = None,
1018
+ media: Optional[Dict[str, List[torch.Tensor]]] = None,
1019
+ images: Optional[torch.FloatTensor] = None,
1020
+ media_config: Optional[List] = None,
1021
+ attention_mask: Optional[torch.Tensor] = None,
1022
+ position_ids: Optional[torch.LongTensor] = None,
1023
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1024
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1025
+ labels: Optional[torch.LongTensor] = None,
1026
+ packing: bool = True,
1027
+ force_packing: bool = False,
1028
+ seqlens_in_batch: Optional[torch.LongTensor] = None,
1029
+ dpo_forward: bool = False,
1030
+ **kwargs,
1031
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1032
+ self.freezed_module_patch()
1033
+
1034
+ if images is not None:
1035
+ if media is not None:
1036
+ raise ValueError("Both 'media' and 'images' are provided. Please provide only one.")
1037
+ print("The 'images' argument is deprecated. Please use 'media' instead.")
1038
+ media = {"image": images}
1039
+
1040
+ if media_config is None:
1041
+ media_config = defaultdict(dict)
1042
+
1043
+ if inputs_embeds is None:
1044
+ inputs_embeds, labels, attention_mask = self._embed(input_ids, media, media_config, labels, attention_mask)
1045
+
1046
+ if force_packing or (packing and self.training and not dpo_forward):
1047
+ if seqlens_in_batch is None:
1048
+ seqlens_in_batch = torch.sum(attention_mask, dim=1)
1049
+ set_seqlens_in_batch(seqlens_in_batch)
1050
+
1051
+ (inputs_embeds, attention_mask, position_ids, labels) = self.repack_multimodal_data(
1052
+ inputs_embeds, attention_mask, position_ids, labels
1053
+ )
1054
+
1055
+ outputs = self.llm(
1056
+ inputs_embeds=inputs_embeds,
1057
+ attention_mask=attention_mask,
1058
+ position_ids=position_ids,
1059
+ past_key_values=past_key_values,
1060
+ labels=labels,
1061
+ **kwargs,
1062
+ )
1063
+
1064
+ if self.training and getattr(self.config, "time_token_ids", []):
1065
+ outputs.loss = soft_cross_entropy(
1066
+ outputs.logits,
1067
+ labels,
1068
+ soft_tokens=self.config.time_token_ids,
1069
+ std=self.config.soft_ce_std,
1070
+ )
1071
+
1072
+ if dpo_forward:
1073
+ return outputs.logits, labels
1074
+
1075
+ return outputs
1076
+
1077
+ @torch.inference_mode()
1078
+ def generate(
1079
+ self,
1080
+ input_ids: Optional[torch.FloatTensor] = None,
1081
+ media: Optional[Dict[str, List[torch.Tensor]]] = None,
1082
+ media_config: Dict[str, Dict[str, Any]] = None,
1083
+ attention_mask: Optional[torch.LongTensor] = None,
1084
+ **generation_kwargs,
1085
+ ):
1086
+ if self.training:
1087
+ warnings.warn(
1088
+ "Model is in training mode, using default padding strategy to right. This is not recommended for generation."
1089
+ )
1090
+ inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
1091
+ return self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
1092
+
1093
+ @torch.inference_mode()
1094
+ def generate_content(
1095
+ self,
1096
+ prompt: Union[str, List],
1097
+ generation_config: Optional[GenerationConfig] = None,
1098
+ response_format=None,
1099
+ ) -> str:
1100
+ # TODO(zhijianl): Support directly taking conversation as input
1101
+ conversation = [{"from": "human", "value": prompt}]
1102
+
1103
+ # Convert response format to logits processor
1104
+ if response_format:
1105
+ xgr_logits_processor = self.get_xgr_logits_processor(response_format)
1106
+ else:
1107
+ xgr_logits_processor = None
1108
+
1109
+ # Extract media from the conversation
1110
+
1111
+ # TODO (extract and preprocess should be done together, as the preprocess of image and video can be different, i.e. when dynamic res is used)
1112
+ media = extract_media(conversation, self.config)
1113
+
1114
+ # Process media
1115
+ media_config = defaultdict(dict)
1116
+ for name in media:
1117
+ if name == "image":
1118
+ if len(media["image"]) == 1 and self.config.image_aspect_ratio in ["dynamic", "dynamic_s2"]:
1119
+ self.config.image_processor = self.vision_tower.image_processor
1120
+ if self.config.image_aspect_ratio == "dynamic":
1121
+ images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
1122
+ conversation[0]["value"] = conversation[0]["value"].replace(
1123
+ DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
1124
+ )
1125
+ else:
1126
+ if type(self.config.s2_scales) is str:
1127
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
1128
+ images, block_sizes = process_image(
1129
+ media["image"][0], self.config, None, enable_dynamic_s2=True
1130
+ )
1131
+ images = images.half()
1132
+ media_config[name]["block_sizes"] = [block_sizes]
1133
+ else:
1134
+ images = process_images(media["image"], self.vision_tower.image_processor, self.config).half()
1135
+ media[name] = [image for image in images]
1136
+ elif name == "video":
1137
+ if self.config.image_aspect_ratio == "dynamic" and self.config.video_max_tiles > 1:
1138
+ media[name] = [
1139
+ process_images(
1140
+ images,
1141
+ self.vision_tower.image_processor,
1142
+ self.config,
1143
+ enable_dynamic_res=True,
1144
+ max_tiles=self.config.video_max_tiles,
1145
+ ).half()
1146
+ for images in media[name]
1147
+ ]
1148
+ elif self.config.image_aspect_ratio == "dynamic_s2" and self.config.video_max_tiles > 1:
1149
+ self.config.image_processor = self.vision_tower.image_processor
1150
+ if type(self.config.s2_scales) is str:
1151
+ self.config.s2_scales = list(map(int, self.config.s2_scales.split(",")))
1152
+ media[name] = [
1153
+ torch.cat(
1154
+ [
1155
+ process_image(
1156
+ image,
1157
+ self.config,
1158
+ None,
1159
+ enable_dynamic_s2=True,
1160
+ max_tiles=self.config.video_max_tiles,
1161
+ )[0].half()
1162
+ for image in images
1163
+ ]
1164
+ )
1165
+ for images in media[name]
1166
+ ]
1167
+ else:
1168
+ media[name] = [
1169
+ process_images(images, self.vision_tower.image_processor, self.config).half()
1170
+ for images in media[name]
1171
+ ]
1172
+ else:
1173
+ raise ValueError(f"Unsupported media type: {name}")
1174
+
1175
+ # Tokenize the conversation
1176
+ input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
1177
+
1178
+ # Set up the generation config
1179
+ generation_config = generation_config or self.default_generation_config
1180
+
1181
+ # print("input_ids", input_ids.shape)
1182
+ # print(input_ids)
1183
+ # print(self.tokenizer.batch_decode(input_ids))
1184
+ # print("media", {k: len(v) for k, v in media.items()})
1185
+ # print("media_config", media_config)
1186
+ # print("generation_config", generation_config)
1187
+ # input("wait for debug")
1188
+ # Generate the response
1189
+ try:
1190
+ output_ids = self.generate(
1191
+ input_ids=input_ids,
1192
+ media=media,
1193
+ media_config=media_config,
1194
+ generation_config=generation_config,
1195
+ logits_processor=xgr_logits_processor, # structured generation
1196
+ )
1197
+ except ValueError:
1198
+ if not generation_config.do_sample:
1199
+ raise
1200
+ # FIXME(zhijianl): This is a temporary workaround for the sampling issue
1201
+ logging.warning("Generation failed with sampling, retrying with greedy decoding.")
1202
+ generation_config.do_sample = False
1203
+ output_ids = self.generate(
1204
+ input_ids=input_ids,
1205
+ media=media,
1206
+ media_config=media_config,
1207
+ generation_config=generation_config,
1208
+ logits_processor=xgr_logits_processor,
1209
+ )
1210
+
1211
+ # Decode the response
1212
+ response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
1213
+ return response
1214
+
1215
+ @property
1216
+ def default_generation_config(self) -> GenerationConfig:
1217
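+ # Fill in missing generation defaults (max_length and PAD/BOS/EOS token ids) from the tokenizer.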
+ generation_config = copy.deepcopy(self.generation_config or GenerationConfig())
1218
+ if self.tokenizer.eos_token_id is None:
1219
+ raise ValueError("Tokenizer must have an EOS token")
1220
+ if generation_config.max_length == GenerationConfig().max_length:
1221
+ generation_config.max_length = self.tokenizer.model_max_length
1222
+ if generation_config.pad_token_id is None:
1223
+ generation_config.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
1224
+ if generation_config.bos_token_id is None:
1225
+ generation_config.bos_token_id = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
1226
+ if generation_config.eos_token_id is None:
1227
+ generation_config.eos_token_id = self.tokenizer.eos_token_id
1228
+ return generation_config
qwen2_jp.jinja ADDED
@@ -0,0 +1,11 @@
1
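+ {# Qwen2 ChatML-style chat template with a Japanese default system prompt, prepended only when the conversation does not start with a system message. #}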
+ {% if messages[0]['role'] != 'system' %}
2
+ {{ '<|im_start|>system\n以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。<|im_end|>\n' }}
3
+ {% endif %}
4
+
5
+ {% for message in messages if message['content'] is not none %}
6
+ {{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
7
+ {% endfor %}
8
+
9
+ {% if add_generation_prompt %}
10
+ {{ '<|im_start|>assistant\n' }}
11
+ {% endif %}
siglip_encoder.py ADDED
@@ -0,0 +1,288 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ from accelerate.hooks import add_hook_to_module
21
+ from einops import rearrange
22
+ from s2wrapper import forward as multiscale_forward
23
+ from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, SiglipImageProcessor
24
+ from transformers.image_processing_utils import BaseImageProcessor
25
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
26
+ from transformers.models.siglip import SiglipVisionModel
27
+
28
+
29
+ class VisionTower(nn.Module):
30
+ def __init__(self, vision_tower, args, delay_load=False):
31
+ super().__init__()
32
+
33
+ self.is_loaded = False
34
+
35
+ self.vision_tower_name = vision_tower
36
+ self.select_layer = getattr(args, "mm_vision_select_layer", -2)
37
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
38
+
39
+ self.cfg_only = None
40
+
41
+ def feature_select(self, image_forward_outs):
42
+ image_features = image_forward_outs.hidden_states[self.select_layer]
43
+ if self.select_feature == "patch":
44
+ image_features = image_features[:, 1:]
45
+ elif self.select_feature == "cls_patch":
46
+ image_features = image_features
47
+ else:
48
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
49
+ return image_features
50
+
51
+ def _maybe_resize_pos_embeds(
52
+ self,
53
+ model: PreTrainedModel,
54
+ image_processor: BaseImageProcessor,
55
+ resolution: int = -1,
56
+ interpolate_mode: str = "linear",
57
+ ):
58
+ if resolution in [model.config.image_size, -1]:
59
+ return
60
+ print(
61
+ f"Resizing vision model's position embeddings to support higher vision resolution: from {model.config.image_size} to {resolution} ..."
62
+ )
63
+ embeddings = model.vision_model.embeddings
64
+ patch_size = embeddings.patch_size
65
+ num_new_tokens = int((resolution // patch_size) ** 2)
66
+
67
+ old_embeddings = embeddings.position_embedding
68
+ match interpolate_mode:
69
+ case "linear":
70
+ ## Step 1: Calculate the corresponding patch ID (pid) in the current resolution (M patches) based on the target resolution (N patches). Formula: pid = pid / N * M
71
+ ## Step 2: Obtain new embeddings by interpolating between the embeddings of the two nearest calculated patch IDs. Formula: new_embeds = (pid - floor(pid)) * embeds[ceil(pid)] + (ceil(pid) - pid) * embeds[floor(pid)]
72
+ import torch
73
+ import torch.nn as nn
74
+
75
+ if is_deepspeed_zero3_enabled():
76
+ try:
77
+ import deepspeed
78
+ except ImportError:
79
+ raise ImportError("DeepSpeed is not installed. Please install it with `pip install deepspeed`.")
80
+ with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None):
81
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
82
+ else:
83
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
84
+ new_embeddings = nn.Embedding(
85
+ num_new_tokens,
86
+ old_embedding_dim,
87
+ dtype=old_embeddings.weight.dtype,
88
+ device=old_embeddings.weight.device,
89
+ )
90
+ mapped_indices = (
91
+ torch.arange(num_new_tokens).to(old_embeddings.weight.device)
92
+ / (num_new_tokens - 1)
93
+ * (old_num_tokens - 1)
94
+ )
95
+ floor_indices = torch.clamp(mapped_indices.floor().long(), min=0, max=old_num_tokens - 1)
96
+ ceil_indices = torch.clamp(mapped_indices.ceil().long(), min=0, max=old_num_tokens - 1)
97
+ if is_deepspeed_zero3_enabled():
98
+ params = [old_embeddings.weight, new_embeddings.weight]
99
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
100
+ interpolated_embeds = (mapped_indices - floor_indices)[:, None] * old_embeddings.weight.data[
101
+ ceil_indices, :
102
+ ] + (ceil_indices - mapped_indices)[:, None] * old_embeddings.weight.data[floor_indices, :]
103
+ else:
104
+ interpolated_embeds = (mapped_indices - floor_indices)[:, None] * old_embeddings.weight.data[
105
+ ceil_indices, :
106
+ ] + (ceil_indices - mapped_indices)[:, None] * old_embeddings.weight.data[floor_indices, :]
107
+ new_embeddings.weight.data = interpolated_embeds
108
+ case _:
109
+ raise NotImplementedError
110
+
111
+ if hasattr(old_embeddings, "_hf_hook"):
112
+ hook = old_embeddings._hf_hook
113
+ add_hook_to_module(new_embeddings, hook)
114
+ new_embeddings.requires_grad_(old_embeddings.weight.requires_grad)
115
+ ## update vision encoder's configurations
116
+ model.config.image_size = resolution
117
+ if hasattr(image_processor, "crop_size"):
118
+ # CLIP vision tower
119
+ image_processor.crop_size = resolution
120
+ else:
121
+ # SIGLIP vision tower
122
+ assert hasattr(image_processor, "size")
123
+ image_processor.size = {"height": resolution, "width": resolution}
124
+ ## TODO define a '_reinitialize' method for VisionTower
125
+ embeddings.position_embedding = new_embeddings
126
+ embeddings.image_size = resolution
127
+ embeddings.num_patches = embeddings.num_positions = num_new_tokens
128
+ embeddings.position_ids = (
129
+ torch.arange(embeddings.num_positions).expand((1, -1)).to(old_embeddings.weight.device)
130
+ )
131
+
132
+ def forward(self, images):
133
+ if type(images) is list:
134
+ image_features = []
135
+ for image in images:
136
+ image_forward_out = self.vision_tower(
137
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
138
+ output_hidden_states=True,
139
+ )
140
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
141
+ image_features.append(image_feature)
142
+ else:
143
+ image_forward_outs = self.vision_tower(
144
+ images.to(device=self.device, dtype=self.dtype),
145
+ output_hidden_states=True,
146
+ )
147
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
148
+
149
+ return image_features
150
+
151
+ @property
152
+ def dummy_feature(self):
153
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
154
+
155
+ @property
156
+ def dtype(self):
157
+ return self.vision_tower.dtype
158
+
159
+ @property
160
+ def device(self):
161
+ return self.vision_tower.device
162
+
163
+ @property
164
+ def config(self):
165
+ if self.is_loaded:
166
+ return self.vision_tower.config
167
+ else:
168
+ return self.cfg_only
169
+
170
+ @property
171
+ def hidden_size(self):
172
+ return self.config.hidden_size
173
+
174
+ @property
175
+ def num_patches(self):
176
+ return (self.config.image_size // self.config.patch_size) ** 2
177
+
178
+
179
+ class VisionTowerS2(VisionTower):
180
+ def __init__(self, vision_tower, args, delay_load=False):
181
+ super().__init__(vision_tower, args, delay_load)
182
+
183
+ self.scales = list(map(int, args.s2_scales.split(",")))
184
+ self.scales.sort()
185
+ self.max_split_size = args.s2_max_split_size
186
+ self.resize_output_to_scale_idx = getattr(args, "s2_resize_output_to_scale_idx", 0)
187
+
188
+ def forward_feature(self, images):
189
+ image_forward_outs = self.vision_tower(
190
+ images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
191
+ )
192
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
193
+ return image_features
194
+
195
+ def forward(self, images):
196
+ if type(images) is list:
197
+ image_feature = []
198
+ for image in images:
199
+ image_feature = multiscale_forward(
200
+ self.forward_feature,
201
+ image.unsqueeze(0),
202
+ img_sizes=self.scales,
203
+ max_split_size=self.max_split_size,
204
+ resize_output_to_idx=self.resize_output_to_scale_idx,
205
+ )
206
+ image_features.append(image_feature)
207
+ else:
208
+ image_features = multiscale_forward(
209
+ self.forward_feature,
210
+ images,
211
+ img_sizes=self.scales,
212
+ max_split_size=self.max_split_size,
213
+ resize_output_to_idx=self.resize_output_to_scale_idx,
214
+ )
215
+
216
+ return image_features
217
+
218
+ @property
219
+ def hidden_size(self):
220
+ return self.config.hidden_size * len(self.scales)
221
+
222
+
223
+ class VisionTowerDynamicS2(VisionTower):
224
+ def __init__(self, vision_tower, args, delay_load=False):
225
+ super().__init__(vision_tower, args, delay_load)
226
+
227
+ self.scales = list(map(int, args.s2_scales.split(",")))
228
+ self.scales.sort()
229
+ self.max_split_size = args.s2_max_split_size
230
+ self.resize_output_to_scale_idx = getattr(args, "s2_resize_output_to_scale_idx", 0)
231
+
232
+ def forward_feature(self, images):
233
+ image_forward_outs = self.vision_tower(
234
+ images.to(device=self.device, dtype=self.dtype), output_hidden_states=True
235
+ )
236
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
237
+ return image_features
238
+
239
+ def forward(self, images):
240
+ assert type(images) is not list
241
+ image_features = self.forward_feature(images)
242
+
243
+ return image_features
244
+
245
+ @property
246
+ def hidden_size(self):
247
+ return self.config.hidden_size * len(self.scales)
248
+
249
+
250
+ class SiglipVisionTower(VisionTower):
251
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
252
+ super().__init__(model_name_or_path, config)
253
+ # TODO(ligengl): why does passing config here lead to errors?
254
+ self.vision_tower = SiglipVisionModel.from_pretrained(
255
+ model_name_or_path,
256
+ attn_implementation=config._attn_implementation,
257
+ torch_dtype=eval(config.model_dtype),
258
+ )
259
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
260
+ self.is_loaded = True
261
+
262
+
263
+ class SiglipVisionTowerS2(VisionTowerS2):
264
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
265
+ super().__init__(model_name_or_path, config)
266
+ self.vision_tower = SiglipVisionModel.from_pretrained(
267
+ model_name_or_path,
268
+ attn_implementation=config._attn_implementation,
269
+ torch_dtype=eval(config.model_dtype),
270
+ )
271
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
272
+ # Make sure it crops/resizes the image to the largest scale in self.scales to maintain high-res information
273
+ self.image_processor.size["height"] = self.image_processor.size["width"] = self.scales[-1]
274
+ self.is_loaded = True
275
+
276
+
277
+ class SiglipVisionTowerDynamicS2(VisionTowerDynamicS2):
278
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig) -> None:
279
+ super().__init__(model_name_or_path, config)
280
+ self.vision_tower = SiglipVisionModel.from_pretrained(
281
+ model_name_or_path,
282
+ attn_implementation="flash_attention_2",
283
+ torch_dtype=eval(config.model_dtype),
284
+ )
285
+ self.image_processor = SiglipImageProcessor.from_pretrained(model_name_or_path)
286
+ # Resize/crop to the smallest (base) scale in self.scales; the higher scales are handled by dynamic tiling
287
+ self.image_processor.size["height"] = self.image_processor.size["width"] = self.scales[0]
288
+ self.is_loaded = True
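
The S2 towers above report `hidden_size = config.hidden_size * len(self.scales)` because `multiscale_forward` concatenates the features extracted at each scale along the channel dimension. Below is a minimal self-contained sketch of that idea; a toy patch embedder stands in for the SigLIP tower, and `PATCH`, `HIDDEN`, and `toy_multiscale_forward` are illustrative names, not the real `s2wrapper` API.

```python
import torch
import torch.nn.functional as F

PATCH = 14    # assumed patch size (SigLIP-so400m-patch14 style)
HIDDEN = 32   # toy hidden size; the real tower uses config.hidden_size

# Toy "vision tower": a patch embedding that returns (B, num_patches, HIDDEN).
patch_embed = torch.nn.Conv2d(3, HIDDEN, kernel_size=PATCH, stride=PATCH)

def encode(images):
    f = patch_embed(images)              # (B, HIDDEN, H/PATCH, W/PATCH)
    return f.flatten(2).transpose(1, 2)  # (B, N, HIDDEN)

def toy_multiscale_forward(forward_fn, images, img_sizes):
    base_tokens = img_sizes[0] // PATCH  # token-grid side of the smallest scale
    feats = []
    for size in img_sizes:
        x = F.interpolate(images, size=(size, size), mode="bilinear", align_corners=False)
        f = forward_fn(x)                # (B, N_size, HIDDEN)
        side = size // PATCH
        f = f.transpose(1, 2).reshape(f.shape[0], HIDDEN, side, side)
        f = F.adaptive_avg_pool2d(f, base_tokens)   # pool back to the base grid
        feats.append(f.flatten(2).transpose(1, 2))  # (B, N_base, HIDDEN)
    return torch.cat(feats, dim=-1)      # (B, N_base, HIDDEN * len(img_sizes))

x = torch.randn(1, 3, 896, 896)
out = toy_multiscale_forward(encode, x, img_sizes=[448, 896])
print(out.shape)  # torch.Size([1, 1024, 64]): 32 * 2 channels over a 32x32 token grid
```

Pooling every scale back to the base token grid keeps the sequence length fixed, so only the channel width grows with the number of scales.
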
tokenizer_utils.py ADDED
@@ -0,0 +1,182 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ from typing import Any, Dict, List, Optional, Sequence
18
+
19
+ import torch
20
+ import transformers
21
+
22
+ from .constants import IGNORE_INDEX, SENTINEL_TOKEN
23
+ from .conversation import SeparatorStyle, default_conversation
24
+ from .mm_utils import tokenizer_image_token
25
+
26
+ # __all__ = [
27
+ # "tokenize_conversation",
28
+ # "preprocess_conversation",
29
+ # "infer_stop_tokens",
30
+ # ]
31
+
32
+ DUMMY_CONVERSATION = [
33
+ {"from": "human", "value": "question"},
34
+ {"from": "gpt", "value": "answer"},
35
+ ] * 10
36
+
37
+
38
+ def tokenize_conversation_legacy(
39
+ messages: Sequence[Dict[str, str]],
40
+ tokenizer: transformers.PreTrainedTokenizer,
41
+ add_generation_prompt: bool = False,
42
+ overrides: Optional[Dict[str, str]] = None,
43
+ no_system_prompt: bool = False,
44
+ ) -> torch.Tensor:
45
+ conv = default_conversation.copy()
46
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
47
+
48
+ if no_system_prompt:
49
+ conv.system = ""
50
+
51
+ # Skip the first message if it is not from human
52
+ if messages[0]["from"] != "human":
53
+ messages = messages[1:]
54
+
55
+ # Add a generation prompt if needed
56
+ if add_generation_prompt:
57
+ messages.append({"from": "gpt", "value": None})
58
+
59
+ conv.messages = []
60
+ for turn, message in enumerate(messages):
61
+ role = roles[message["from"]]
62
+ assert role == conv.roles[turn % 2]
63
+ if overrides is not None and message["from"] in overrides:
64
+ conv.append_message(role, overrides[message["from"]])
65
+ else:
66
+ conv.append_message(role, message["value"])
67
+
68
+ return tokenizer_image_token(conv.get_prompt(), tokenizer, return_tensors="pt")
69
+
70
+
71
+ def tokenize_conversation(
72
+ messages: Sequence[Dict[str, str]],
73
+ tokenizer: transformers.PreTrainedTokenizer,
74
+ add_generation_prompt: bool = False,
75
+ overrides: Optional[Dict[str, str]] = None,
76
+ no_system_prompt: bool = False,
77
+ ) -> torch.Tensor:
78
+ # Normalize the conversation before tokenization
79
+ for message in messages:
80
+ message["value"] = message["value"].strip()
81
+
82
+ if default_conversation.sep_style != SeparatorStyle.AUTO:
83
+ return tokenize_conversation_legacy(
84
+ messages,
85
+ tokenizer,
86
+ add_generation_prompt=add_generation_prompt,
87
+ overrides=overrides,
88
+ no_system_prompt=no_system_prompt,
89
+ )
90
+
91
+ conversation = []
92
+ for m in messages:
93
+ message = {}
94
+ if m["from"] == "human":
95
+ message["role"] = "user"
96
+ elif m["from"] == "gpt":
97
+ message["role"] = "assistant"
98
+ else:
99
+ raise ValueError(f"Unexpected sender '{m['from']}' in conversation entry.")
100
+
101
+ message["content"] = m["value"]
102
+ if overrides is not None and m["from"] in overrides:
103
+ message["content"] = overrides[m["from"]]
104
+ conversation.append(message)
105
+
106
+ if no_system_prompt:
107
+ conversation = [{"role": "system", "content": ""}] + conversation
108
+
109
+ text = tokenizer.apply_chat_template(
110
+ conversation,
111
+ add_generation_prompt=add_generation_prompt,
112
+ tokenize=False,
113
+ )
114
+ return tokenizer_image_token(text, tokenizer, return_tensors="pt")
115
+
116
+
117
+ def _maybe_add_sentinel_token(tokenizer: transformers.PreTrainedTokenizer) -> None:
118
+ if not hasattr(tokenizer, "sentinel_token"):
119
+ tokenizer.add_tokens([SENTINEL_TOKEN], special_tokens=True)
120
+ tokenizer.sentinel_token = SENTINEL_TOKEN
121
+ tokenizer.sentinel_token_id = tokenizer.convert_tokens_to_ids(SENTINEL_TOKEN)
122
+
123
+
124
+ def preprocess_conversation(
125
+ conversation: Sequence[Dict[str, str]],
126
+ tokenizer: transformers.PreTrainedTokenizer,
127
+ no_system_prompt: bool = False,
128
+ retried: bool = False,
129
+ ) -> Dict[str, Any]:
130
+ inputs = tokenize_conversation(conversation, tokenizer, no_system_prompt=no_system_prompt)
131
+ labels = torch.ones_like(inputs) * IGNORE_INDEX
132
+
133
+ # Generate the template by replacing the assistant's response with a sentinel.
134
+ _maybe_add_sentinel_token(tokenizer)
135
+ template = tokenize_conversation(
136
+ conversation, tokenizer, overrides={"gpt": SENTINEL_TOKEN}, no_system_prompt=no_system_prompt
137
+ )
138
+
139
+ # Remove sentinel tokens from the template.
140
+ mask = torch.ones_like(template, dtype=torch.bool)
141
+ for k in range(template.size(0) - 1):
142
+ if template[k] == tokenizer.sentinel_token_id:
143
+ mask[k : k + 2] = False
144
+ # NOTE(zhijianl): This is to handle the corner case where there is an empty token before the sentinel token.
145
+ if k > 0 and retried:
146
+ mask[k - 1] = False
147
+ template = template[mask]
148
+
149
+ # Match the tokenized conversation with the template (with no assistant's response).
150
+ # Every token that is not matched will be included in the label for training.
151
+ p = 0
152
+ for k in range(inputs.size(0)):
153
+ if p < template.size(0) and inputs[k] == template[p]:
154
+ p += 1
155
+ else:
156
+ labels[k] = inputs[k]
157
+
158
+ # Mask all tokens in the label if the template is not fully matched.
159
+ if p < template.size(0):
160
+ if not retried:
161
+ return preprocess_conversation(
162
+ conversation,
163
+ tokenizer,
164
+ no_system_prompt=no_system_prompt,
165
+ retried=True,
166
+ )
167
+ print(f"Failed to process the conversation: '{conversation}'. All tokens will be masked in the label.")
168
+ labels[:] = IGNORE_INDEX
169
+
170
+ return {"input_ids": inputs, "labels": labels}
171
+
172
+
173
+ def infer_stop_tokens(tokenizer: transformers.PreTrainedTokenizer) -> List[str]:
174
+ _maybe_add_sentinel_token(tokenizer)
175
+ template = tokenize_conversation(DUMMY_CONVERSATION, tokenizer, overrides={"gpt": SENTINEL_TOKEN})
176
+
177
+ stop_tokens = {tokenizer.eos_token}
178
+ for k in range(template.size(0) - 1):
179
+ if template[k] == tokenizer.sentinel_token_id:
180
+ stop_token = tokenizer.decode(template[k + 1])
181
+ stop_tokens.add(stop_token)
182
+ return list(stop_tokens)
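
`preprocess_conversation` above derives the training labels by tokenizing the conversation twice: once verbatim and once with every assistant turn replaced by `SENTINEL_TOKEN`, then removing the sentinel (and the token after it) from the template and matching the two sequences with a single forward scan. A simplified sketch of that scan on plain token-id lists follows; the `IGNORE_INDEX = -100` value is the usual Hugging Face convention and is assumed here (the real value comes from `.constants`).

```python
IGNORE_INDEX = -100  # assumed value; imported from .constants in the module above

def mask_labels(inputs, template):
    """Tokens that also appear, in order, in the sentinel-stripped template
    (system prompt, role markers, user turns) are ignored; everything else
    (the assistant responses and their stop tokens) is kept as a target."""
    labels = [IGNORE_INDEX] * len(inputs)
    p = 0
    for k, tok in enumerate(inputs):
        if p < len(template) and tok == template[p]:
            p += 1            # shared chat scaffolding -> ignored
        else:
            labels[k] = tok   # assistant content -> supervised
    return labels

# inputs:   <user> 11 12 <assistant> 21 22 <eos>
# template: <user> 11 12 <assistant>           (sentinel and the token after it removed)
print(mask_labels([1, 11, 12, 2, 21, 22, 9], [1, 11, 12, 2]))
# [-100, -100, -100, -100, 21, 22, 9]
```

Because the sentinel and its following token were dropped from the template, the assistant's stop token stays supervised together with the response itself.
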
trainer_state.json ADDED
@@ -0,0 +1,3311 @@
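
The `log_history` entries below record the per-step training loss, gradient norm, and learning rate. A minimal sketch for pulling the loss curve out of this file, assuming the standard Hugging Face `trainer_state.json` layout shown here:

```python
import json

# Read trainer_state.json and collect the per-step training loss from log_history.
with open("trainer_state.json") as f:
    state = json.load(f)

steps, losses = [], []
for entry in state["log_history"]:
    if "loss" in entry:   # skip eval/summary entries that carry no training loss
        steps.append(entry["step"])
        losses.append(entry["loss"])

print(f"{len(steps)} logged steps; first loss {losses[0]:.3f}, last loss {losses[-1]:.3f}")
```
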
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 467,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 11.039007186889648,
14
+ "learning_rate": 1.3333333333333334e-06,
15
+ "loss": 1.7243,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.0,
20
+ "grad_norm": 11.325847625732422,
21
+ "learning_rate": 2.666666666666667e-06,
22
+ "loss": 1.7232,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "grad_norm": 11.024140357971191,
28
+ "learning_rate": 4.000000000000001e-06,
29
+ "loss": 1.7473,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.01,
34
+ "grad_norm": 8.857377052307129,
35
+ "learning_rate": 5.333333333333334e-06,
36
+ "loss": 1.5677,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "grad_norm": 5.129051685333252,
42
+ "learning_rate": 6.666666666666667e-06,
43
+ "loss": 1.3132,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.01,
48
+ "grad_norm": 3.457620143890381,
49
+ "learning_rate": 8.000000000000001e-06,
50
+ "loss": 1.2985,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.01,
55
+ "grad_norm": 2.502241373062134,
56
+ "learning_rate": 9.333333333333334e-06,
57
+ "loss": 1.1922,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.02,
62
+ "grad_norm": 2.6525237560272217,
63
+ "learning_rate": 1.0666666666666667e-05,
64
+ "loss": 1.1783,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.02,
69
+ "grad_norm": 2.580990791320801,
70
+ "learning_rate": 1.2e-05,
71
+ "loss": 1.1252,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.02,
76
+ "grad_norm": 2.4445464611053467,
77
+ "learning_rate": 1.3333333333333333e-05,
78
+ "loss": 1.1204,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.02,
83
+ "grad_norm": 2.5538313388824463,
84
+ "learning_rate": 1.4666666666666666e-05,
85
+ "loss": 1.0808,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.03,
90
+ "grad_norm": 2.922621488571167,
91
+ "learning_rate": 1.6000000000000003e-05,
92
+ "loss": 1.0484,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.03,
97
+ "grad_norm": 1.6075185537338257,
98
+ "learning_rate": 1.7333333333333336e-05,
99
+ "loss": 1.0798,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.03,
104
+ "grad_norm": 2.0998339653015137,
105
+ "learning_rate": 1.866666666666667e-05,
106
+ "loss": 1.023,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.03,
111
+ "grad_norm": 1.311397910118103,
112
+ "learning_rate": 2e-05,
113
+ "loss": 1.0424,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.03,
118
+ "grad_norm": 1.4649641513824463,
119
+ "learning_rate": 1.9999758458848847e-05,
120
+ "loss": 0.9873,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.04,
125
+ "grad_norm": 1.5159320831298828,
126
+ "learning_rate": 1.9999033847063813e-05,
127
+ "loss": 1.0423,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.04,
132
+ "grad_norm": 1.6150208711624146,
133
+ "learning_rate": 1.9997826199649607e-05,
134
+ "loss": 0.9522,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.04,
139
+ "grad_norm": 2.5012216567993164,
140
+ "learning_rate": 1.9996135574945543e-05,
141
+ "loss": 0.9858,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.04,
146
+ "grad_norm": 1.7912406921386719,
147
+ "learning_rate": 1.9993962054622703e-05,
148
+ "loss": 0.966,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.04,
153
+ "grad_norm": 1.5078647136688232,
154
+ "learning_rate": 1.9991305743680013e-05,
155
+ "loss": 0.9418,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.05,
160
+ "grad_norm": 1.0531651973724365,
161
+ "learning_rate": 1.9988166770439156e-05,
162
+ "loss": 0.9789,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.05,
167
+ "grad_norm": 1.525269865989685,
168
+ "learning_rate": 1.9984545286538362e-05,
169
+ "loss": 0.9383,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.05,
174
+ "grad_norm": 1.369185447692871,
175
+ "learning_rate": 1.9980441466925118e-05,
176
+ "loss": 0.9662,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.05,
181
+ "grad_norm": 1.1335804462432861,
182
+ "learning_rate": 1.9975855509847688e-05,
183
+ "loss": 0.9393,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.06,
188
+ "grad_norm": 1.4465155601501465,
189
+ "learning_rate": 1.9970787636845536e-05,
190
+ "loss": 0.933,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.06,
195
+ "grad_norm": 1.7765053510665894,
196
+ "learning_rate": 1.9965238092738643e-05,
197
+ "loss": 0.9219,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.06,
202
+ "grad_norm": 0.8634375333786011,
203
+ "learning_rate": 1.9959207145615663e-05,
204
+ "loss": 0.9462,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.06,
209
+ "grad_norm": 1.3061445951461792,
210
+ "learning_rate": 1.9952695086820975e-05,
211
+ "loss": 0.8913,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "grad_norm": 1.3201128244400024,
217
+ "learning_rate": 1.9945702230940616e-05,
218
+ "loss": 0.9069,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.07,
223
+ "grad_norm": 1.1161390542984009,
224
+ "learning_rate": 1.993822891578708e-05,
225
+ "loss": 0.914,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.07,
230
+ "grad_norm": 1.1489887237548828,
231
+ "learning_rate": 1.9930275502382993e-05,
232
+ "loss": 0.8876,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.07,
237
+ "grad_norm": 1.072081446647644,
238
+ "learning_rate": 1.9921842374943682e-05,
239
+ "loss": 0.9394,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.07,
244
+ "grad_norm": 1.204382061958313,
245
+ "learning_rate": 1.9912929940858607e-05,
246
+ "loss": 0.8852,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.07,
251
+ "grad_norm": 1.0732938051223755,
252
+ "learning_rate": 1.9903538630671687e-05,
253
+ "loss": 0.9019,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.08,
258
+ "grad_norm": 1.0138473510742188,
259
+ "learning_rate": 1.9893668898060504e-05,
260
+ "loss": 0.8915,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.08,
265
+ "grad_norm": 1.2495840787887573,
266
+ "learning_rate": 1.988332121981436e-05,
267
+ "loss": 0.8955,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.08,
272
+ "grad_norm": 1.1097376346588135,
273
+ "learning_rate": 1.9872496095811287e-05,
274
+ "loss": 0.8872,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.08,
279
+ "grad_norm": 1.0911654233932495,
280
+ "learning_rate": 1.9861194048993865e-05,
281
+ "loss": 0.9061,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.09,
286
+ "grad_norm": 1.078086018562317,
287
+ "learning_rate": 1.9849415625343972e-05,
288
+ "loss": 0.8869,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.09,
293
+ "grad_norm": 1.57882821559906,
294
+ "learning_rate": 1.9837161393856413e-05,
295
+ "loss": 0.8587,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.09,
300
+ "grad_norm": 1.0213719606399536,
301
+ "learning_rate": 1.982443194651142e-05,
302
+ "loss": 0.9093,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.09,
307
+ "grad_norm": 1.8046919107437134,
308
+ "learning_rate": 1.9811227898246072e-05,
309
+ "loss": 0.8551,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.09,
314
+ "grad_norm": 1.0796761512756348,
315
+ "learning_rate": 1.979754988692457e-05,
316
+ "loss": 0.9138,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.1,
321
+ "grad_norm": 1.517764687538147,
322
+ "learning_rate": 1.978339857330743e-05,
323
+ "loss": 0.8252,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.1,
328
+ "grad_norm": 1.3809912204742432,
329
+ "learning_rate": 1.976877464101957e-05,
330
+ "loss": 0.8894,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.1,
335
+ "grad_norm": 1.5511187314987183,
336
+ "learning_rate": 1.975367879651728e-05,
337
+ "loss": 0.8437,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.1,
342
+ "grad_norm": 1.6329996585845947,
343
+ "learning_rate": 1.9738111769054095e-05,
344
+ "loss": 0.9215,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.1,
349
+ "grad_norm": 1.3756284713745117,
350
+ "learning_rate": 1.9722074310645553e-05,
351
+ "loss": 0.8401,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.11,
356
+ "grad_norm": 1.7832353115081787,
357
+ "learning_rate": 1.9705567196032894e-05,
358
+ "loss": 0.8396,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.11,
363
+ "grad_norm": 1.3009949922561646,
364
+ "learning_rate": 1.9688591222645607e-05,
365
+ "loss": 0.8627,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.11,
370
+ "grad_norm": 1.448974847793579,
371
+ "learning_rate": 1.9671147210562925e-05,
372
+ "loss": 0.858,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.11,
377
+ "grad_norm": 1.298194169998169,
378
+ "learning_rate": 1.9653236002474202e-05,
379
+ "loss": 0.8495,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.12,
384
+ "grad_norm": 1.0985174179077148,
385
+ "learning_rate": 1.96348584636382e-05,
386
+ "loss": 0.8706,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.12,
391
+ "grad_norm": 1.7281138896942139,
392
+ "learning_rate": 1.9616015481841293e-05,
393
+ "loss": 0.8665,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.12,
398
+ "grad_norm": 1.2161897420883179,
399
+ "learning_rate": 1.9596707967354588e-05,
400
+ "loss": 0.8657,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.12,
405
+ "grad_norm": 1.1948484182357788,
406
+ "learning_rate": 1.9576936852889937e-05,
407
+ "loss": 0.8545,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.12,
412
+ "grad_norm": 1.8918001651763916,
413
+ "learning_rate": 1.955670309355489e-05,
414
+ "loss": 0.8358,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.13,
419
+ "grad_norm": 1.1286191940307617,
420
+ "learning_rate": 1.9536007666806555e-05,
421
+ "loss": 0.8407,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.13,
426
+ "grad_norm": 1.198012113571167,
427
+ "learning_rate": 1.951485157240437e-05,
428
+ "loss": 0.8662,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.13,
433
+ "grad_norm": 2.0095624923706055,
434
+ "learning_rate": 1.9493235832361812e-05,
435
+ "loss": 0.8681,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.13,
440
+ "grad_norm": 1.1153709888458252,
441
+ "learning_rate": 1.9471161490897027e-05,
442
+ "loss": 0.8658,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.13,
447
+ "grad_norm": 1.3872712850570679,
448
+ "learning_rate": 1.9448629614382394e-05,
449
+ "loss": 0.822,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.14,
454
+ "grad_norm": 1.0818780660629272,
455
+ "learning_rate": 1.942564129129298e-05,
456
+ "loss": 0.9052,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.14,
461
+ "grad_norm": 1.1288385391235352,
462
+ "learning_rate": 1.940219763215399e-05,
463
+ "loss": 0.8246,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.14,
468
+ "grad_norm": 0.9448270797729492,
469
+ "learning_rate": 1.9378299769487116e-05,
470
+ "loss": 0.856,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.14,
475
+ "grad_norm": 0.8516116142272949,
476
+ "learning_rate": 1.93539488577558e-05,
477
+ "loss": 0.8436,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.15,
482
+ "grad_norm": 0.9422905445098877,
483
+ "learning_rate": 1.9329146073309502e-05,
484
+ "loss": 0.8396,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.15,
489
+ "grad_norm": 0.8786196112632751,
490
+ "learning_rate": 1.9303892614326835e-05,
491
+ "loss": 0.8769,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.15,
496
+ "grad_norm": 1.207822322845459,
497
+ "learning_rate": 1.9278189700757717e-05,
498
+ "loss": 0.8053,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.15,
503
+ "grad_norm": 1.005181074142456,
504
+ "learning_rate": 1.9252038574264403e-05,
505
+ "loss": 0.8608,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.15,
510
+ "grad_norm": 1.247426986694336,
511
+ "learning_rate": 1.9225440498161544e-05,
512
+ "loss": 0.8336,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.16,
517
+ "grad_norm": 0.9933120012283325,
518
+ "learning_rate": 1.9198396757355118e-05,
519
+ "loss": 0.8575,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.16,
524
+ "grad_norm": 0.9208722114562988,
525
+ "learning_rate": 1.9170908658280388e-05,
526
+ "loss": 0.8066,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.16,
531
+ "grad_norm": 0.8881359100341797,
532
+ "learning_rate": 1.9142977528838763e-05,
533
+ "loss": 0.8786,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.16,
538
+ "grad_norm": 1.1525728702545166,
539
+ "learning_rate": 1.911460471833368e-05,
540
+ "loss": 0.8305,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.16,
545
+ "grad_norm": 1.4480865001678467,
546
+ "learning_rate": 1.9085791597405404e-05,
547
+ "loss": 0.8406,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.17,
552
+ "grad_norm": 0.8593180179595947,
553
+ "learning_rate": 1.9056539557964814e-05,
554
+ "loss": 0.8806,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.17,
559
+ "grad_norm": 0.9452027082443237,
560
+ "learning_rate": 1.902685001312616e-05,
561
+ "loss": 0.8047,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.17,
566
+ "grad_norm": 1.3369029760360718,
567
+ "learning_rate": 1.8996724397138813e-05,
568
+ "loss": 0.8317,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.17,
573
+ "grad_norm": 0.8937678337097168,
574
+ "learning_rate": 1.8966164165317968e-05,
575
+ "loss": 0.8348,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.18,
580
+ "grad_norm": 1.0756009817123413,
581
+ "learning_rate": 1.8935170793974335e-05,
582
+ "loss": 0.8271,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.18,
587
+ "grad_norm": 0.8728197813034058,
588
+ "learning_rate": 1.8903745780342838e-05,
589
+ "loss": 0.8578,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.18,
594
+ "grad_norm": 1.3119213581085205,
595
+ "learning_rate": 1.887189064251027e-05,
596
+ "loss": 0.7796,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.18,
601
+ "grad_norm": 1.1723086833953857,
602
+ "learning_rate": 1.883960691934196e-05,
603
+ "loss": 0.8497,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.18,
608
+ "grad_norm": 1.2870450019836426,
609
+ "learning_rate": 1.8806896170407437e-05,
610
+ "loss": 0.8096,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.19,
615
+ "grad_norm": 1.0081167221069336,
616
+ "learning_rate": 1.8773759975905098e-05,
617
+ "loss": 0.878,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.19,
622
+ "grad_norm": 1.154690146446228,
623
+ "learning_rate": 1.8740199936585856e-05,
624
+ "loss": 0.7973,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.19,
629
+ "grad_norm": 1.2520458698272705,
630
+ "learning_rate": 1.8706217673675813e-05,
631
+ "loss": 0.8218,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.19,
636
+ "grad_norm": 1.1887520551681519,
637
+ "learning_rate": 1.867181482879795e-05,
638
+ "loss": 0.7935,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.19,
643
+ "grad_norm": 1.1408494710922241,
644
+ "learning_rate": 1.8636993063892822e-05,
645
+ "loss": 0.874,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.2,
650
+ "grad_norm": 0.9687843322753906,
651
+ "learning_rate": 1.8601754061138258e-05,
652
+ "loss": 0.7991,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.2,
657
+ "grad_norm": 1.1839170455932617,
658
+ "learning_rate": 1.8566099522868118e-05,
659
+ "loss": 0.8639,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.2,
664
+ "grad_norm": 1.6939510107040405,
665
+ "learning_rate": 1.8530031171490055e-05,
666
+ "loss": 0.7854,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.2,
671
+ "grad_norm": 1.2965248823165894,
672
+ "learning_rate": 1.8493550749402278e-05,
673
+ "loss": 0.8231,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.21,
678
+ "grad_norm": 1.025974154472351,
679
+ "learning_rate": 1.8456660018909424e-05,
680
+ "loss": 0.8452,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.21,
685
+ "grad_norm": 1.2490646839141846,
686
+ "learning_rate": 1.8419360762137395e-05,
687
+ "loss": 0.7846,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.21,
692
+ "grad_norm": 0.8493202924728394,
693
+ "learning_rate": 1.8381654780947272e-05,
694
+ "loss": 0.8648,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.21,
699
+ "grad_norm": 1.1620343923568726,
700
+ "learning_rate": 1.8343543896848275e-05,
701
+ "loss": 0.8261,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.21,
706
+ "grad_norm": 0.9533255100250244,
707
+ "learning_rate": 1.830502995090977e-05,
708
+ "loss": 0.847,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.22,
713
+ "grad_norm": 1.254692554473877,
714
+ "learning_rate": 1.826611480367232e-05,
715
+ "loss": 0.8101,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.22,
720
+ "grad_norm": 1.0770541429519653,
721
+ "learning_rate": 1.822680033505782e-05,
722
+ "loss": 0.8249,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.22,
727
+ "grad_norm": 0.9607298374176025,
728
+ "learning_rate": 1.8187088444278675e-05,
729
+ "loss": 0.823,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.22,
734
+ "grad_norm": 0.8450298309326172,
735
+ "learning_rate": 1.814698104974604e-05,
736
+ "loss": 0.789,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.22,
741
+ "grad_norm": 1.1690232753753662,
742
+ "learning_rate": 1.8106480088977174e-05,
743
+ "loss": 0.86,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.23,
748
+ "grad_norm": 0.7981148362159729,
749
+ "learning_rate": 1.8065587518501806e-05,
750
+ "loss": 0.8124,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.23,
755
+ "grad_norm": 4.888617992401123,
756
+ "learning_rate": 1.8024305313767648e-05,
757
+ "loss": 0.8107,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.23,
762
+ "grad_norm": 1.9401100873947144,
763
+ "learning_rate": 1.798263546904495e-05,
764
+ "loss": 0.8515,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.23,
769
+ "grad_norm": 1.1880918741226196,
770
+ "learning_rate": 1.7940579997330167e-05,
771
+ "loss": 0.8038,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.24,
776
+ "grad_norm": 1.1192213296890259,
777
+ "learning_rate": 1.7898140930248703e-05,
778
+ "loss": 0.8347,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.24,
783
+ "grad_norm": 1.640434741973877,
784
+ "learning_rate": 1.7855320317956785e-05,
785
+ "loss": 0.8175,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.24,
790
+ "grad_norm": 0.8676750063896179,
791
+ "learning_rate": 1.7812120229042415e-05,
792
+ "loss": 0.844,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.24,
797
+ "grad_norm": 1.189393401145935,
798
+ "learning_rate": 1.7768542750425427e-05,
799
+ "loss": 0.7812,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.24,
804
+ "grad_norm": 0.8515229821205139,
805
+ "learning_rate": 1.7724589987256697e-05,
806
+ "loss": 0.8528,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.25,
811
+ "grad_norm": 1.701436996459961,
812
+ "learning_rate": 1.768026406281642e-05,
813
+ "loss": 0.8155,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.25,
818
+ "grad_norm": 1.211014986038208,
819
+ "learning_rate": 1.7635567118411568e-05,
820
+ "loss": 0.8411,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.25,
825
+ "grad_norm": 1.2689430713653564,
826
+ "learning_rate": 1.7590501313272415e-05,
827
+ "loss": 0.8213,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.25,
832
+ "grad_norm": 1.6434332132339478,
833
+ "learning_rate": 1.7545068824448255e-05,
834
+ "loss": 0.8233,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.25,
839
+ "grad_norm": 1.337048888206482,
840
+ "learning_rate": 1.7499271846702216e-05,
841
+ "loss": 0.8001,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.26,
846
+ "grad_norm": 1.2411539554595947,
847
+ "learning_rate": 1.7453112592405245e-05,
848
+ "loss": 0.8476,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.26,
853
+ "grad_norm": 1.685616374015808,
854
+ "learning_rate": 1.740659329142922e-05,
855
+ "loss": 0.7684,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.26,
860
+ "grad_norm": 1.5003278255462646,
861
+ "learning_rate": 1.7359716191039248e-05,
862
+ "loss": 0.8474,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.26,
867
+ "grad_norm": 1.7008017301559448,
868
+ "learning_rate": 1.7312483555785087e-05,
869
+ "loss": 0.8115,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.27,
874
+ "grad_norm": 1.7307039499282837,
875
+ "learning_rate": 1.7264897667391757e-05,
876
+ "loss": 0.8066,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.27,
881
+ "grad_norm": 0.9101468324661255,
882
+ "learning_rate": 1.7216960824649304e-05,
883
+ "loss": 0.8238,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.27,
888
+ "grad_norm": 1.103602647781372,
889
+ "learning_rate": 1.7168675343301768e-05,
890
+ "loss": 0.8162,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.27,
895
+ "grad_norm": 1.2054574489593506,
896
+ "learning_rate": 1.71200435559353e-05,
897
+ "loss": 0.8254,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.27,
902
+ "grad_norm": 1.234502911567688,
903
+ "learning_rate": 1.7071067811865477e-05,
904
+ "loss": 0.8034,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.28,
909
+ "grad_norm": 1.1067628860473633,
910
+ "learning_rate": 1.7021750477023823e-05,
911
+ "loss": 0.7755,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.28,
916
+ "grad_norm": 0.7321228384971619,
917
+ "learning_rate": 1.69720939338435e-05,
918
+ "loss": 0.8535,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.28,
923
+ "grad_norm": 0.8689684867858887,
924
+ "learning_rate": 1.6922100581144228e-05,
925
+ "loss": 0.7752,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.28,
930
+ "grad_norm": 0.9249204397201538,
931
+ "learning_rate": 1.6871772834016406e-05,
932
+ "loss": 0.8373,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.28,
937
+ "grad_norm": 0.8648712635040283,
938
+ "learning_rate": 1.6821113123704425e-05,
939
+ "loss": 0.7638,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.29,
944
+ "grad_norm": 0.8067061901092529,
945
+ "learning_rate": 1.677012389748923e-05,
946
+ "loss": 0.8038,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.29,
951
+ "grad_norm": 0.8623146414756775,
952
+ "learning_rate": 1.671880761857011e-05,
953
+ "loss": 0.8298,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.29,
958
+ "grad_norm": 0.8998252153396606,
959
+ "learning_rate": 1.666716676594567e-05,
960
+ "loss": 0.7686,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.29,
965
+ "grad_norm": 0.9564909934997559,
966
+ "learning_rate": 1.661520383429412e-05,
967
+ "loss": 0.8418,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.3,
972
+ "grad_norm": 0.7597609758377075,
973
+ "learning_rate": 1.6562921333852714e-05,
974
+ "loss": 0.7976,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.3,
979
+ "grad_norm": 1.064211130142212,
980
+ "learning_rate": 1.6510321790296527e-05,
981
+ "loss": 0.8479,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.3,
986
+ "grad_norm": 1.1456950902938843,
987
+ "learning_rate": 1.6457407744616417e-05,
988
+ "loss": 0.7806,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.3,
993
+ "grad_norm": 0.8875635862350464,
994
+ "learning_rate": 1.6404181752996287e-05,
995
+ "loss": 0.8191,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.3,
1000
+ "grad_norm": 1.0326021909713745,
1001
+ "learning_rate": 1.6350646386689593e-05,
1002
+ "loss": 0.8086,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.31,
1007
+ "grad_norm": 0.8035858273506165,
1008
+ "learning_rate": 1.629680423189514e-05,
1009
+ "loss": 0.7771,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.31,
1014
+ "grad_norm": 0.8190425634384155,
1015
+ "learning_rate": 1.6242657889632133e-05,
1016
+ "loss": 0.8167,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.31,
1021
+ "grad_norm": 0.8339990377426147,
1022
+ "learning_rate": 1.618820997561454e-05,
1023
+ "loss": 0.8068,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.31,
1028
+ "grad_norm": 0.827274739742279,
1029
+ "learning_rate": 1.613346312012473e-05,
1030
+ "loss": 0.817,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.31,
1035
+ "grad_norm": 0.7203758955001831,
1036
+ "learning_rate": 1.6078419967886402e-05,
1037
+ "loss": 0.8122,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.32,
1042
+ "grad_norm": 0.7495682835578918,
1043
+ "learning_rate": 1.6023083177936824e-05,
1044
+ "loss": 0.7676,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.32,
1049
+ "grad_norm": 0.6958379745483398,
1050
+ "learning_rate": 1.5967455423498387e-05,
1051
+ "loss": 0.8305,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.32,
1056
+ "grad_norm": 0.99383944272995,
1057
+ "learning_rate": 1.591153939184946e-05,
1058
+ "loss": 0.7984,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.32,
1063
+ "grad_norm": 0.829394519329071,
1064
+ "learning_rate": 1.5855337784194576e-05,
1065
+ "loss": 0.8008,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.33,
1070
+ "grad_norm": 0.7945014834403992,
1071
+ "learning_rate": 1.5798853315533932e-05,
1072
+ "loss": 0.7504,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.33,
1077
+ "grad_norm": 0.7520758509635925,
1078
+ "learning_rate": 1.5742088714532247e-05,
1079
+ "loss": 0.8346,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.33,
1084
+ "grad_norm": 0.8301789164543152,
1085
+ "learning_rate": 1.568504672338694e-05,
1086
+ "loss": 0.7719,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.33,
1091
+ "grad_norm": 1.3911187648773193,
1092
+ "learning_rate": 1.562773009769564e-05,
1093
+ "loss": 0.8335,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.33,
1098
+ "grad_norm": 1.039931297302246,
1099
+ "learning_rate": 1.5570141606323105e-05,
1100
+ "loss": 0.7892,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.34,
1105
+ "grad_norm": 0.801042377948761,
1106
+ "learning_rate": 1.551228403126744e-05,
1107
+ "loss": 0.8124,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.34,
1112
+ "grad_norm": 1.0106760263442993,
1113
+ "learning_rate": 1.5454160167525688e-05,
1114
+ "loss": 0.7651,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.34,
1119
+ "grad_norm": 0.7811651825904846,
1120
+ "learning_rate": 1.5395772822958844e-05,
1121
+ "loss": 0.8168,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.34,
1126
+ "grad_norm": 0.8879010081291199,
1127
+ "learning_rate": 1.5337124818156203e-05,
1128
+ "loss": 0.7364,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.34,
1133
+ "grad_norm": 0.6862936019897461,
1134
+ "learning_rate": 1.5278218986299074e-05,
1135
+ "loss": 0.8275,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.35,
1140
+ "grad_norm": 0.9153168797492981,
1141
+ "learning_rate": 1.5219058173023948e-05,
1142
+ "loss": 0.7984,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.35,
1147
+ "grad_norm": 0.8116987943649292,
1148
+ "learning_rate": 1.515964523628501e-05,
1149
+ "loss": 0.7689,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.35,
1154
+ "grad_norm": 0.7810778617858887,
1155
+ "learning_rate": 1.5099983046216089e-05,
1156
+ "loss": 0.7985,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.35,
1161
+ "grad_norm": 0.6745201945304871,
1162
+ "learning_rate": 1.5040074484992e-05,
1163
+ "loss": 0.8015,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.36,
1168
+ "grad_norm": 0.9147999286651611,
1169
+ "learning_rate": 1.4979922446689308e-05,
1170
+ "loss": 0.8264,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.36,
1175
+ "grad_norm": 0.8092418313026428,
1176
+ "learning_rate": 1.4919529837146529e-05,
1177
+ "loss": 0.743,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.36,
1182
+ "grad_norm": 0.8291578888893127,
1183
+ "learning_rate": 1.4858899573823752e-05,
1184
+ "loss": 0.786,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.36,
1189
+ "grad_norm": 0.6807591915130615,
1190
+ "learning_rate": 1.4798034585661696e-05,
1191
+ "loss": 0.8155,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.36,
1196
+ "grad_norm": 0.8842042088508606,
1197
+ "learning_rate": 1.4736937812940217e-05,
1198
+ "loss": 0.7765,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.37,
1203
+ "grad_norm": 0.8237358927726746,
1204
+ "learning_rate": 1.4675612207136283e-05,
1205
+ "loss": 0.7783,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.37,
1210
+ "grad_norm": 0.661469578742981,
1211
+ "learning_rate": 1.4614060730781377e-05,
1212
+ "loss": 0.7716,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.37,
1217
+ "grad_norm": 0.7561662197113037,
1218
+ "learning_rate": 1.455228635731839e-05,
1219
+ "loss": 0.7934,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.37,
1224
+ "grad_norm": 0.6873330473899841,
1225
+ "learning_rate": 1.4490292070957978e-05,
1226
+ "loss": 0.7654,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.37,
1231
+ "grad_norm": 0.7876659631729126,
1232
+ "learning_rate": 1.4428080866534397e-05,
1233
+ "loss": 0.7754,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.38,
1238
+ "grad_norm": 0.7588985562324524,
1239
+ "learning_rate": 1.4365655749360833e-05,
1240
+ "loss": 0.8073,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.38,
1245
+ "grad_norm": 1.0146478414535522,
1246
+ "learning_rate": 1.4303019735084225e-05,
1247
+ "loss": 0.8115,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.38,
1252
+ "grad_norm": 1.0474367141723633,
1253
+ "learning_rate": 1.4240175849539566e-05,
1254
+ "loss": 0.7662,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.38,
1259
+ "grad_norm": 0.8567104935646057,
1260
+ "learning_rate": 1.4177127128603748e-05,
1261
+ "loss": 0.8192,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.39,
1266
+ "grad_norm": 0.7369076609611511,
1267
+ "learning_rate": 1.4113876618048896e-05,
1268
+ "loss": 0.7796,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.39,
1273
+ "grad_norm": 1.021986961364746,
1274
+ "learning_rate": 1.4050427373395241e-05,
1275
+ "loss": 0.743,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.39,
1280
+ "grad_norm": 0.799660325050354,
1281
+ "learning_rate": 1.3986782459763499e-05,
1282
+ "loss": 0.7985,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.39,
1287
+ "grad_norm": 0.9272130131721497,
1288
+ "learning_rate": 1.3922944951726811e-05,
1289
+ "loss": 0.7779,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.39,
1294
+ "grad_norm": 0.7575782537460327,
1295
+ "learning_rate": 1.3858917933162212e-05,
1296
+ "loss": 0.8191,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.4,
1301
+ "grad_norm": 0.7355371117591858,
1302
+ "learning_rate": 1.3794704497101656e-05,
1303
+ "loss": 0.7801,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.4,
1308
+ "grad_norm": 1.0130892992019653,
1309
+ "learning_rate": 1.3730307745582594e-05,
1310
+ "loss": 0.8038,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.4,
1315
+ "grad_norm": 0.7086817622184753,
1316
+ "learning_rate": 1.366573078949813e-05,
1317
+ "loss": 0.7514,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.4,
1322
+ "grad_norm": 0.8990337252616882,
1323
+ "learning_rate": 1.3600976748446722e-05,
1324
+ "loss": 0.8257,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.4,
1329
+ "grad_norm": 2.2387804985046387,
1330
+ "learning_rate": 1.3536048750581494e-05,
1331
+ "loss": 0.7783,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.41,
1336
+ "grad_norm": 0.8208438754081726,
1337
+ "learning_rate": 1.3470949932459116e-05,
1338
+ "loss": 0.7705,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.41,
1343
+ "grad_norm": 0.7612828612327576,
1344
+ "learning_rate": 1.3405683438888281e-05,
1345
+ "loss": 0.7839,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.41,
1350
+ "grad_norm": 0.6562499403953552,
1351
+ "learning_rate": 1.3340252422777788e-05,
1352
+ "loss": 0.8068,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.41,
1357
+ "grad_norm": 0.784289538860321,
1358
+ "learning_rate": 1.3274660044984225e-05,
1359
+ "loss": 0.8028,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.42,
1364
+ "grad_norm": 0.7543610334396362,
1365
+ "learning_rate": 1.3208909474159279e-05,
1366
+ "loss": 0.7688,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.42,
1371
+ "grad_norm": 0.6484317779541016,
1372
+ "learning_rate": 1.314300388659667e-05,
1373
+ "loss": 0.8161,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.42,
1378
+ "grad_norm": 0.9675614833831787,
1379
+ "learning_rate": 1.3076946466078691e-05,
1380
+ "loss": 0.7715,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.42,
1385
+ "grad_norm": 0.7833143472671509,
1386
+ "learning_rate": 1.301074040372242e-05,
1387
+ "loss": 0.7748,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.42,
1392
+ "grad_norm": 0.9204770922660828,
1393
+ "learning_rate": 1.2944388897825559e-05,
1394
+ "loss": 0.7725,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.43,
1399
+ "grad_norm": 0.789046049118042,
1400
+ "learning_rate": 1.2877895153711935e-05,
1401
+ "loss": 0.7526,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.43,
1406
+ "grad_norm": 0.7482736706733704,
1407
+ "learning_rate": 1.2811262383576646e-05,
1408
+ "loss": 0.8268,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.43,
1413
+ "grad_norm": 0.9972829222679138,
1414
+ "learning_rate": 1.274449380633089e-05,
1415
+ "loss": 0.7505,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.43,
1420
+ "grad_norm": 0.727154016494751,
1421
+ "learning_rate": 1.2677592647446472e-05,
1422
+ "loss": 0.7953,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.43,
1427
+ "grad_norm": 0.7113155126571655,
1428
+ "learning_rate": 1.2610562138799977e-05,
1429
+ "loss": 0.7877,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.44,
1434
+ "grad_norm": 0.7132176756858826,
1435
+ "learning_rate": 1.2543405518516651e-05,
1436
+ "loss": 0.8088,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.44,
1441
+ "grad_norm": 2.328761339187622,
1442
+ "learning_rate": 1.2476126030813964e-05,
1443
+ "loss": 0.7521,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.44,
1448
+ "grad_norm": 0.6288961172103882,
1449
+ "learning_rate": 1.24087269258449e-05,
1450
+ "loss": 0.7779,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.44,
1455
+ "grad_norm": 0.7735608816146851,
1456
+ "learning_rate": 1.234121145954094e-05,
1457
+ "loss": 0.7624,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.45,
1462
+ "grad_norm": 0.845016598701477,
1463
+ "learning_rate": 1.2273582893454774e-05,
1464
+ "loss": 0.7804,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.45,
1469
+ "grad_norm": 0.8225258588790894,
1470
+ "learning_rate": 1.2205844494602741e-05,
1471
+ "loss": 0.7665,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.45,
1476
+ "grad_norm": 0.9022204875946045,
1477
+ "learning_rate": 1.213799953530701e-05,
1478
+ "loss": 0.7671,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.45,
1483
+ "grad_norm": 0.7139418721199036,
1484
+ "learning_rate": 1.2070051293037493e-05,
1485
+ "loss": 0.8215,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.45,
1490
+ "grad_norm": 1.054016351699829,
1491
+ "learning_rate": 1.2002003050253524e-05,
1492
+ "loss": 0.7387,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.46,
1497
+ "grad_norm": 0.7111931443214417,
1498
+ "learning_rate": 1.1933858094245281e-05,
1499
+ "loss": 0.8172,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.46,
1504
+ "grad_norm": 0.7568846940994263,
1505
+ "learning_rate": 1.1865619716974986e-05,
1506
+ "loss": 0.745,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.46,
1511
+ "grad_norm": 0.8243083953857422,
1512
+ "learning_rate": 1.1797291214917882e-05,
1513
+ "loss": 0.8177,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.46,
1518
+ "grad_norm": 0.8872765898704529,
1519
+ "learning_rate": 1.1728875888902975e-05,
1520
+ "loss": 0.7488,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.46,
1525
+ "grad_norm": 0.9252672791481018,
1526
+ "learning_rate": 1.1660377043953588e-05,
1527
+ "loss": 0.7788,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.47,
1532
+ "grad_norm": 0.7096117734909058,
1533
+ "learning_rate": 1.1591797989127691e-05,
1534
+ "loss": 0.7839,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.47,
1539
+ "grad_norm": 1.0665735006332397,
1540
+ "learning_rate": 1.152314203735805e-05,
1541
+ "loss": 0.7926,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.47,
1546
+ "grad_norm": 0.9210519790649414,
1547
+ "learning_rate": 1.14544125052922e-05,
1548
+ "loss": 0.7637,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.47,
1553
+ "grad_norm": 0.7430177927017212,
1554
+ "learning_rate": 1.1385612713132191e-05,
1555
+ "loss": 0.7781,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.48,
1560
+ "grad_norm": 0.6779014468193054,
1561
+ "learning_rate": 1.1316745984474227e-05,
1562
+ "loss": 0.7843,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.48,
1567
+ "grad_norm": 0.9180762767791748,
1568
+ "learning_rate": 1.1247815646148088e-05,
1569
+ "loss": 0.7957,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.48,
1574
+ "grad_norm": 0.9458864331245422,
1575
+ "learning_rate": 1.117882502805643e-05,
1576
+ "loss": 0.8011,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.48,
1581
+ "grad_norm": 0.9582037925720215,
1582
+ "learning_rate": 1.1109777463013915e-05,
1583
+ "loss": 0.743,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.48,
1588
+ "grad_norm": 0.9602392911911011,
1589
+ "learning_rate": 1.1040676286586212e-05,
1590
+ "loss": 0.7724,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.49,
1595
+ "grad_norm": 0.6800273060798645,
1596
+ "learning_rate": 1.097152483692886e-05,
1597
+ "loss": 0.8166,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.49,
1602
+ "grad_norm": 0.7832956314086914,
1603
+ "learning_rate": 1.0902326454626012e-05,
1604
+ "loss": 0.7304,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.49,
1609
+ "grad_norm": 0.8246415853500366,
1610
+ "learning_rate": 1.0833084482529048e-05,
1611
+ "loss": 0.8128,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.49,
1616
+ "grad_norm": 0.748362123966217,
1617
+ "learning_rate": 1.0763802265595103e-05,
1618
+ "loss": 0.7449,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.49,
1623
+ "grad_norm": 0.7535527348518372,
1624
+ "learning_rate": 1.0694483150725458e-05,
1625
+ "loss": 0.8146,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.5,
1630
+ "grad_norm": 1.175562858581543,
1631
+ "learning_rate": 1.0625130486603879e-05,
1632
+ "loss": 0.7621,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.5,
1637
+ "grad_norm": 1.3196650743484497,
1638
+ "learning_rate": 1.055574762353483e-05,
1639
+ "loss": 0.7666,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.5,
1644
+ "grad_norm": 0.9028423428535461,
1645
+ "learning_rate": 1.0486337913281633e-05,
1646
+ "loss": 0.8021,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.5,
1651
+ "grad_norm": 1.6453968286514282,
1652
+ "learning_rate": 1.041690470890455e-05,
1653
+ "loss": 0.7432,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.51,
1658
+ "grad_norm": 0.6863554120063782,
1659
+ "learning_rate": 1.0347451364598805e-05,
1660
+ "loss": 0.7589,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.51,
1665
+ "grad_norm": 0.7064222693443298,
1666
+ "learning_rate": 1.0277981235532541e-05,
1667
+ "loss": 0.7894,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.51,
1672
+ "grad_norm": 0.941105306148529,
1673
+ "learning_rate": 1.0208497677684755e-05,
1674
+ "loss": 0.7692,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.51,
1679
+ "grad_norm": 0.8583152294158936,
1680
+ "learning_rate": 1.0139004047683152e-05,
1681
+ "loss": 0.7511,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.51,
1686
+ "grad_norm": 1.788122296333313,
1687
+ "learning_rate": 1.0069503702642011e-05,
1688
+ "loss": 0.7827,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.52,
1693
+ "grad_norm": 0.7599253058433533,
1694
+ "learning_rate": 1e-05,
1695
+ "loss": 0.7404,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.52,
1700
+ "grad_norm": 0.70832759141922,
1701
+ "learning_rate": 9.930496297357994e-06,
1702
+ "loss": 0.816,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.52,
1707
+ "grad_norm": 0.618488609790802,
1708
+ "learning_rate": 9.860995952316851e-06,
1709
+ "loss": 0.7423,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.52,
1714
+ "grad_norm": 0.7473293542861938,
1715
+ "learning_rate": 9.791502322315249e-06,
1716
+ "loss": 0.7795,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.52,
1721
+ "grad_norm": 0.7179411053657532,
1722
+ "learning_rate": 9.72201876446746e-06,
1723
+ "loss": 0.7848,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.53,
1728
+ "grad_norm": 0.7268110513687134,
1729
+ "learning_rate": 9.6525486354012e-06,
1730
+ "loss": 0.7148,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.53,
1735
+ "grad_norm": 0.7188340425491333,
1736
+ "learning_rate": 9.583095291095454e-06,
1737
+ "loss": 0.8226,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.53,
1742
+ "grad_norm": 1.236936092376709,
1743
+ "learning_rate": 9.513662086718372e-06,
1744
+ "loss": 0.7436,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.53,
1749
+ "grad_norm": 1.1389752626419067,
1750
+ "learning_rate": 9.444252376465171e-06,
1751
+ "loss": 0.7829,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.54,
1756
+ "grad_norm": 0.7218825221061707,
1757
+ "learning_rate": 9.374869513396123e-06,
1758
+ "loss": 0.7686,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.54,
1763
+ "grad_norm": 0.9832814931869507,
1764
+ "learning_rate": 9.305516849274542e-06,
1765
+ "loss": 0.7705,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.54,
1770
+ "grad_norm": 0.7646653652191162,
1771
+ "learning_rate": 9.2361977344049e-06,
1772
+ "loss": 0.7855,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.54,
1777
+ "grad_norm": 1.1526681184768677,
1778
+ "learning_rate": 9.166915517470953e-06,
1779
+ "loss": 0.7537,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.54,
1784
+ "grad_norm": 0.7619354128837585,
1785
+ "learning_rate": 9.09767354537399e-06,
1786
+ "loss": 0.807,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.55,
1791
+ "grad_norm": 0.7558615207672119,
1792
+ "learning_rate": 9.028475163071142e-06,
1793
+ "loss": 0.7571,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.55,
1798
+ "grad_norm": 0.8925402164459229,
1799
+ "learning_rate": 8.959323713413792e-06,
1800
+ "loss": 0.7655,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.55,
1805
+ "grad_norm": 1.0979193449020386,
1806
+ "learning_rate": 8.890222536986085e-06,
1807
+ "loss": 0.7738,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.55,
1812
+ "grad_norm": 0.7939193248748779,
1813
+ "learning_rate": 8.821174971943573e-06,
1814
+ "loss": 0.7993,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.55,
1819
+ "grad_norm": 0.8744078278541565,
1820
+ "learning_rate": 8.752184353851917e-06,
1821
+ "loss": 0.7523,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.56,
1826
+ "grad_norm": 0.7564054727554321,
1827
+ "learning_rate": 8.683254015525776e-06,
1828
+ "loss": 0.7687,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.56,
1833
+ "grad_norm": 0.7039680480957031,
1834
+ "learning_rate": 8.614387286867814e-06,
1835
+ "loss": 0.7861,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.56,
1840
+ "grad_norm": 0.7174641489982605,
1841
+ "learning_rate": 8.545587494707803e-06,
1842
+ "loss": 0.7807,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.56,
1847
+ "grad_norm": 1.087963342666626,
1848
+ "learning_rate": 8.476857962641951e-06,
1849
+ "loss": 0.7467,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.57,
1854
+ "grad_norm": 0.7684459090232849,
1855
+ "learning_rate": 8.408202010872312e-06,
1856
+ "loss": 0.7567,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.57,
1861
+ "grad_norm": 0.7861683964729309,
1862
+ "learning_rate": 8.339622956046417e-06,
1863
+ "loss": 0.7847,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.57,
1868
+ "grad_norm": 0.7736767530441284,
1869
+ "learning_rate": 8.271124111097026e-06,
1870
+ "loss": 0.7639,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.57,
1875
+ "grad_norm": 0.8550255298614502,
1876
+ "learning_rate": 8.202708785082122e-06,
1877
+ "loss": 0.7774,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.57,
1882
+ "grad_norm": 0.7113362550735474,
1883
+ "learning_rate": 8.134380283025014e-06,
1884
+ "loss": 0.785,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.58,
1889
+ "grad_norm": 0.7256139516830444,
1890
+ "learning_rate": 8.066141905754724e-06,
1891
+ "loss": 0.7625,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.58,
1896
+ "grad_norm": 0.7075888514518738,
1897
+ "learning_rate": 7.997996949746478e-06,
1898
+ "loss": 0.7464,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.58,
1903
+ "grad_norm": 0.796419620513916,
1904
+ "learning_rate": 7.929948706962508e-06,
1905
+ "loss": 0.7859,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.58,
1910
+ "grad_norm": 0.7683595418930054,
1911
+ "learning_rate": 7.862000464692992e-06,
1912
+ "loss": 0.76,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.58,
1917
+ "grad_norm": 0.8139968514442444,
1918
+ "learning_rate": 7.79415550539726e-06,
1919
+ "loss": 0.7559,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.59,
1924
+ "grad_norm": 0.6886945962905884,
1925
+ "learning_rate": 7.726417106545231e-06,
1926
+ "loss": 0.7708,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.59,
1931
+ "grad_norm": 0.7851764559745789,
1932
+ "learning_rate": 7.658788540459063e-06,
1933
+ "loss": 0.7445,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.59,
1938
+ "grad_norm": 0.8292492032051086,
1939
+ "learning_rate": 7.5912730741551044e-06,
1940
+ "loss": 0.7757,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.59,
1945
+ "grad_norm": 0.9825494289398193,
1946
+ "learning_rate": 7.523873969186039e-06,
1947
+ "loss": 0.7556,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.6,
1952
+ "grad_norm": 0.7132662534713745,
1953
+ "learning_rate": 7.456594481483355e-06,
1954
+ "loss": 0.7614,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.6,
1959
+ "grad_norm": 0.9558830857276917,
1960
+ "learning_rate": 7.389437861200024e-06,
1961
+ "loss": 0.7734,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.6,
1966
+ "grad_norm": 0.8095993399620056,
1967
+ "learning_rate": 7.322407352553529e-06,
1968
+ "loss": 0.7252,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.6,
1973
+ "grad_norm": 0.663575291633606,
1974
+ "learning_rate": 7.2555061936691104e-06,
1975
+ "loss": 0.8247,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.6,
1980
+ "grad_norm": 0.7267043590545654,
1981
+ "learning_rate": 7.188737616423357e-06,
1982
+ "loss": 0.731,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.61,
1987
+ "grad_norm": 0.8647053241729736,
1988
+ "learning_rate": 7.122104846288065e-06,
1989
+ "loss": 0.7719,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.61,
1994
+ "grad_norm": 0.8748030662536621,
1995
+ "learning_rate": 7.055611102174442e-06,
1996
+ "loss": 0.7706,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.61,
2001
+ "grad_norm": 0.9459949135780334,
2002
+ "learning_rate": 6.9892595962775826e-06,
2003
+ "loss": 0.7097,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.61,
2008
+ "grad_norm": 1.1018948554992676,
2009
+ "learning_rate": 6.923053533921312e-06,
2010
+ "loss": 0.8045,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.61,
2015
+ "grad_norm": 0.9634941816329956,
2016
+ "learning_rate": 6.85699611340333e-06,
2017
+ "loss": 0.7283,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.62,
2022
+ "grad_norm": 0.7713951468467712,
2023
+ "learning_rate": 6.791090525840722e-06,
2024
+ "loss": 0.7872,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.62,
2029
+ "grad_norm": 0.95869380235672,
2030
+ "learning_rate": 6.725339955015777e-06,
2031
+ "loss": 0.757,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.62,
2036
+ "grad_norm": 1.0996911525726318,
2037
+ "learning_rate": 6.659747577222215e-06,
2038
+ "loss": 0.7636,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.62,
2043
+ "grad_norm": 0.7740480899810791,
2044
+ "learning_rate": 6.5943165611117244e-06,
2045
+ "loss": 0.7933,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.63,
2050
+ "grad_norm": 0.6407970786094666,
2051
+ "learning_rate": 6.529050067540887e-06,
2052
+ "loss": 0.7556,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.63,
2057
+ "grad_norm": 0.7106875777244568,
2058
+ "learning_rate": 6.4639512494185104e-06,
2059
+ "loss": 0.7393,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.63,
2064
+ "grad_norm": 0.6763285398483276,
2065
+ "learning_rate": 6.39902325155328e-06,
2066
+ "loss": 0.786,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.63,
2071
+ "grad_norm": 0.8059327006340027,
2072
+ "learning_rate": 6.334269210501876e-06,
2073
+ "loss": 0.7669,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.63,
2078
+ "grad_norm": 0.6124692559242249,
2079
+ "learning_rate": 6.269692254417408e-06,
2080
+ "loss": 0.7802,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.64,
2085
+ "grad_norm": 1.0250524282455444,
2086
+ "learning_rate": 6.205295502898348e-06,
2087
+ "loss": 0.7889,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.64,
2092
+ "grad_norm": 0.83560711145401,
2093
+ "learning_rate": 6.141082066837791e-06,
2094
+ "loss": 0.7176,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.64,
2099
+ "grad_norm": 1.0154820680618286,
2100
+ "learning_rate": 6.077055048273193e-06,
2101
+ "loss": 0.7941,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.64,
2106
+ "grad_norm": 0.7823308706283569,
2107
+ "learning_rate": 6.013217540236503e-06,
2108
+ "loss": 0.7533,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.64,
2113
+ "grad_norm": 0.7707633376121521,
2114
+ "learning_rate": 5.9495726266047605e-06,
2115
+ "loss": 0.7922,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.65,
2120
+ "grad_norm": 0.7845512628555298,
2121
+ "learning_rate": 5.886123381951103e-06,
2122
+ "loss": 0.7215,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.65,
2127
+ "grad_norm": 0.9354090690612793,
2128
+ "learning_rate": 5.822872871396255e-06,
2129
+ "loss": 0.767,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.65,
2134
+ "grad_norm": 0.9846552610397339,
2135
+ "learning_rate": 5.759824150460436e-06,
2136
+ "loss": 0.7866,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.65,
2141
+ "grad_norm": 0.82850182056427,
2142
+ "learning_rate": 5.696980264915777e-06,
2143
+ "loss": 0.7449,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.66,
2148
+ "grad_norm": 0.7820140719413757,
2149
+ "learning_rate": 5.63434425063917e-06,
2150
+ "loss": 0.7662,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.66,
2155
+ "grad_norm": 0.7161849737167358,
2156
+ "learning_rate": 5.571919133465605e-06,
2157
+ "loss": 0.7683,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.66,
2162
+ "grad_norm": 0.7008098363876343,
2163
+ "learning_rate": 5.50970792904203e-06,
2164
+ "loss": 0.7755,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.66,
2169
+ "grad_norm": 0.7139153480529785,
2170
+ "learning_rate": 5.447713642681612e-06,
2171
+ "loss": 0.7443,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.66,
2176
+ "grad_norm": 1.005012035369873,
2177
+ "learning_rate": 5.3859392692186256e-06,
2178
+ "loss": 0.7852,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.67,
2183
+ "grad_norm": 0.7501611113548279,
2184
+ "learning_rate": 5.324387792863719e-06,
2185
+ "loss": 0.7567,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 0.67,
2190
+ "grad_norm": 0.7607911825180054,
2191
+ "learning_rate": 5.263062187059785e-06,
2192
+ "loss": 0.7597,
2193
+ "step": 312
2194
+ },
2195
+ {
2196
+ "epoch": 0.67,
2197
+ "grad_norm": 0.992053210735321,
2198
+ "learning_rate": 5.201965414338308e-06,
2199
+ "loss": 0.7655,
2200
+ "step": 313
2201
+ },
2202
+ {
2203
+ "epoch": 0.67,
2204
+ "grad_norm": 0.6949525475502014,
2205
+ "learning_rate": 5.14110042617625e-06,
2206
+ "loss": 0.7374,
2207
+ "step": 314
2208
+ },
2209
+ {
2210
+ "epoch": 0.67,
2211
+ "grad_norm": 1.5913808345794678,
2212
+ "learning_rate": 5.080470162853473e-06,
2213
+ "loss": 0.7907,
2214
+ "step": 315
2215
+ },
2216
+ {
2217
+ "epoch": 0.68,
2218
+ "grad_norm": 0.8287177085876465,
2219
+ "learning_rate": 5.020077553310694e-06,
2220
+ "loss": 0.7055,
2221
+ "step": 316
2222
+ },
2223
+ {
2224
+ "epoch": 0.68,
2225
+ "grad_norm": 0.8051833510398865,
2226
+ "learning_rate": 4.959925515008003e-06,
2227
+ "loss": 0.7803,
2228
+ "step": 317
2229
+ },
2230
+ {
2231
+ "epoch": 0.68,
2232
+ "grad_norm": 0.7546500563621521,
2233
+ "learning_rate": 4.9000169537839126e-06,
2234
+ "loss": 0.7545,
2235
+ "step": 318
2236
+ },
2237
+ {
2238
+ "epoch": 0.68,
2239
+ "grad_norm": 0.7356598973274231,
2240
+ "learning_rate": 4.840354763714991e-06,
2241
+ "loss": 0.744,
2242
+ "step": 319
2243
+ },
2244
+ {
2245
+ "epoch": 0.69,
2246
+ "grad_norm": 0.6519478559494019,
2247
+ "learning_rate": 4.780941826976054e-06,
2248
+ "loss": 0.7621,
2249
+ "step": 320
2250
+ },
2251
+ {
2252
+ "epoch": 0.69,
2253
+ "grad_norm": 0.6492840051651001,
2254
+ "learning_rate": 4.721781013700928e-06,
2255
+ "loss": 0.7444,
2256
+ "step": 321
2257
+ },
2258
+ {
2259
+ "epoch": 0.69,
2260
+ "grad_norm": 0.9139990210533142,
2261
+ "learning_rate": 4.662875181843799e-06,
2262
+ "loss": 0.7904,
2263
+ "step": 322
2264
+ },
2265
+ {
2266
+ "epoch": 0.69,
2267
+ "grad_norm": 0.7614346742630005,
2268
+ "learning_rate": 4.604227177041156e-06,
2269
+ "loss": 0.7186,
2270
+ "step": 323
2271
+ },
2272
+ {
2273
+ "epoch": 0.69,
2274
+ "grad_norm": 0.7474337816238403,
2275
+ "learning_rate": 4.545839832474318e-06,
2276
+ "loss": 0.7475,
2277
+ "step": 324
2278
+ },
2279
+ {
2280
+ "epoch": 0.7,
2281
+ "grad_norm": 0.8080284595489502,
2282
+ "learning_rate": 4.487715968732568e-06,
2283
+ "loss": 0.7641,
2284
+ "step": 325
2285
+ },
2286
+ {
2287
+ "epoch": 0.7,
2288
+ "grad_norm": 0.8405642509460449,
2289
+ "learning_rate": 4.429858393676898e-06,
2290
+ "loss": 0.7749,
2291
+ "step": 326
2292
+ },
2293
+ {
2294
+ "epoch": 0.7,
2295
+ "grad_norm": 0.719484806060791,
2296
+ "learning_rate": 4.3722699023043634e-06,
2297
+ "loss": 0.7181,
2298
+ "step": 327
2299
+ },
2300
+ {
2301
+ "epoch": 0.7,
2302
+ "grad_norm": 0.6552863717079163,
2303
+ "learning_rate": 4.314953276613066e-06,
2304
+ "loss": 0.8089,
2305
+ "step": 328
2306
+ },
2307
+ {
2308
+ "epoch": 0.7,
2309
+ "grad_norm": 0.7018444538116455,
2310
+ "learning_rate": 4.257911285467754e-06,
2311
+ "loss": 0.734,
2312
+ "step": 329
2313
+ },
2314
+ {
2315
+ "epoch": 0.71,
2316
+ "grad_norm": 0.7911511063575745,
2317
+ "learning_rate": 4.201146684466065e-06,
2318
+ "loss": 0.7752,
2319
+ "step": 330
2320
+ },
2321
+ {
2322
+ "epoch": 0.71,
2323
+ "grad_norm": 0.8911862373352051,
2324
+ "learning_rate": 4.144662215805426e-06,
2325
+ "loss": 0.7733,
2326
+ "step": 331
2327
+ },
2328
+ {
2329
+ "epoch": 0.71,
2330
+ "grad_norm": 0.8746122121810913,
2331
+ "learning_rate": 4.088460608150537e-06,
2332
+ "loss": 0.7336,
2333
+ "step": 332
2334
+ },
2335
+ {
2336
+ "epoch": 0.71,
2337
+ "grad_norm": 0.6681094169616699,
2338
+ "learning_rate": 4.0325445765016145e-06,
2339
+ "loss": 0.7892,
2340
+ "step": 333
2341
+ },
2342
+ {
2343
+ "epoch": 0.72,
2344
+ "grad_norm": 0.8015274405479431,
2345
+ "learning_rate": 3.9769168220631745e-06,
2346
+ "loss": 0.774,
2347
+ "step": 334
2348
+ },
2349
+ {
2350
+ "epoch": 0.72,
2351
+ "grad_norm": 0.8299674391746521,
2352
+ "learning_rate": 3.921580032113602e-06,
2353
+ "loss": 0.7814,
2354
+ "step": 335
2355
+ },
2356
+ {
2357
+ "epoch": 0.72,
2358
+ "grad_norm": 0.7029886245727539,
2359
+ "learning_rate": 3.866536879875269e-06,
2360
+ "loss": 0.7556,
2361
+ "step": 336
2362
+ },
2363
+ {
2364
+ "epoch": 0.72,
2365
+ "grad_norm": 0.84246826171875,
2366
+ "learning_rate": 3.81179002438546e-06,
2367
+ "loss": 0.7678,
2368
+ "step": 337
2369
+ },
2370
+ {
2371
+ "epoch": 0.72,
2372
+ "grad_norm": 0.8028191328048706,
2373
+ "learning_rate": 3.7573421103678707e-06,
2374
+ "loss": 0.7679,
2375
+ "step": 338
2376
+ },
2377
+ {
2378
+ "epoch": 0.73,
2379
+ "grad_norm": 0.7954402565956116,
2380
+ "learning_rate": 3.7031957681048604e-06,
2381
+ "loss": 0.7265,
2382
+ "step": 339
2383
+ },
2384
+ {
2385
+ "epoch": 0.73,
2386
+ "grad_norm": 0.7864363193511963,
2387
+ "learning_rate": 3.649353613310409e-06,
2388
+ "loss": 0.7926,
2389
+ "step": 340
2390
+ },
2391
+ {
2392
+ "epoch": 0.73,
2393
+ "grad_norm": 0.6914604902267456,
2394
+ "learning_rate": 3.5958182470037127e-06,
2395
+ "loss": 0.749,
2396
+ "step": 341
2397
+ },
2398
+ {
2399
+ "epoch": 0.73,
2400
+ "grad_norm": 0.7519959807395935,
2401
+ "learning_rate": 3.5425922553835866e-06,
2402
+ "loss": 0.7788,
2403
+ "step": 342
2404
+ },
2405
+ {
2406
+ "epoch": 0.73,
2407
+ "grad_norm": 1.0774928331375122,
2408
+ "learning_rate": 3.4896782097034755e-06,
2409
+ "loss": 0.7313,
2410
+ "step": 343
2411
+ },
2412
+ {
2413
+ "epoch": 0.74,
2414
+ "grad_norm": 0.7848466634750366,
2415
+ "learning_rate": 3.4370786661472922e-06,
2416
+ "loss": 0.7901,
2417
+ "step": 344
2418
+ },
2419
+ {
2420
+ "epoch": 0.74,
2421
+ "grad_norm": 0.7957246899604797,
2422
+ "learning_rate": 3.384796165705885e-06,
2423
+ "loss": 0.7606,
2424
+ "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.74,
2428
+ "grad_norm": 0.6149446368217468,
2429
+ "learning_rate": 3.3328332340543314e-06,
2430
+ "loss": 0.7831,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.74,
2435
+ "grad_norm": 1.1103349924087524,
2436
+ "learning_rate": 3.281192381429894e-06,
2437
+ "loss": 0.7119,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.75,
2442
+ "grad_norm": 0.7909545302391052,
2443
+ "learning_rate": 3.2298761025107707e-06,
2444
+ "loss": 0.7467,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.75,
2449
+ "grad_norm": 0.7904770970344543,
2450
+ "learning_rate": 3.178886876295578e-06,
2451
+ "loss": 0.7978,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.75,
2456
+ "grad_norm": 0.7983621954917908,
2457
+ "learning_rate": 3.128227165983595e-06,
2458
+ "loss": 0.7281,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.75,
2463
+ "grad_norm": 0.6307840943336487,
2464
+ "learning_rate": 3.0778994188557722e-06,
2465
+ "loss": 0.7959,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.75,
2470
+ "grad_norm": 0.8515567779541016,
2471
+ "learning_rate": 3.027906066156503e-06,
2472
+ "loss": 0.74,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.76,
2477
+ "grad_norm": 0.6278306245803833,
2478
+ "learning_rate": 2.978249522976181e-06,
2479
+ "loss": 0.748,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.76,
2484
+ "grad_norm": 0.8014828562736511,
2485
+ "learning_rate": 2.9289321881345257e-06,
2486
+ "loss": 0.74,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.76,
2491
+ "grad_norm": 1.187994122505188,
2492
+ "learning_rate": 2.879956444064703e-06,
2493
+ "loss": 0.7533,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.76,
2498
+ "grad_norm": 0.6586583256721497,
2499
+ "learning_rate": 2.8313246566982342e-06,
2500
+ "loss": 0.7291,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.76,
2505
+ "grad_norm": 0.7355678677558899,
2506
+ "learning_rate": 2.783039175350699e-06,
2507
+ "loss": 0.7521,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.77,
2512
+ "grad_norm": 0.636843740940094,
2513
+ "learning_rate": 2.735102332608247e-06,
2514
+ "loss": 0.7392,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.77,
2519
+ "grad_norm": 0.7756280303001404,
2520
+ "learning_rate": 2.6875164442149147e-06,
2521
+ "loss": 0.7927,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.77,
2526
+ "grad_norm": 0.8138651251792908,
2527
+ "learning_rate": 2.640283808960754e-06,
2528
+ "loss": 0.778,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.77,
2533
+ "grad_norm": 0.7197765111923218,
2534
+ "learning_rate": 2.5934067085707835e-06,
2535
+ "loss": 0.7382,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.78,
2540
+ "grad_norm": 0.804681658744812,
2541
+ "learning_rate": 2.54688740759476e-06,
2542
+ "loss": 0.7456,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.78,
2547
+ "grad_norm": 0.633647620677948,
2548
+ "learning_rate": 2.500728153297788e-06,
2549
+ "loss": 0.7492,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.78,
2554
+ "grad_norm": 0.8081827759742737,
2555
+ "learning_rate": 2.454931175551746e-06,
2556
+ "loss": 0.7657,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.78,
2561
+ "grad_norm": 0.7836536169052124,
2562
+ "learning_rate": 2.409498686727587e-06,
2563
+ "loss": 0.7666,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.78,
2568
+ "grad_norm": 2.165264844894409,
2569
+ "learning_rate": 2.364432881588431e-06,
2570
+ "loss": 0.7266,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.79,
2575
+ "grad_norm": 1.187296986579895,
2576
+ "learning_rate": 2.3197359371835802e-06,
2577
+ "loss": 0.8071,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.79,
2582
+ "grad_norm": 0.6810380220413208,
2583
+ "learning_rate": 2.2754100127433033e-06,
2584
+ "loss": 0.7322,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.79,
2589
+ "grad_norm": 0.6353223323822021,
2590
+ "learning_rate": 2.2314572495745746e-06,
2591
+ "loss": 0.7805,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.79,
2596
+ "grad_norm": 0.74691241979599,
2597
+ "learning_rate": 2.187879770957585e-06,
2598
+ "loss": 0.7186,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.79,
2603
+ "grad_norm": 0.7790375351905823,
2604
+ "learning_rate": 2.144679682043217e-06,
2605
+ "loss": 0.7743,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.8,
2610
+ "grad_norm": 0.9003098011016846,
2611
+ "learning_rate": 2.1018590697513007e-06,
2612
+ "loss": 0.7577,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.8,
2617
+ "grad_norm": 0.7259723544120789,
2618
+ "learning_rate": 2.0594200026698363e-06,
2619
+ "loss": 0.7921,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.8,
2624
+ "grad_norm": 0.7609323859214783,
2625
+ "learning_rate": 2.017364530955055e-06,
2626
+ "loss": 0.7276,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.8,
2631
+ "grad_norm": 1.556248426437378,
2632
+ "learning_rate": 1.9756946862323534e-06,
2633
+ "loss": 0.782,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.81,
2638
+ "grad_norm": 0.7257483005523682,
2639
+ "learning_rate": 1.934412481498198e-06,
2640
+ "loss": 0.7655,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.81,
2645
+ "grad_norm": 0.7583906054496765,
2646
+ "learning_rate": 1.8935199110228274e-06,
2647
+ "loss": 0.7412,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.81,
2652
+ "grad_norm": 0.8019598722457886,
2653
+ "learning_rate": 1.8530189502539608e-06,
2654
+ "loss": 0.7554,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.81,
2659
+ "grad_norm": 3.036848783493042,
2660
+ "learning_rate": 1.8129115557213262e-06,
2661
+ "loss": 0.749,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.81,
2666
+ "grad_norm": 0.9667990803718567,
2667
+ "learning_rate": 1.77319966494218e-06,
2668
+ "loss": 0.8014,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.82,
2673
+ "grad_norm": 0.6818022131919861,
2674
+ "learning_rate": 1.7338851963276827e-06,
2675
+ "loss": 0.7119,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.82,
2680
+ "grad_norm": 0.7843496799468994,
2681
+ "learning_rate": 1.6949700490902344e-06,
2682
+ "loss": 0.7811,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.82,
2687
+ "grad_norm": 0.8651720881462097,
2688
+ "learning_rate": 1.6564561031517278e-06,
2689
+ "loss": 0.7667,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.82,
2694
+ "grad_norm": 1.0504810810089111,
2695
+ "learning_rate": 1.6183452190527317e-06,
2696
+ "loss": 0.7769,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.82,
2701
+ "grad_norm": 0.7682644724845886,
2702
+ "learning_rate": 1.5806392378626079e-06,
2703
+ "loss": 0.7382,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.83,
2708
+ "grad_norm": 0.8382360935211182,
2709
+ "learning_rate": 1.543339981090578e-06,
2710
+ "loss": 0.7557,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.83,
2715
+ "grad_norm": 0.6545696258544922,
2716
+ "learning_rate": 1.5064492505977234e-06,
2717
+ "loss": 0.7465,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.83,
2722
+ "grad_norm": 0.8746246099472046,
2723
+ "learning_rate": 1.4699688285099489e-06,
2724
+ "loss": 0.7518,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.83,
2729
+ "grad_norm": 0.7189457416534424,
2730
+ "learning_rate": 1.433900477131882e-06,
2731
+ "loss": 0.7783,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.84,
2736
+ "grad_norm": 0.6742172241210938,
2737
+ "learning_rate": 1.3982459388617453e-06,
2738
+ "loss": 0.7703,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.84,
2743
+ "grad_norm": 0.6717891693115234,
2744
+ "learning_rate": 1.363006936107183e-06,
2745
+ "loss": 0.7566,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.84,
2750
+ "grad_norm": 0.6821526885032654,
2751
+ "learning_rate": 1.3281851712020522e-06,
2752
+ "loss": 0.712,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.84,
2757
+ "grad_norm": 0.6083774566650391,
2758
+ "learning_rate": 1.29378232632419e-06,
2759
+ "loss": 0.7865,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.84,
2764
+ "grad_norm": 1.004950761795044,
2765
+ "learning_rate": 1.259800063414146e-06,
2766
+ "loss": 0.7437,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.85,
2771
+ "grad_norm": 0.7969427704811096,
2772
+ "learning_rate": 1.2262400240949023e-06,
2773
+ "loss": 0.6931,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.85,
2778
+ "grad_norm": 0.6904407739639282,
2779
+ "learning_rate": 1.1931038295925646e-06,
2780
+ "loss": 0.7848,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.85,
2785
+ "grad_norm": 0.7331759333610535,
2786
+ "learning_rate": 1.1603930806580443e-06,
2787
+ "loss": 0.7295,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.85,
2792
+ "grad_norm": 0.6611264944076538,
2793
+ "learning_rate": 1.128109357489734e-06,
2794
+ "loss": 0.8059,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.85,
2799
+ "grad_norm": 0.8612853288650513,
2800
+ "learning_rate": 1.0962542196571636e-06,
2801
+ "loss": 0.7421,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.86,
2806
+ "grad_norm": 0.6944461464881897,
2807
+ "learning_rate": 1.064829206025665e-06,
2808
+ "loss": 0.7537,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.86,
2813
+ "grad_norm": 0.7044425010681152,
2814
+ "learning_rate": 1.0338358346820355e-06,
2815
+ "loss": 0.7097,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.86,
2820
+ "grad_norm": 0.6490280032157898,
2821
+ "learning_rate": 1.003275602861188e-06,
2822
+ "loss": 0.7768,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.86,
2827
+ "grad_norm": 0.6698139309883118,
2828
+ "learning_rate": 9.731499868738448e-07,
2829
+ "loss": 0.7691,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.87,
2834
+ "grad_norm": 0.7602378129959106,
2835
+ "learning_rate": 9.434604420351912e-07,
2836
+ "loss": 0.7538,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.87,
2841
+ "grad_norm": 0.7384971976280212,
2842
+ "learning_rate": 9.142084025945986e-07,
2843
+ "loss": 0.7224,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.87,
2848
+ "grad_norm": 0.6048542261123657,
2849
+ "learning_rate": 8.853952816663214e-07,
2850
+ "loss": 0.8024,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.87,
2855
+ "grad_norm": 0.6573376655578613,
2856
+ "learning_rate": 8.570224711612385e-07,
2857
+ "loss": 0.7336,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.87,
2862
+ "grad_norm": 0.6137224435806274,
2863
+ "learning_rate": 8.290913417196178e-07,
2864
+ "loss": 0.79,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.88,
2869
+ "grad_norm": 0.5987165570259094,
2870
+ "learning_rate": 8.016032426448816e-07,
2871
+ "loss": 0.723,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.88,
2876
+ "grad_norm": 0.7513317465782166,
2877
+ "learning_rate": 7.745595018384577e-07,
2878
+ "loss": 0.7547,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.88,
2883
+ "grad_norm": 0.6956148147583008,
2884
+ "learning_rate": 7.479614257355972e-07,
2885
+ "loss": 0.7716,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.88,
2890
+ "grad_norm": 0.7517545819282532,
2891
+ "learning_rate": 7.218102992422882e-07,
2892
+ "loss": 0.7415,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.88,
2897
+ "grad_norm": 0.7091813087463379,
2898
+ "learning_rate": 6.961073856731648e-07,
2899
+ "loss": 0.7552,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.89,
2904
+ "grad_norm": 0.6687448620796204,
2905
+ "learning_rate": 6.708539266905e-07,
2906
+ "loss": 0.7959,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.89,
2911
+ "grad_norm": 0.7798054218292236,
2912
+ "learning_rate": 6.460511422441984e-07,
2913
+ "loss": 0.6974,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.89,
2918
+ "grad_norm": 0.7497337460517883,
2919
+ "learning_rate": 6.21700230512885e-07,
2920
+ "loss": 0.7888,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.89,
2925
+ "grad_norm": 0.8241896629333496,
2926
+ "learning_rate": 5.978023678460099e-07,
2927
+ "loss": 0.7506,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.9,
2932
+ "grad_norm": 0.614700198173523,
2933
+ "learning_rate": 5.743587087070235e-07,
2934
+ "loss": 0.7527,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.9,
2939
+ "grad_norm": 0.747754693031311,
2940
+ "learning_rate": 5.513703856176112e-07,
2941
+ "loss": 0.7463,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.9,
2946
+ "grad_norm": 0.7050479650497437,
2947
+ "learning_rate": 5.288385091029724e-07,
2948
+ "loss": 0.7713,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.9,
2953
+ "grad_norm": 1.1944342851638794,
2954
+ "learning_rate": 5.067641676381918e-07,
2955
+ "loss": 0.7593,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.9,
2960
+ "grad_norm": 0.6154138445854187,
2961
+ "learning_rate": 4.851484275956331e-07,
2962
+ "loss": 0.7298,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.91,
2967
+ "grad_norm": 2.1620090007781982,
2968
+ "learning_rate": 4.6399233319344703e-07,
2969
+ "loss": 0.7723,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.91,
2974
+ "grad_norm": 0.6451908349990845,
2975
+ "learning_rate": 4.432969064451109e-07,
2976
+ "loss": 0.7605,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.91,
2981
+ "grad_norm": 1.1842758655548096,
2982
+ "learning_rate": 4.230631471100655e-07,
2983
+ "loss": 0.7478,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.91,
2988
+ "grad_norm": 0.7377268075942993,
2989
+ "learning_rate": 4.0329203264541594e-07,
2990
+ "loss": 0.7502,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.91,
2995
+ "grad_norm": 0.7105292677879333,
2996
+ "learning_rate": 3.8398451815870984e-07,
2997
+ "loss": 0.7391,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.92,
3002
+ "grad_norm": 0.7181400656700134,
3003
+ "learning_rate": 3.6514153636180384e-07,
3004
+ "loss": 0.7818,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.92,
3009
+ "grad_norm": 0.6796531081199646,
3010
+ "learning_rate": 3.467639975257997e-07,
3011
+ "loss": 0.7778,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.92,
3016
+ "grad_norm": 0.6979401111602783,
3017
+ "learning_rate": 3.2885278943707524e-07,
3018
+ "loss": 0.7531,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.92,
3023
+ "grad_norm": 0.9989197850227356,
3024
+ "learning_rate": 3.114087773543939e-07,
3025
+ "loss": 0.7049,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.93,
3030
+ "grad_norm": 0.7881868481636047,
3031
+ "learning_rate": 2.9443280396710847e-07,
3032
+ "loss": 0.7798,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.93,
3037
+ "grad_norm": 0.7045859694480896,
3038
+ "learning_rate": 2.7792568935444796e-07,
3039
+ "loss": 0.7405,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.93,
3044
+ "grad_norm": 0.7407889366149902,
3045
+ "learning_rate": 2.618882309459081e-07,
3046
+ "loss": 0.6954,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.93,
3051
+ "grad_norm": 0.6761502623558044,
3052
+ "learning_rate": 2.4632120348272e-07,
3053
+ "loss": 0.7933,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.93,
3058
+ "grad_norm": 0.9298683404922485,
3059
+ "learning_rate": 2.312253589804314e-07,
3060
+ "loss": 0.7192,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.94,
3065
+ "grad_norm": 0.6314147710800171,
3066
+ "learning_rate": 2.166014266925731e-07,
3067
+ "loss": 0.8084,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.94,
3072
+ "grad_norm": 0.7935160994529724,
3073
+ "learning_rate": 2.0245011307543416e-07,
3074
+ "loss": 0.748,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.94,
3079
+ "grad_norm": 0.9580221176147461,
3080
+ "learning_rate": 1.88772101753929e-07,
3081
+ "loss": 0.7057,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.94,
3086
+ "grad_norm": 0.766875147819519,
3087
+ "learning_rate": 1.7556805348858063e-07,
3088
+ "loss": 0.7403,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 0.94,
3093
+ "grad_norm": 0.66485196352005,
3094
+ "learning_rate": 1.6283860614358936e-07,
3095
+ "loss": 0.8058,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 0.95,
3100
+ "grad_norm": 0.7386608719825745,
3101
+ "learning_rate": 1.5058437465602982e-07,
3102
+ "loss": 0.698,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 0.95,
3107
+ "grad_norm": 0.8943004012107849,
3108
+ "learning_rate": 1.388059510061379e-07,
3109
+ "loss": 0.7899,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 0.95,
3114
+ "grad_norm": 0.6558826565742493,
3115
+ "learning_rate": 1.2750390418871605e-07,
3116
+ "loss": 0.7423,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 0.95,
3121
+ "grad_norm": 0.6673349738121033,
3122
+ "learning_rate": 1.1667878018564171e-07,
3123
+ "loss": 0.8005,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 0.96,
3128
+ "grad_norm": 1.509501576423645,
3129
+ "learning_rate": 1.063311019395008e-07,
3130
+ "loss": 0.7367,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 0.96,
3135
+ "grad_norm": 0.948124349117279,
3136
+ "learning_rate": 9.64613693283123e-08,
3137
+ "loss": 0.7318,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 0.96,
3142
+ "grad_norm": 0.6254904866218567,
3143
+ "learning_rate": 8.707005914139422e-08,
3144
+ "loss": 0.7596,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 0.96,
3149
+ "grad_norm": 0.6569192409515381,
3150
+ "learning_rate": 7.815762505632096e-08,
3151
+ "loss": 0.7495,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 0.96,
3156
+ "grad_norm": 0.7474935054779053,
3157
+ "learning_rate": 6.972449761700862e-08,
3158
+ "loss": 0.7723,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 0.97,
3163
+ "grad_norm": 0.8758479356765747,
3164
+ "learning_rate": 6.177108421292266e-08,
3165
+ "loss": 0.7392,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 0.97,
3170
+ "grad_norm": 1.0904728174209595,
3171
+ "learning_rate": 5.429776905938489e-08,
3172
+ "loss": 0.7561,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 0.97,
3177
+ "grad_norm": 0.5738272666931152,
3178
+ "learning_rate": 4.7304913179025967e-08,
3179
+ "loss": 0.7998,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 0.97,
3184
+ "grad_norm": 0.7406892776489258,
3185
+ "learning_rate": 4.0792854384338334e-08,
3186
+ "loss": 0.7018,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 0.97,
3191
+ "grad_norm": 0.7328673601150513,
3192
+ "learning_rate": 3.4761907261356976e-08,
3193
+ "loss": 0.7957,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 0.98,
3198
+ "grad_norm": 0.6121895909309387,
3199
+ "learning_rate": 2.9212363154463853e-08,
3200
+ "loss": 0.7514,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 0.98,
3205
+ "grad_norm": 0.8818193674087524,
3206
+ "learning_rate": 2.4144490152313572e-08,
3207
+ "loss": 0.7642,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 0.98,
3212
+ "grad_norm": 1.2568868398666382,
3213
+ "learning_rate": 1.9558533074882646e-08,
3214
+ "loss": 0.7433,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 0.98,
3219
+ "grad_norm": 0.8221530914306641,
3220
+ "learning_rate": 1.545471346164007e-08,
3221
+ "loss": 0.7665,
3222
+ "step": 459
3223
+ },
3224
+ {
3225
+ "epoch": 0.99,
3226
+ "grad_norm": 0.7928904294967651,
3227
+ "learning_rate": 1.1833229560848092e-08,
3228
+ "loss": 0.7617,
3229
+ "step": 460
3230
+ },
3231
+ {
3232
+ "epoch": 0.99,
3233
+ "grad_norm": 0.7835130095481873,
3234
+ "learning_rate": 8.694256319987659e-09,
3235
+ "loss": 0.7412,
3236
+ "step": 461
3237
+ },
3238
+ {
3239
+ "epoch": 0.99,
3240
+ "grad_norm": 0.7215912938117981,
3241
+ "learning_rate": 6.037945377297405e-09,
3242
+ "loss": 0.7756,
3243
+ "step": 462
3244
+ },
3245
+ {
3246
+ "epoch": 0.99,
3247
+ "grad_norm": 0.8312851786613464,
3248
+ "learning_rate": 3.8644250544594975e-09,
3249
+ "loss": 0.7332,
3250
+ "step": 463
3251
+ },
3252
+ {
3253
+ "epoch": 0.99,
3254
+ "grad_norm": 0.6416303515434265,
3255
+ "learning_rate": 2.173800350394606e-09,
3256
+ "loss": 0.797,
3257
+ "step": 464
3258
+ },
3259
+ {
3260
+ "epoch": 1.0,
3261
+ "grad_norm": 1.003443956375122,
3262
+ "learning_rate": 9.661529361892907e-10,
3263
+ "loss": 0.7039,
3264
+ "step": 465
3265
+ },
3266
+ {
3267
+ "epoch": 1.0,
3268
+ "grad_norm": 0.8252460956573486,
3269
+ "learning_rate": 2.415411511536014e-10,
3270
+ "loss": 0.7942,
3271
+ "step": 466
3272
+ },
3273
+ {
3274
+ "epoch": 1.0,
3275
+ "grad_norm": 0.7430176734924316,
3276
+ "learning_rate": 0.0,
3277
+ "loss": 0.7726,
3278
+ "step": 467
3279
+ },
3280
+ {
3281
+ "epoch": 1.0,
3282
+ "step": 467,
3283
+ "total_flos": 0.0,
3284
+ "train_loss": 0.435926725573407,
3285
+ "train_runtime": 10640.2467,
3286
+ "train_samples_per_second": 102.107,
3287
+ "train_steps_per_second": 0.044
3288
+ }
3289
+ ],
3290
+ "logging_steps": 1.0,
3291
+ "max_steps": 467,
3292
+ "num_input_tokens_seen": 0,
3293
+ "num_train_epochs": 1,
3294
+ "save_steps": 100,
3295
+ "stateful_callbacks": {
3296
+ "TrainerControl": {
3297
+ "args": {
3298
+ "should_epoch_stop": false,
3299
+ "should_evaluate": false,
3300
+ "should_log": false,
3301
+ "should_save": false,
3302
+ "should_training_stop": false
3303
+ },
3304
+ "attributes": {}
3305
+ }
3306
+ },
3307
+ "total_flos": 0.0,
3308
+ "train_batch_size": 2,
3309
+ "trial_name": null,
3310
+ "trial_params": null
3311
+ }
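The block above closes out `trainer_state.json` for the SFT run: 467 optimizer steps over one epoch, per-device train batch size 2, loss logged every step. A minimal sketch for inspecting the curve locally, assuming the standard HF Trainer layout (`log_history` list) and that the file sits next to the script:

```python
# Minimal sketch: summarize the per-step loss from trainer_state.json.
# The file path is an assumption; point it at your local copy.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

steps = [e for e in state["log_history"] if "loss" in e]
print(f"{len(steps)} logged steps")
print(f"first loss: {steps[0]['loss']:.4f}, last loss: {steps[-1]['loss']:.4f}")
print(f"runtime: {state['log_history'][-1].get('train_runtime', 'n/a')} s")
```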
utils.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
17
+ import os
18
+ import os.path as osp
19
+
20
+ from huggingface_hub import repo_exists, snapshot_download
21
+ from huggingface_hub.utils import HFValidationError, validate_repo_id
22
+ from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
23
+
24
+ from .configuration_vila import VILAConfig
25
+ from .constants import MEDIA_TOKENS
26
+ from .tokenizer_utils import infer_stop_tokens
27
+
28
+
29
+ def load_tokenizer_then_handle_media_tokens_and_chat_template(
30
+ model_name_or_path, config: VILAConfig, model_max_length=None
31
+ ):
32
+ # TODO(ligeng): a lot of copy-paste code, refactor to make a single function
33
+ tokenizer = AutoTokenizer.from_pretrained(
34
+ osp.join(model_name_or_path, "llm"), padding_side="right", use_fast=True, legacy=False
35
+ )
36
+ if model_max_length is not None:
37
+ tokenizer.model_max_length = model_max_length
38
+
39
+ # Load chat template if specified.
40
+ if getattr(config, "chat_template", None) is not None:
41
+ print(f"Using chat template: {config.chat_template}")
42
+ fpath = os.path.join(os.path.dirname(__file__), "chat_templates", f"{config.chat_template}.jinja")
43
+ if not os.path.exists(fpath):
44
+ fpath = os.path.join(model_name_or_path, f"{config.chat_template}.jinja")
45
+ with open(fpath) as fd:
46
+ chat_template = fd.read()
47
+ tokenizer.chat_template = chat_template.replace(" ", "").replace("\n", "")
48
+
49
+ # Set stop tokens for the tokenizer
50
+ tokenizer.stop_tokens = infer_stop_tokens(tokenizer)
51
+ tokenizer.stop_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.stop_tokens)
52
+
53
+ # Add media tokens to the tokenizer
54
+ tokenizer.media_tokens = MEDIA_TOKENS
55
+ tokenizer.media_token_ids = {}
56
+ for name, token in MEDIA_TOKENS.items():
57
+ tokenizer.add_tokens([token], special_tokens=True)
58
+ tokenizer.media_token_ids[name] = tokenizer.convert_tokens_to_ids(token)
59
+
60
+ return tokenizer
61
+
62
+
63
+ def get_model_config(config):
64
+ default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
65
+
66
+ if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
67
+ root_path = config._name_or_path
68
+ else:
69
+ root_path = config.resume_path
70
+
71
+ # download from huggingface
72
+ if root_path is not None and not osp.exists(root_path):
73
+ try:
74
+ valid_hf_repo = repo_exists(root_path)
75
+ except HFValidationError as e:
76
+ valid_hf_repo = False
77
+ if valid_hf_repo:
78
+ root_path = snapshot_download(root_path)
79
+
80
+ return_list = []
81
+ for key in default_keys:
82
+ cfg = getattr(config, key, None)
83
+ if isinstance(cfg, dict):
84
+ try:
85
+ return_list.append(os.path.join(root_path, key[:-4]))
86
+ except:
87
+ raise ValueError(f"Cannot find resume path in config for {key}!")
88
+ elif isinstance(cfg, PretrainedConfig):
89
+ return_list.append(os.path.join(root_path, key[:-4]))
90
+ elif isinstance(cfg, str):
91
+ return_list.append(cfg)
92
+
93
+ return return_list
94
+
95
+
96
+ def get_model_config_fp8(config):
97
+ default_keys = ["llm_cfg", "vision_tower_cfg", "mm_projector_cfg"]
98
+
99
+ if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
100
+ root_path = config._name_or_path
101
+ else:
102
+ root_path = config.resume_path
103
+
104
+ # download from huggingface
105
+ if root_path is not None and not osp.exists(root_path):
106
+ try:
107
+ valid_hf_repo = repo_exists(root_path)
108
+ except HFValidationError as e:
109
+ valid_hf_repo = False
110
+ if valid_hf_repo:
111
+ root_path = snapshot_download(root_path)
112
+
113
+ return_list = []
114
+ for key in default_keys:
115
+ cfg = getattr(config, key, None)
116
+ if isinstance(cfg, dict):
117
+ try:
118
+ return_list.append(os.path.join(root_path, key[:-4]))
119
+ except:
120
+ raise ValueError(f"Cannot find resume path in config for {key}!")
121
+ elif isinstance(cfg, PretrainedConfig):
122
+ return_list.append(os.path.join(root_path, key[:-4]))
123
+ elif isinstance(cfg, str):
124
+ return_list.append(cfg)
125
+
126
+ # fp8_llm
127
+ key = "fp8_llm_cfg"
128
+ directory_path = os.path.join(root_path, key[:-4])
129
+ assert os.path.isdir(directory_path) and os.listdir(
130
+ directory_path
131
+ ), "You need to first convert the model weights to FP8 explicitly."
132
+ return_list.append(directory_path)
133
+
134
+ return return_list
135
+
136
+
178
+ def is_mm_model(model_path):
179
+ """
180
+ Check if the model at the given path is a visual language model.
181
+
182
+ Args:
183
+ model_path (str): The path to the model.
184
+
185
+ Returns:
186
+ bool: True if the model is an MM model, False otherwise.
187
+ """
188
+ config = AutoConfig.from_pretrained(model_path)
189
+ architectures = config.architectures
190
+ for architecture in architectures:
191
+ if "llava" in architecture.lower():
192
+ return True
193
+ return False
194
+
195
+
196
+ def auto_upgrade(config):
197
+ cfg = AutoConfig.from_pretrained(config)
198
+ if "llava" in config and "llava" not in cfg.model_type:
199
+ assert cfg.model_type == "llama"
200
+ print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
201
+ print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
202
+ confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
203
+ if confirm.lower() in ["y", "yes"]:
204
+ print("Upgrading checkpoint...")
205
+ assert len(cfg.architectures) == 1
206
+ setattr(cfg.__class__, "model_type", "llava")
207
+ cfg.architectures[0] = "LlavaLlamaForCausalLM"
208
+ cfg.save_pretrained(config)
209
+ print("Checkpoint upgraded.")
210
+ else:
211
+ print("Checkpoint upgrade aborted.")
212
+ exit(1)
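`utils.py` above carries the checkpoint-resolution helpers used by the remote modeling code; `get_model_config` maps `llm_cfg`, `vision_tower_cfg`, and `mm_projector_cfg` to the `llm/`, `vision_tower/`, and `mm_projector/` sub-folders of the checkpoint. Because the module uses package-relative imports, the sketch below only mirrors that path resolution rather than importing it; the repo id comes from the model card, and `allow_patterns` is used so only the small JSON files are fetched:

```python
# Sketch mirroring get_model_config(): resolve the sub-module directories
# of the checkpoint. Downloads only *.json files to keep it lightweight.
import os.path as osp
from huggingface_hub import snapshot_download

root_path = snapshot_download(
    "turing-motors/Heron-NVILA-Lite-15B", allow_patterns=["*.json"]
)
for key in ("llm_cfg", "vision_tower_cfg", "mm_projector_cfg"):
    sub_dir = osp.join(root_path, key[:-4])  # "llm", "vision_tower", "mm_projector"
    print(f"{key} -> {sub_dir} (exists: {osp.isdir(sub_dir)})")
```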
vision_tower/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "_name_or_path": "runs/train/NVILA-Lite_14b_siglip_aws_env2_obelics_ja/sft_14b_GPT4_v6/model/vision_tower",
3
+ "architectures": [
4
+ "SiglipVisionModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_size": 1152,
9
+ "image_size": 448,
10
+ "intermediate_size": 4304,
11
+ "layer_norm_eps": 1e-06,
12
+ "model_type": "siglip_vision_model",
13
+ "num_attention_heads": 16,
14
+ "num_channels": 3,
15
+ "num_hidden_layers": 27,
16
+ "num_image_tokens": 256,
17
+ "patch_size": 14,
18
+ "projection_dim": 2048,
19
+ "projector_hidden_act": "gelu_fast",
20
+ "torch_dtype": "bfloat16",
21
+ "transformers_version": "4.45.0",
22
+ "vision_use_head": false
23
+ }
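`vision_tower/config.json` describes the SigLIP-So400M encoder: 448×448 input, patch size 14 (a 32×32 patch grid), hidden size 1152, 27 layers, no pooling head. A minimal sketch that loads just this sub-config; the `subfolder` argument assumes the directory layout added in this commit:

```python
# Minimal sketch: load only the vision tower config from the repo.
from transformers import SiglipVisionConfig

cfg = SiglipVisionConfig.from_pretrained(
    "turing-motors/Heron-NVILA-Lite-15B", subfolder="vision_tower"
)
print(cfg.image_size, cfg.patch_size, cfg.hidden_size)  # 448 14 1152
print((cfg.image_size // cfg.patch_size) ** 2)          # 1024 patches per image
```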
vision_tower/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b252dab753e022135ac0110affc9dfa0cab40680abc935dcaa3f09b449ff1323
3
+ size 826707904
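`vision_tower/model.safetensors` is stored as a Git LFS pointer; the ~827 MB weight file is content-addressed by the SHA-256 above. A quick sketch for verifying a downloaded copy against that oid (the local path is an assumption):

```python
# Sketch: verify a local vision_tower/model.safetensors against the LFS oid.
import hashlib

EXPECTED = "b252dab753e022135ac0110affc9dfa0cab40680abc935dcaa3f09b449ff1323"
h = hashlib.sha256()
with open("vision_tower/model.safetensors", "rb") as f:  # path is an assumption
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print("hash matches:", h.hexdigest() == EXPECTED)
```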
vision_tower/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "SiglipImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "processor_class": "SiglipProcessor",
18
+ "resample": 3,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 448,
22
+ "width": 448
23
+ }
24
+ }
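`vision_tower/preprocessor_config.json` configures the SigLIP image processor: resize to 448×448 with bicubic resampling (`resample: 3`), rescale by 1/255, then normalize with mean/std 0.5 per channel. A minimal sketch running it standalone on a placeholder image; as above, the `subfolder` argument assumes this repo layout:

```python
# Minimal sketch: apply the SigLIP preprocessing defined above.
from PIL import Image
from transformers import SiglipImageProcessor

processor = SiglipImageProcessor.from_pretrained(
    "turing-motors/Heron-NVILA-Lite-15B", subfolder="vision_tower"
)
image = Image.new("RGB", (640, 480), color=(128, 128, 128))  # placeholder input
pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # expected: torch.Size([1, 3, 448, 448])
```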