---
library_name: transformers
license: mit
---
<a href='https://arxiv.org/abs/'><img src='https://img.shields.io/badge/arXiv-paper-red'></a> <a href='https://ruili33.github.io/tpo_website.github.io/'><img src='https://img.shields.io/badge/project-TPO-blue'></a> <a href='https://huggingface.co/collections/ruili0/temporal-preference-optimization-67874b451f65db189fa35e10'><img src='https://img.shields.io/badge/huggingface-datasets-green'></a>
<a href='https://huggingface.co/collections/ruili0/temporal-preference-optimization-67874b451f65db189fa35e10'><img src='https://img.shields.io/badge/model-checkpoints-yellow'></a>
<a href='https://github.com/ruili33/TPO'><img src='https://img.shields.io/badge/github-repository-purple'></a>

<img src="cvpr_figure_TPO.png">

# LLaVA-Video-7B-Qwen2-TPO
LLaVA-Video-7B-Qwen2-TPO, introduced in the paper [Temporal Preference Optimization for Long-form Video Understanding](https://arxiv.org/abs), is built on LLaVA-Video-7B-Qwen2 and optimized with temporal preference. It achieves state-of-the-art performance across a variety of video understanding benchmarks and is the strongest 7B model on the Video-MME benchmark.
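
For a rough sense of what temporal preference optimization does: the base model is fine-tuned on pairs of responses, preferring the one that is better grounded in the video's temporal content. A plausible shape for such an objective is the standard direct-preference-optimization loss below (a sketch for orientation only, not the paper's exact formulation; see the paper for how the preference pairs are constructed):

$$
\mathcal{L}(\theta) = -\,\mathbb{E}_{(x,\,y_w,\,y_l)}\!\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} \;-\; \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right]
$$

Here $x$ is the video-plus-question input, $y_w$ and $y_l$ are the temporally preferred and dispreferred responses, $\pi_{\mathrm{ref}}$ is the frozen base model (LLaVA-Video-7B-Qwen2), and $\beta$ controls the strength of the preference margin.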
## Evaluation Results

| **Model**                    | **Size** | **LongVideoBench** | **MLVU** | **VideoMME (Average, w/o subs / w/ subs)** |
|------------------------------|----------|--------------------|----------|--------------------------------------------|
| **NVILA [1]**                | 7B       | 57.7               | 70.1     | 64.2/70.0                                  |
| **LLaVA-Video-7B [2]**       | 7B       | 58.2               | 70.8     | 63.3/69.7                                  |
| **LLaVA-Video-7B-Qwen2-TPO** | 7B       | **60.1**           | -        | **65.6/71.5**                              |

For LLaVA-Video-7B-Qwen2-TPO, the per-split VideoMME scores (short/medium/long, w/o subs / w/ subs) are 76.8/78.7, 64.6/69.4, and 55.4/66.4, which average to the 65.6/71.5 reported above.
## Get Started

Use the code below to get started with the model. For more information, please refer to our [GitHub repository](https://github.com/ruili33/TPO).
```python
# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
from PIL import Image
import requests
import copy
import torch
import sys
import warnings
from decord import VideoReader, cpu
import numpy as np

warnings.filterwarnings("ignore")


def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    """Decode a video and sample frames at roughly `fps` frames per second,
    falling back to uniform sampling of exactly `max_frames_num` frames."""
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    video_time = total_frame_num / vr.get_avg_fps()
    # Step between sampled frames, measured in source frames.
    fps = round(vr.get_avg_fps() / fps)
    frame_idx = [i for i in range(0, len(vr), fps)]
    frame_time = [i / fps for i in frame_idx]
    if len(frame_idx) > max_frames_num or force_sample:
        # Too many frames: sample max_frames_num frames uniformly instead.
        sample_fps = max_frames_num
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time


pretrained = "ruili0/LLaVA-Video-7B-TPO"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  # Add any other args you want to pass as llava_model_args
model.eval()

video_path = "local_demo/assets/dc_demo.mp4"
max_frames_num = 64
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
video = [video]

conv_template = "qwen_1_5"  # Make sure you use the correct chat template for different models
time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\nPlease describe this video in detail."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
cont = model.generate(
    input_ids,
    images=video,
    modalities=["video"],
    do_sample=False,
    temperature=0,
    max_new_tokens=4096,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
print(text_outputs)
```
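
To ask a different question about the same clip, only the prompt construction changes. A minimal sketch reusing the variables from the snippet above (the question text is illustrative):

```python
# Reuses video, time_instruction, conv_template, tokenizer, model, and device
# from the snippet above.
question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\nWhat happens in the final seconds of the video?"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
output = model.generate(input_ids, images=video, modalities=["video"], do_sample=False, temperature=0, max_new_tokens=1024)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip())
```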
## License

This project utilizes certain datasets and checkpoints that are subject to their respective original licenses.

[More Information Needed]
**References:**

[1]. Liu, Z., Zhu, L., Shi, B., Zhang, Z., Lou, Y., Yang, S., ... & Lu, Y. (2024). NVILA: Efficient Frontier Visual Language Models. arXiv preprint arXiv:2412.04468.

[2]. Zhang, Y., Wu, J., Li, W., Li, B., Ma, Z., Liu, Z., & Li, C. (2024). Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713.