This model card is currently under editing.

Installation

git clone https://github.com/nkkbr/ViCA.git
cd ViCA

conda create -n vica2 python=3.10 -y
conda activate vica2

# Install dependencies (with CUDA 12.1 support)
pip install --extra-index-url https://download.pytorch.org/whl/cu121 -e .

# FlashAttention is required and may need to be installed separately
pip install flash-attn==2.5.7
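
Before moving on to inference, it is worth verifying that the CUDA build of PyTorch and FlashAttention import cleanly. A minimal sanity check (the exact version strings on your machine may differ):

# Quick environment check: both imports should succeed on a CUDA machine.
import torch
import flash_attn

print(torch.__version__)          # should report a +cu121 build
print(torch.cuda.is_available())  # should be True
print(flash_attn.__version__)     # e.g. 2.5.7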

Inference

Here is a runnable example using ViCA2-7B on a VSI-Bench question.

Note: ViCA and ViCA2 use different model architectures. Please make sure to use the corresponding code for inference.

# This inference script is adapted from:
# https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2

from vica2.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
from llava.conversation import conv_templates, SeparatorStyle
from PIL import Image
import requests
import copy
import torch
import sys
import warnings
from decord import VideoReader, cpu
import numpy as np

warnings.filterwarnings("ignore")
def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    # Sample frames at roughly `fps` frames per second; if that would yield more
    # than `max_frames_num` frames (or force_sample is set), fall back to
    # sampling `max_frames_num` frames uniformly across the clip.
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    video_time = total_frame_num / vr.get_avg_fps()
    fps = round(vr.get_avg_fps() / fps)
    frame_idx = [i for i in range(0, len(vr), fps)]
    frame_time = [i / fps for i in frame_idx]
    if len(frame_idx) > max_frames_num or force_sample:
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
    sampled_frames = vr.get_batch(frame_idx).asnumpy()
    return sampled_frames, frame_time, video_time

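# Load the ViCA2 checkpoint. Note that load_pretrained_model returns two image
# processors: `image_processor` for the main vision tower and
# `image_processor_for_sam` for the additional SAM branch, which is passed to
# generate() below as `images_for_sam`.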
pretrained = "nkkbr/ViCA2-stage2-onevision-ft"
model_name = "vica_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, image_processor_for_sam, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)  
model.eval()


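# Fetch the VSI-Bench evaluation set from the Hugging Face Hub and pick one
# example to answer (index 90 is arbitrary).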
from datasets import load_dataset
vsi_bench = load_dataset("nyu-visionx/VSI-Bench")
vsi_bench = vsi_bench['test']

data_curr = vsi_bench[90]

video_path = "[VIDEO PATH]"  # path to the video that corresponds to data_curr
max_frames_num = 64
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)

video1 = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
video1 = [video1]
video2 = image_processor_for_sam.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
video2 = [video2]
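# Assemble the prompt: the image placeholder token, an optional timing
# instruction, then the question followed by either the multiple-choice options
# or a short free-form answer instruction.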
conv_template = "qwen_1_5"  
# time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
time_instruction = ""
question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n\n"
question += f"These are frames of a video.\n\n"
question += f"Question: {data_curr['question']}\n"
if data_curr['options'] is not None:
    question += '\n'.join(data_curr['options']) + "\n"
    question += f"Answer with the option’s letter from the given choices directly.\n"
else:
    question += f"Please answer the question using a single word or phrase.\n"
print(f"Prompt:\n{question}")

conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
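# Greedy decoding: `images` feeds the main vision tower and `images_for_sam`
# feeds the SAM branch; each is a list holding one tensor of video frames.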
cont = model.generate(
    input_ids,
    images=video1,
    images_for_sam=video2,
    modalities=["video"],
    do_sample=False,
    temperature=0,
    max_new_tokens=1024,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
print(repr(text_outputs))
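
To sanity-check the output, you can compare it with the benchmark's reference answer. The sketch below assumes the test split exposes a ground_truth column (inspect vsi_bench.column_names if your copy of the dataset differs); VSI-Bench's official scoring is stricter, using exact match for multiple-choice questions and mean relative accuracy for numerical ones.

# Rough check against the reference answer. The `ground_truth` field name is an
# assumption about the dataset schema; verify with vsi_bench.column_names.
reference = data_curr.get("ground_truth")
print(f"Prediction: {text_outputs!r}")
print(f"Reference:  {reference!r}")
if data_curr["options"] is not None and reference is not None:
    # Multiple-choice: the model was asked to answer with the option letter.
    print("Match" if text_outputs.strip().upper().startswith(str(reference).strip().upper()) else "Mismatch")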
