from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image, ImageOps
import torch
from peft import PeftModel
from huggingface_hub import snapshot_download

# Pick the GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
base_model_name = "HuggingFaceTB/SmolVLM-256M-Instruct"

# The processor only needs the model name; dtype and attention implementation
# are model-loading arguments, so they belong on the model, not the processor.
processor = AutoProcessor.from_pretrained(base_model_name)
base_model = AutoModelForVision2Seq.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
).to(device)
# Download the fine-tuned LoRA adapter and attach it to the base model.
repo_local_path = snapshot_download(repo_id="Irina1402/smolvlm-painting-description")
model = PeftModel.from_pretrained(base_model, model_id=repo_local_path)
model.eval()
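
# Optional (not in the original script): if the adapter never needs to be
# swapped at runtime, PEFT's merge_and_unload() can fold the LoRA weights
# into the base model for slightly faster inference.
# model = model.merge_and_unload()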
def process_chat(text: str = None, image: Image.Image = None):
    """Build a chat prompt from optional text/image inputs and generate a response with SmolVLM."""
    image_data = None
    content = []
    if image:
        # Normalize the image: force RGB and apply any EXIF rotation.
        image_data = image.convert("RGB")
        image_data = ImageOps.exif_transpose(image_data)
        content.append({"type": "image"})
    if text:
        content.append({"type": "text", "text": text})

    # Build the chat-formatted prompt from whichever inputs were provided.
    message = [{"role": "user", "content": content}]
    prompt = processor.apply_chat_template(message, add_generation_prompt=True)
    print(f"Prepared prompt:\n{prompt}")

    processed_inputs = processor(
        text=prompt,
        images=[image_data] if image_data else None,
        return_tensors="pt",
    ).to(device)

    # Generate a short reply; repetition_penalty discourages degenerate loops.
    with torch.no_grad():
        generated_ids = model.generate(**processed_inputs, max_new_tokens=50, repetition_penalty=1.2)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Keep only the assistant's turn from the decoded conversation.
    assistant_text = generated_text.split("Assistant:", 1)[-1].strip()
    return assistant_text
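
# Example usage: a minimal sketch, assuming a local file "painting.jpg" exists.
# Neither the path nor the prompt text comes from the original script.
if __name__ == "__main__":
    sample_image = Image.open("painting.jpg")
    print(process_chat(text="Describe this painting.", image=sample_image))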