Model Card

Model Details

An instruction-tuned version of Marathi-SmolLM2-145M, fine-tuned on the following tasks:

Paraphrasing: Using ai4bharat/IndicParaphrase dataset.
Question Generation: Using ai4bharat/IndicQuestionGeneration dataset.
Headline Generation: Using ai4bharat/IndicHeadlineGeneration dataset.
Sentence Summarization: Using ai4bharat/IndicSentenceSummarization dataset.

Note

This is an experimental instruction-tuned model (on 4 tasks).
Initial experiments suggest that this model does not consistently generate the expected outputs every time and thus needs more tuning.

How to use

from transformers import AutoTokenizer, AutoModelForCausalLM
# Model identifier passed to both from_pretrained calls, kept in one place so
# the tokenizer and the model weights cannot drift apart.
MODEL_ID = "sky-2002/Marathi-SmolLM2-145M-Finetuned-4"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# System message shared by every task prompt.
_SYSTEM_PROMPT = "तुम्ही एक उपयुक्त मराठी सहाय्यक आहात."


def _build_task_messages(text: str, task: str, extra: dict) -> list:
    """Return the chat messages (system + user) for ``task`` applied to ``text``.

    ``extra`` holds the caller's extra keyword arguments; only the
    ``"question"`` task reads from it (key ``"answer"``).

    Raises:
        KeyError: ``task == "question"`` and ``extra`` has no ``"answer"``.
        ValueError: ``task`` is not one of the four supported task names.
    """
    if task == "paraphrase":
        user_prompt = f"खालील वाक्य दुसऱ्या, पण समान अर्थ असणाऱ्या शब्दांत पुन्हा लिहा:\n\n{text}"
    elif task == "headline":
        user_prompt = (
            "तुम्ही एक बातमी लेख वाचत आहात. त्यावर एक शीर्षक तयार करा.\n\n"
            "लेख:\n\n"
            f"{text}\n\n"
        )
    elif task == "question":
        if "answer" not in extra:
            # Same exception type as a plain dict lookup, but with a message
            # that tells the caller what is missing.
            raise KeyError("task='question' requires an `answer` keyword argument")
        answer = extra["answer"]
        user_prompt = (
            "खालील परिच्छेद वाचा आणि दिलेल्या उत्तराशी सुसंगत असा प्रश्न तयार करा:\n\n"
            "परिच्छेद:\n"
            f"{text}\n\n"
            f"उत्तर: {answer}\n\n"
        )
    elif task == "summarize":
        user_prompt = (
            "तुम्ही दिलेल्या वाक्याचा सारांश द्या.\n\n"
            "वाक्य:\n"
            f"{text}\n\n"
        )
    else:
        raise ValueError(f"Unknown task: {task}")

    return [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]


def task_generate(
    input: str,
    max_new_tokens: int = 100,
    min_new_tokens: int = 1,
    temperature: float = 0.7,
    top_p: float = 0.95,
    task: str = "paraphrase",
    use_beam_search: bool = False,
    **kwargs,
) -> str:
    """Run one of the fine-tuned tasks on ``input`` and return the generated text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        input: Source text (sentence, article or passage) for the task.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_new_tokens: Lower bound on the number of generated tokens.
        temperature: Sampling temperature; used only when sampling.
        top_p: Nucleus-sampling threshold; used only when sampling.
        task: One of ``"paraphrase"``, ``"headline"``, ``"question"`` or
            ``"summarize"``.
        use_beam_search: If true, decode with beam search instead of sampling.
        **kwargs: ``answer`` (required for ``task="question"``) and
            ``num_beams`` (beam search only, defaults to 4).

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        KeyError: ``task="question"`` was requested without ``answer``.
        ValueError: ``task`` is not a supported task name.
    """
    messages = _build_task_messages(input, task, kwargs)

    # NOTE(review): the template is rendered with its default settings (no
    # add_generation_prompt=True); confirm this matches how the model was
    # fine-tuned before changing it.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
    )
    # A single prompt needs no padding. Padding it to max_length would make
    # the model continue from pad tokens whenever the tokenizer pads on the
    # right, and wastes compute on up to 512 positions either way.
    encoded = tokenizer(
        prompt_text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    prompt_length = encoded["input_ids"].shape[-1]

    gen_kwargs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "max_new_tokens": max_new_tokens,
        "min_new_tokens": min_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if use_beam_search:
        # Deterministic decoding: sampling is switched off.
        gen_kwargs.update({
            "do_sample": False,
            "num_beams": kwargs.get("num_beams", 4),
            "early_stopping": True,
            "num_return_sequences": 1,
        })
    else:
        gen_kwargs.update({
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        })

    output_ids = model.generate(**gen_kwargs)[0]

    # A causal LM returns the prompt followed by the continuation. Drop the
    # prompt tokens so the result never echoes the input, even when the
    # marker below is absent from the decoded text.
    new_token_ids = output_ids[prompt_length:]
    decoded = tokenizer.decode(new_token_ids, skip_special_tokens=True)

    # If the model emitted the assistant marker itself, keep only what follows.
    marker = "<|assistant|>"
    if marker in decoded:
        decoded = decoded.split(marker)[-1]

    return decoded.strip()

# Example usage: generate a headline for a short multi-line passage.
sentence = """पुणे विद्यापीठाने म्हटले आहे की, शालेय शिक्षणात सुधारणा करण्यासाठी शाळा व महाविद्यालये यांच्यातील सहकार्य आवश्यक आहे.
शाळा व महाविद्यालये यांच्यातील सहकार्यामुळे विद्यार्थ्यांना शालेय शिक्षणात सुधारणा करण्यास मदत होईल. 
पुणे विद्यापीठाने शालेय शिक्षणात सुधारणा करण्यासाठी शाळा व महाविद्यालये यांच्यातील सहकार्य आवश्यक आहे."""
headline = task_generate(sentence, task="headline")
print(headline)

# Example usage: paraphrase a single sentence. The literal below begins and
# ends with a newline, and both are passed to task_generate as written.
sentence = """
महाराष्ट्रातील शेतकरी परंपरागत आणि आधुनिक पद्धतींचा अवलंब करून पिक लागवड करतात.
"""
paraphrase = task_generate(sentence, task="paraphrase")
print(paraphrase)

sky-2002
/

Marathi-SmolLM2-145M-Finetuned-4

Model Card

Model Details

How to use

Datasets used to train sky-2002/Marathi-SmolLM2-145M-Finetuned-4

Collection including sky-2002/Marathi-SmolLM2-145M-Finetuned-4

Marathi-SLMs