first commit
- .gitignore +8 -0
- README.md +29 -3
- bin/train.py +4 -0
- model/config.json +48 -0
- model/model.safetensors +3 -0
- model/preprocessor_config.json +23 -0
- predict.py +20 -0
- pyproject.toml +26 -0
- requirements.txt +5 -0
- src/__init__.py +0 -0
- src/predict.py +29 -0
- src/preprocess.py +10 -0
- src/train.py +71 -0
- src/utils.py +9 -0
- tests/__init__.py +0 -0
- tests/data/number_3.jpg +0 -0
- tests/test_prediction.py +19 -0
.gitignore
ADDED
@@ -0,0 +1,8 @@
+# Python metadata
+image_classification_model.egg-info/
+venv/
+__pycache__/
+
+# Model
+results/
+model/
README.md
CHANGED
@@ -1,3 +1,29 @@
----
-license: mit
----
+---
+license: mit
+---
+# Image Classification Model (ViT)
+
+This is an image classification model based on **Vision Transformer (ViT)**, fine-tuned on the **MNIST** dataset. The model is designed to classify images into one of 10 possible classes (digits 0-9). The code is compatible with Hugging Face's inference providers and can be easily deployed.
+
+## Model Details
+
+- **Model Type**: Vision Transformer (ViT)
+- **Base Model**: `google/vit-base-patch16-224`
+- **Task**: Image Classification
+- **Dataset**: MNIST (handwritten digits)
+- **Labels**: 10 classes (0-9)
+
+## How to Use
+
+### Install Requirements
+
+Make sure you have the following dependencies installed:
+
+```bash
+pip3 install -r requirements.txt
+```
+
+### Run unit tests
+```bash
+python3 -m unittest discover -s tests
+```
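Once installed, the package added in this commit can also be called from Python directly. A minimal sketch, assuming an editable install (`pip3 install -e .`) and a run from the repo root:

```python
# Programmatic use of the package this commit adds; predict() returns
# the integer class index (see src/predict.py below).
from image_classification_model.predict import predict

label = predict("tests/data/number_3.jpg")
print(label)  # expected: 3, per tests/test_prediction.py
```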
bin/train.py
ADDED
@@ -0,0 +1,4 @@
+from image_classification_model.train import train
+
+if __name__ == "__main__":
+    train()
model/config.json
ADDED
@@ -0,0 +1,48 @@
+{
+  "_name_or_path": "google/vit-base-patch16-224",
+  "architectures": [
+    "ViTForImageClassification"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "encoder_stride": 16,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "0",
+    "1": "1",
+    "2": "2",
+    "3": "3",
+    "4": "4",
+    "5": "5",
+    "6": "6",
+    "7": "7",
+    "8": "8",
+    "9": "9"
+  },
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "0": 0,
+    "1": 1,
+    "2": 2,
+    "3": 3,
+    "4": 4,
+    "5": 5,
+    "6": 6,
+    "7": 7,
+    "8": 8,
+    "9": 9
+  },
+  "layer_norm_eps": 1e-12,
+  "model_type": "vit",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "problem_type": "single_label_classification",
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.2"
+}
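The `id2label` map above is what turns a predicted class index back into a display label. A sketch of reading it with stock `transformers` (illustrative, not part of the commit; note that `AutoConfig` converts the JSON's string keys to ints on load):

```python
# Read the label map from the committed config; run from the repo root.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("model")  # the model/ directory above
print(config.id2label[3])  # "3"
```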
model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02e9496d47c1edd9c65ffbb29c62f2238aa998d10c73ef901252524d5f619d7c
+size 343248584
model/preprocessor_config.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "ViTImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}
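The `rescale_factor` above is 1/255: pixel bytes are first mapped to [0, 1], then normalized with mean and std 0.5, so the model sees inputs in [-1, 1]. A quick check of the arithmetic:

```python
# rescale_factor is 1/255; with mean = std = 0.5 the overall mapping is
# byte -> (byte/255 - 0.5) / 0.5, i.e. [0, 255] -> [-1, 1].
print(1 / 255)                  # 0.00392156862745098
print((255 / 255 - 0.5) / 0.5)  # 1.0  (a white pixel)
print((0 / 255 - 0.5) / 0.5)    # -1.0 (a black pixel)
```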
predict.py
ADDED
@@ -0,0 +1,20 @@
+import sys
+from image_classification_model.predict import predict
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python predict.py <image_path>")
+        sys.exit(1)
+
+    image_path = sys.argv[1]
+
+    # Run prediction (handles preprocessing internally)
+    predicted_label = predict(image_path)
+
+    # Print output in Hugging Face-compatible format
+    print({"label": predicted_label})
+
+
+if __name__ == "__main__":
+    main()
pyproject.toml
ADDED
@@ -0,0 +1,26 @@
+[project]
+name = "image-classification-model"
+version = "0.1.0"
+description = "MNIST-compatible image classification model"
+requires-python = ">=3.8"
+dependencies = [
+    "torch>=2.0.0",
+    "transformers>=4.30.0",
+    "Pillow>=9.0.0",
+    "datasets>=2.0.0",
+    "accelerate>=0.26.0"
+]
+
+[build-system]
+requires = ["setuptools>=65.5.1", "wheel"]
+build-backend = "setuptools.build_meta"
+
+
+[tool.setuptools]
+packages = ["image_classification_model"]
+
+[tool.setuptools.package-dir]
+image_classification_model = "src"
+
+[project.scripts]
+train = "image_classification_model.train:train"
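The `[tool.setuptools.package-dir]` table maps the `src/` directory onto the importable package name. A quick check after an editable install (`pip3 install -e .`):

```python
# src/ is importable under the name mapped in pyproject.toml.
from image_classification_model.utils import MODEL_DIR

print(MODEL_DIR)  # resolves to the repo's model/ directory
```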
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch>=2.0.0
+transformers>=4.30.0
+Pillow>=9.0.0
+datasets>=2.0.0
+accelerate>=0.26.0
src/__init__.py
ADDED
File without changes
src/predict.py
ADDED
@@ -0,0 +1,29 @@
+import torch
+from PIL import Image
+from .preprocess import preprocess_image
+from .utils import load_model
+
+
+def predict_with_model(model, inputs):
+    """Runs inference and returns the predicted class."""
+    model.eval()  # Ensure the model is in evaluation mode
+    with torch.no_grad():  # Disable gradient calculation
+        outputs = model(**inputs)
+        logits = outputs.logits
+        predicted_class = logits.argmax(dim=-1).item()  # Get predicted class index
+    return predicted_class
+
+
+def predict(image_path):
+    """Loads an image, preprocesses it, runs the model, and returns the prediction."""
+    image = Image.open(image_path).convert("RGB")
+    inputs = preprocess_image(image)
+
+    # Load model
+    model = load_model()
+
+    # Ensure inputs are on the same device as the model
+    device = model.device
+    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
+
+    return predict_with_model(model, inputs)
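Note that `predict()` calls `load_model()` on every invocation, so each call re-reads roughly 343 MB of weights. A sketch of a cache for callers that classify many images (an illustration, not part of this commit):

```python
# Optional caching wrapper: load the ViT weights once and reuse them.
from functools import lru_cache

from image_classification_model.utils import load_model


@lru_cache(maxsize=1)
def get_model():
    return load_model()
```

Callers can then pass `get_model()` to `predict_with_model` directly.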
src/preprocess.py
ADDED
@@ -0,0 +1,10 @@
+from transformers import ViTImageProcessor
+from .utils import MODEL_DIR
+
+processor = ViTImageProcessor.from_pretrained(MODEL_DIR)
+
+
+def preprocess_image(image):
+    """Preprocesses a single image for ViT inference."""
+    inputs = processor(images=image, return_tensors="pt")
+    return inputs
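`preprocess_image` returns a dict-like `BatchFeature`; its `pixel_values` tensor matches the 224x224 size in `model/preprocessor_config.json`. A sketch using the committed test image:

```python
# Shape check for the preprocessing output; run from the repo root.
from PIL import Image

from image_classification_model.preprocess import preprocess_image

image = Image.open("tests/data/number_3.jpg").convert("RGB")
inputs = preprocess_image(image)
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```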
src/train.py
ADDED
@@ -0,0 +1,71 @@
+from transformers import (
+    ViTForImageClassification,
+    ViTImageProcessor,
+    TrainingArguments,
+    Trainer,
+)
+from datasets import load_dataset
+from .utils import MODEL_DIR
+
+
+def train():
+    # Load dataset
+    dataset = load_dataset("mnist")
+    dataset = dataset.rename_column("label", "labels")  # Critical rename
+
+    # Reduce dataset size for faster training
+    small_train_size = 2000  # Use only 2,000 training examples
+    small_test_size = 500  # Use only 500 test examples
+
+    dataset["train"] = dataset["train"].select(range(small_train_size))
+    dataset["test"] = dataset["test"].select(range(small_test_size))
+
+    # Initialize processor
+    processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
+
+    def transform(examples):
+        # Convert grayscale to RGB and process
+        images = [img.convert("RGB") for img in examples["image"]]
+        inputs = processor(images=images, return_tensors="pt")
+        inputs["labels"] = examples["labels"]
+        return inputs
+
+    # Apply preprocessing
+    dataset.set_transform(transform)
+
+    # Load model with proper initialization
+    model = ViTForImageClassification.from_pretrained(
+        "google/vit-base-patch16-224",
+        num_labels=10,
+        id2label={str(i): str(i) for i in range(10)},
+        label2id={str(i): i for i in range(10)},
+        ignore_mismatched_sizes=True,
+    )
+
+    # Training arguments with critical parameter
+    training_args = TrainingArguments(
+        output_dir="./results",
+        remove_unused_columns=False,  # Preserve input data
+        per_device_train_batch_size=16,  # Reduce batch size for efficiency
+        eval_strategy="steps",
+        num_train_epochs=3,
+        fp16=False,  # Disable fp16 mixed precision
+        save_steps=500,
+        eval_steps=500,
+        logging_steps=100,
+        learning_rate=2e-4,
+        push_to_hub=False,
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["test"],
+    )
+
+    trainer.train()
+
+    # Save model and processor
+    model.save_pretrained(MODEL_DIR)
+    processor.save_pretrained(MODEL_DIR)
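The `rename_column` call matters because `Trainer` only computes a loss when each batch carries a `labels` key for the model's forward pass. Training itself is launched through the thin wrapper in `bin/train.py`; equivalently, a programmatic sketch (assuming the package is installed and MNIST plus the base checkpoint can be downloaded):

```python
# Mirrors bin/train.py: checkpoints go to ./results (gitignored);
# the fine-tuned model and processor are saved to model/.
from image_classification_model.train import train

train()
```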
src/utils.py
ADDED
@@ -0,0 +1,9 @@
+from transformers import ViTForImageClassification
+import os
+
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+MODEL_DIR = os.path.join(ROOT_DIR, "model")
+
+
+def load_model():
+    return ViTForImageClassification.from_pretrained(MODEL_DIR)
tests/__init__.py
ADDED
File without changes
tests/data/number_3.jpg
ADDED
tests/test_prediction.py
ADDED
@@ -0,0 +1,19 @@
+import unittest
+import os
+from image_classification_model.predict import predict
+from image_classification_model.utils import ROOT_DIR
+
+DATA_DIR = os.path.join(ROOT_DIR, "tests/data")
+
+
+class TestPrediction(unittest.TestCase):
+    def test_prediction_label_3(self):
+        test_image_path = os.path.join(DATA_DIR, "number_3.jpg")
+        predicted_label = predict(test_image_path)
+        self.assertEqual(
+            predicted_label, 3, f"Expected label 3, but got {predicted_label}"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()