Spaces:

fevot
/

iti110

Sleeping

App Files Community

fevot committed on Feb 26

Commit

1526231

verified ·

1 Parent(s): ad952f8

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -13

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import json
 from torchvision import models
 import librosa
-# Define the BirdCallRNN model
 class BirdCallRNN(nn.Module):
     def __init__(self, resnet, num_features, num_classes):
         super(BirdCallRNN, self).__init__()
@@ -21,7 +21,7 @@ class BirdCallRNN(nn.Module):
         features = self.resnet(x)
         features = features.view(batch, seq_len, -1)
         rnn_out, _ = self.rnn(features)
-        output = self.fc(rnn_out[:, -1, :])  # Note: We’ll use this for single-segment sequences
         return output
 # Function to convert MP3 to mel spectrogram (unchanged)
@@ -45,12 +45,14 @@ def mp3_to_mel_spectrogram(mp3_file, target_shape=(128, 500), resize_shape=(224,
 with open('class_mapping.json', 'r') as f:
     class_names = json.load(f)
-# Revised inference function to predict per segment with confidence scores
 def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
     model.eval()
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
     num_segments = log_S.shape[1] // segment_length
     if num_segments == 0:
         segments = [log_S]
@@ -58,43 +60,53 @@ def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
         segments = [log_S[:, i * segment_length:(i + 1) * segment_length] for i in range(num_segments)]
     predictions = []
     for seg in segments:
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
-        seg_tensor = torch.from_numpy(seg_rgb).permute(2, 0, 1).float().unsqueeze(0).unsqueeze(0).to(device)
         output = model(seg_tensor)
         probs = torch.softmax(output, dim=1)
-        confidence, pred = torch.max(probs, dim=1)
-        pred = pred.cpu().numpy()[0]
         confidence = confidence.cpu().numpy()[0]
-        predicted_bird = class_names.get(str(pred), "Unknown")
-        predictions.append(f"{predicted_bird} ({confidence:.2%} confidence)")
     return predictions
 # Initialize the model
 resnet = models.resnet50(weights='IMAGENET1K_V2')
 num_features = resnet.fc.in_features
 resnet.fc = nn.Identity()
-num_classes = len(class_names)
 model = BirdCallRNN(resnet, num_features, num_classes)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 model.load_state_dict(torch.load('model_weights.pth', map_location=device))
 model.eval()
-# Prediction function for Gradio
 def predict_bird(file_path):
     predictions = infer_birdcall(model, file_path, segment_length=500, device=str(device))
-    formatted_predictions = "\n".join([f"{i+1}. {pred}" for i, pred in enumerate(predictions)])
     return formatted_predictions
 # Custom Gradio interface
 def gradio_interface(file_path):
     prediction = predict_bird(file_path)
-    audio_player = gr.Audio(file_path, label="Uploaded MP3 File", visible=True, autoplay=True)
     bird_species_image = gr.Image("1.jpg", label="Bird Species")
     bird_description_image = gr.Image("2.jpg", label="Bird Description")
     bird_origins_image = gr.Image("3.jpg", label="Bird Origins")
     return prediction, audio_player, bird_species_image, bird_description_image, bird_origins_image
 # Launch Gradio interface
@@ -109,4 +121,4 @@ interface = gr.Interface(
         gr.Image(label="Bird Origins")
     ]
 )
-interface.launch(share=True)

 from torchvision import models
 import librosa
+# Define the BirdCallRNN model (unchanged)
 class BirdCallRNN(nn.Module):
     def __init__(self, resnet, num_features, num_classes):
         super(BirdCallRNN, self).__init__()
         features = self.resnet(x)
         features = features.view(batch, seq_len, -1)
         rnn_out, _ = self.rnn(features)
+        output = self.fc(rnn_out[:, -1, :])
         return output
 # Function to convert MP3 to mel spectrogram (unchanged)
 with open('class_mapping.json', 'r') as f:
     class_names = json.load(f)
+# Revised inference function to include confidence scores
 def infer_birdcall(model, mp3_file, segment_length=500, device="cuda"):
     model.eval()
+    # Load audio and compute mel spectrogram
     y, sr = librosa.load(mp3_file, sr=None)
     S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
     log_S = librosa.power_to_db(S, ref=np.max)
+    # Segment the spectrogram
     num_segments = log_S.shape[1] // segment_length
     if num_segments == 0:
         segments = [log_S]
         segments = [log_S[:, i * segment_length:(i + 1) * segment_length] for i in range(num_segments)]
     predictions = []
+    # Process each segment individually
     for seg in segments:
         seg_resized = cv2.resize(seg, (224, 224), interpolation=cv2.INTER_CUBIC)
         seg_rgb = np.repeat(seg_resized[:, :, np.newaxis], 3, axis=-1)
+        # Create a tensor with batch size 1 and sequence length 1
+        seg_tensor = torch.from_numpy(seg_rgb).permute(2, 0, 1).float().unsqueeze(0).unsqueeze(0).to(device)  # Shape: (1, 1, 3, 224, 224)
         output = model(seg_tensor)
+        # Apply softmax to get probabilities
         probs = torch.softmax(output, dim=1)
+        confidence, pred_idx = torch.max(probs, dim=1)
+        pred_idx = pred_idx.cpu().numpy()[0]
         confidence = confidence.cpu().numpy()[0]
+        predicted_bird = class_names[str(pred_idx)]
+        predictions.append((predicted_bird, confidence))
     return predictions
 # Initialize the model
 resnet = models.resnet50(weights='IMAGENET1K_V2')
 num_features = resnet.fc.in_features
 resnet.fc = nn.Identity()
+num_classes = len(class_names)  # Should be 114
 model = BirdCallRNN(resnet, num_features, num_classes)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 model.load_state_dict(torch.load('model_weights.pth', map_location=device))
 model.eval()
+# Prediction function with confidence scores
 def predict_bird(file_path):
     predictions = infer_birdcall(model, file_path, segment_length=500, device=str(device))
+    # Format predictions as a numbered list with confidence scores
+    formatted_predictions = "\n".join([f"{i+1}. {pred} (Confidence: {conf*100:.2f}%)" for i, (pred, conf) in enumerate(predictions)])
     return formatted_predictions
 # Custom Gradio interface
 def gradio_interface(file_path):
+    # Predict bird species with confidence
     prediction = predict_bird(file_path)
+    # Display the uploaded MP3 file with a play button
+    audio_player = gr.Audio(file_path, label="Uploaded MP3 File", visible=True, autoplay=False)
+    # Display images with titles
     bird_species_image = gr.Image("1.jpg", label="Bird Species")
     bird_description_image = gr.Image("2.jpg", label="Bird Description")
     bird_origins_image = gr.Image("3.jpg", label="Bird Origins")
     return prediction, audio_player, bird_species_image, bird_description_image, bird_origins_image
 # Launch Gradio interface
         gr.Image(label="Bird Origins")
     ]
 )
+interface.launch(share=True)