yiqing111 committed on
Commit
5868ef7
·
verified ·
1 Parent(s): 9d8ce6a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +103 -0
  2. dl.py +210 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoTokenizer
4
+ import whisper
5
+ import subprocess
6
+ import os
7
+ import pandas as pd
8
+ from dl import PitchEvaluationModel # Import model
9
+
10
def download_youtube_video(url, output_file="pitch_video.mp4"):
    """Download a YouTube video as MP4 using yt-dlp.

    Args:
        url: Link entered by the user. It is accepted only when its host is
            ``youtube.com``, a subdomain of it, or ``youtu.be``. A missing
            scheme is treated as ``https://``.
        output_file: Path the downloaded video is written to.

    Returns:
        ``output_file`` on success; ``None`` after reporting the problem via
        ``st.error`` when the URL is rejected or the download fails.
    """
    from urllib.parse import urlparse  # local import: only this function needs it

    candidate = (url or "").strip()
    if not candidate.lower().startswith(("http://", "https://")):
        candidate = "https://" + candidate

    # Validate the parsed host instead of searching for a substring, so that
    # inputs such as "--some-option youtube.com" or "evil.test/?youtube.com"
    # are rejected.
    try:
        host = (urlparse(candidate).hostname or "").lower()
    except ValueError:
        host = ""
    if host not in ("youtube.com", "youtu.be") and not host.endswith(".youtube.com"):
        st.error("❌ Invalid URL! Please enter a valid YouTube link.")
        return None

    command = [
        "yt-dlp",
        "-f", "mp4",
        # Without this, an existing output file from a previous run makes
        # yt-dlp skip the download and the stale video would be transcribed.
        "--force-overwrites",
        "-o", output_file,
        "--",  # end of options: the URL can never be parsed as a flag
        candidate,
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        st.error("❌ Failed to download the video. Please check the URL and try again.")
        return None
    except OSError:
        # Raised when the yt-dlp executable is missing or cannot be started.
        st.error("❌ Failed to download the video. yt-dlp could not be started.")
        return None
    return output_file
22
+
23
def transcribe_video(video_file):
    """Return the transcript of ``video_file`` produced by Whisper's "base" model.

    Any exception raised while loading the model or transcribing is reported
    through ``st.error`` and an empty string is returned instead.
    """
    try:
        speech_model = whisper.load_model("base")
        return speech_model.transcribe(video_file)["text"]
    except Exception:
        st.error("❌ An error occurred during transcription.")
        return ""
32
+
33
def load_model():
    """Build the pitch model, load its saved weights and return it ready for inference.

    Returns:
        ``(model, tokenizer, device)`` on success. If anything fails, the
        error is shown with ``st.error`` and ``(None, None, None)`` is
        returned.
    """
    try:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")

        model = PitchEvaluationModel("bert-base-uncased").to(device)
        weights = torch.load("best_pitch_model.pt", map_location=device)
        model.load_state_dict(weights)
        model.eval()  # inference mode: disables dropout

        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    except Exception:
        st.error("❌ Failed to load the model.")
        return None, None, None
    return model, tokenizer, device
45
+
46
def evaluate_pitch(transcript, model, tokenizer, device):
    """Score a transcript with the trained model.

    The transcript is tokenized (truncated/padded to 512 tokens), passed
    through ``model`` without gradient tracking, and the arg-max of each of
    the three logit tensors is shifted by one to give a 1-based score.

    Returns:
        ``(clarity, team, traction)`` as ints, or ``(None, None, None)`` after
        an ``st.error`` message if any step raises.
    """
    try:
        encoded = tokenizer(
            transcript,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        ids = encoded["input_ids"].to(device)
        mask = encoded["attention_mask"].to(device)
        with torch.no_grad():
            clarity_logits, team_logits, traction_logits = model(ids, mask)
        # Class indices are 0-based; add 1 to report scores starting at 1.
        return tuple(
            torch.argmax(logits).item() + 1
            for logits in (clarity_logits, team_logits, traction_logits)
        )
    except Exception:
        st.error("❌ Error in evaluation process.")
        return None, None, None
57
+
58
# Streamlit App UI
st.set_page_config(page_title="Pitch Evaluation App", layout="wide")
st.title("🚀 Pitch Evaluation")

# Rubric text displayed next to each score, keyed by the 1-5 score.
CLARITY_DESCRIPTIONS = {
    5: "Extremely clear, direct, and easy to follow;no fluff, just essential details.",
    4: "Mostly clear, with only minor unnecessary details.",
    3: "Somewhat clear but includes extra details or minor distractions.",
    2: "Lacks clarity; hard to follow; too much fluff or filler.",
    1: "Unclear, rambling, and difficult to understand.",
}
TEAM_DESCRIPTIONS = {
    5: "Founders have highly relevant skills & experience to execute this successfully.",
    4: "Founders have good experience but may lack some key skills.",
    3: "Some relevant experience but gaps in expertise.",
    2: "Limited relevant experience; execution ability is questionable.",
    1: "No clear expertise in this space; team seems unqualified.",
}
TRACTION_DESCRIPTIONS = {
    5: "Strong proof of demand (users, revenue, engagement, partnerships, etc.).",
    4: "Good early validation with promising signs of demand.",
    3: "Some traction but not yet convincing.",
    2: "Weak or vague traction, with little evidence of demand.",
    1: "No validation or proof that people want this.",
}

option = st.radio("Choose Input Method", ("YouTube URL", "Upload File"), horizontal=True)

# Defined up front so the evaluation step below can test it directly instead
# of probing locals() for a name that may never have been assigned.
transcript = ""

if option == "YouTube URL":
    url = st.text_input("🎥 Enter YouTube URL")
    if st.button("Download and Transcribe", use_container_width=True):
        video_file = download_youtube_video(url)
        if video_file:
            transcript = transcribe_video(video_file)
            st.text_area("📜 Transcript", transcript, height=200)
elif option == "Upload File":
    uploaded_file = st.file_uploader("📂 Upload Video", type=["mp4"], help="Upload a video file for transcription and evaluation.")
    if uploaded_file is not None:
        if uploaded_file.type != "video/mp4":
            st.error("❌ Invalid file format! Please upload an MP4 file.")
        else:
            # Whisper reads from a path, so the upload is persisted to disk first.
            with open("uploaded_video.mp4", "wb") as f:
                f.write(uploaded_file.getbuffer())
            transcript = transcribe_video("uploaded_video.mp4")
            st.text_area("📜 Transcript", transcript, height=200)

if transcript:
    model, tokenizer, device = load_model()
    if model is not None:
        clarity, team, traction = evaluate_pitch(transcript, model, tokenizer, device)
        if None not in (clarity, team, traction):
            # Create a DataFrame for the scoring table
            categories = ["Clarity & Conciseness", "Team-Market Fit", "Traction / Validation"]
            scores = [clarity, team, traction]
            # Any score outside 2-5 falls back to the lowest-tier text, as the
            # original conditional chain did.
            descriptions = [
                CLARITY_DESCRIPTIONS.get(clarity, CLARITY_DESCRIPTIONS[1]),
                TEAM_DESCRIPTIONS.get(team, TEAM_DESCRIPTIONS[1]),
                TRACTION_DESCRIPTIONS.get(traction, TRACTION_DESCRIPTIONS[1]),
            ]
            df = pd.DataFrame({"Category": categories, "Score (1-5)": scores, "Evaluation": descriptions})

            st.write("## 📊 Evaluation Results")
            st.table(df)

            # Verdict is based on the mean of the three scores.
            if ((clarity + team + traction) / 3) >= 3.5:
                st.write("## 🎉 Congrats! You have a high possibility to be accepted")
            else:
                st.write("## 🙌 Need More Practice, but don't give up!")
dl.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModel
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.data import Dataset, DataLoader
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import cohen_kappa_score
8
+ import numpy as np
9
+ import torch.optim.lr_scheduler as lr_scheduler
10
+ from datasets import load_dataset
11
+
12
+ # 1. Dataset Preparation
13
class PitchDataset(Dataset):
    """Dataset of pitch texts paired with clarity, team and traction scores.

    Each item is tokenized on access (padded/truncated to ``max_length``) and
    its three scores are shifted down by one so they can be used as
    zero-based class indices.
    """

    def __init__(self, texts, clarity_scores, team_scores, traction_scores, tokenizer, max_length=512):
        self.texts = texts
        self.clarity_scores = clarity_scores
        self.team_scores = team_scores
        self.traction_scores = traction_scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output has a leading batch axis of size one; flatten
        # it so each sample is a 1-D tensor.
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }

        # Convert scores to zero-based index (1-5 → 0-4)
        label_sources = (
            ('clarity_score', self.clarity_scores),
            ('team_score', self.team_scores),
            ('traction_score', self.traction_scores),
        )
        for key, values in label_sources:
            sample[key] = torch.tensor(values[idx] - 1, dtype=torch.long)

        return sample
46
+
47
+ # 2. Model Definition (BERT encoder with optional gradient checkpointing)
48
class PitchEvaluationModel(nn.Module):
    """Pretrained encoder followed by three independent 5-way linear heads.

    The hidden state at sequence position 0 (the CLS token) is passed through
    dropout and then fed to the clarity, team and traction classifiers.
    """

    def __init__(self, pretrained_model="bert-base-uncased", use_gradient_checkpointing=True):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(pretrained_model)
        if use_gradient_checkpointing:
            self.encoder.gradient_checkpointing_enable()
        self.dropout = nn.Dropout(0.3)

        hidden_dim = self.encoder.config.hidden_size
        self.clarity_classifier = nn.Linear(hidden_dim, 5)
        self.team_classifier = nn.Linear(hidden_dim, 5)
        self.traction_classifier = nn.Linear(hidden_dim, 5)

    def forward(self, input_ids, attention_mask):
        """Return ``(clarity_logits, team_logits, traction_logits)``."""
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Using CLS token
        cls_vector = self.dropout(hidden_states[:, 0])

        heads = (self.clarity_classifier, self.team_classifier, self.traction_classifier)
        return tuple(head(cls_vector) for head in heads)
69
+
70
+ # 3. Training Function (Includes learning rate scheduling and multi-task weighted loss)
71
def train_model(model, train_loader, val_loader, device, epochs=10,
                clarity_weight=1.0, team_weight=1.0, traction_weight=1.0,
                lr=2e-5, save_path="best_pitch_model.pt"):
    """Fine-tune ``model`` and checkpoint the weights with the best validation QWK.

    Args:
        model: Module returning ``(clarity_logits, team_logits, traction_logits)``
            when called with ``(input_ids, attention_mask)``.
        train_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``clarity_score``, ``team_score`` and
            ``traction_score``.
        val_loader: Batches passed to ``evaluate_model`` after every epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        clarity_weight, team_weight, traction_weight: Multipliers of the three
            cross-entropy terms in the combined loss.
        lr: Initial AdamW learning rate.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        The model as it stands after the final epoch (not necessarily the
        checkpointed best weights).
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    # Use StepLR, decrease learning rate every 2 epochs
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8)
    criterion = nn.CrossEntropyLoss()
    # Start with "no score yet" rather than 0: QWK can be zero or negative, and
    # a run that never exceeded 0 would otherwise leave no checkpoint on disk.
    best_qwk = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            clarity_scores = batch['clarity_score'].to(device)
            team_scores = batch['team_score'].to(device)
            traction_scores = batch['traction_score'].to(device)

            optimizer.zero_grad()

            clarity_logits, team_logits, traction_logits = model(input_ids, attention_mask)

            clarity_loss = criterion(clarity_logits, clarity_scores)
            team_loss = criterion(team_logits, team_scores)
            traction_loss = criterion(traction_logits, traction_scores)

            # Multi-task weighted loss
            loss = clarity_weight * clarity_loss + team_weight * team_loss + traction_weight * traction_loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step()  # Adjust learning rate

        # Guard against an empty loader so the average does not divide by zero.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # Evaluate on validation set and save the best model
        qwk = evaluate_model(model, val_loader, device)
        if np.isnan(qwk):
            # An undefined kappa must never compare as better than a real
            # score, and NaN would also poison every later comparison.
            qwk = float("-inf")
        if best_qwk is None or qwk > best_qwk:
            best_qwk = qwk
            torch.save(model.state_dict(), save_path)
            print(f"Model saved with QWK: {best_qwk:.4f}")

    return model
118
+
119
+ # 4. Evaluation Function (Calculates QWK)
120
def evaluate_model(model, data_loader, device):
    """Compute quadratic weighted kappa (QWK) for the three score heads.

    Args:
        model: Module returning ``(clarity_logits, team_logits, traction_logits)``.
        data_loader: Batches with ``input_ids``, ``attention_mask`` and the
            zero-based ``clarity_score`` / ``team_score`` / ``traction_score``.
        device: Device the input tensors are moved to.

    Returns:
        The mean of the three per-head QWK values. The per-head values and the
        mean are also printed.
    """
    model.eval()

    # Full rating scale. Passing it to cohen_kappa_score keeps the quadratic
    # penalty tied to the real score distance; otherwise the weights are built
    # from whichever labels happen to occur, so e.g. 2 vs 4 counts as adjacent
    # whenever 3 is absent from the data.
    score_labels = [1, 2, 3, 4, 5]

    heads = ("clarity", "team", "traction")
    predictions = {head: [] for head in heads}
    targets = {head: [] for head in heads}

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            clarity_logits, team_logits, traction_logits = model(input_ids, attention_mask)

            per_head_logits = (clarity_logits, team_logits, traction_logits)
            for head, logits in zip(heads, per_head_logits):
                # Convert predictions and labels back to 1-5 scale
                predictions[head].extend(torch.argmax(logits, dim=1).cpu().numpy() + 1)
                targets[head].extend(batch[f'{head}_score'].cpu().numpy() + 1)

    clarity_qwk = cohen_kappa_score(
        targets["clarity"], predictions["clarity"], labels=score_labels, weights='quadratic')
    team_qwk = cohen_kappa_score(
        targets["team"], predictions["team"], labels=score_labels, weights='quadratic')
    traction_qwk = cohen_kappa_score(
        targets["traction"], predictions["traction"], labels=score_labels, weights='quadratic')

    overall_qwk = (clarity_qwk + team_qwk + traction_qwk) / 3

    print("Evaluation Results:")
    print(f"Clarity QWK: {clarity_qwk:.4f}")
    print(f"Team Market Fit QWK: {team_qwk:.4f}")
    print(f"Traction QWK: {traction_qwk:.4f}")
    print(f"Overall QWK: {overall_qwk:.4f}")

    return overall_qwk
164
+
165
+ # 5. Main Function (Includes training, validation, and test set evaluation)
166
def main():
    """Train the pitch model and report its QWK on a held-out test split.

    Steps: download the dataset, write a CSV copy, split 70/15/15 into
    train/validation/test, fine-tune with ``train_model`` (which checkpoints
    the best validation weights to ``best_pitch_model.pt``), reload that
    checkpoint and evaluate it on the test split.
    """
    # One checkpoint name for tokenizer and encoder so the two cannot drift
    # apart (the tokenizer was previously loaded from "bert-large-uncased").
    pretrained_name = "bert-base-uncased"

    # Load data
    dataset = load_dataset("jasonhwan/yc-startup-pitches-with-scores", split="train")
    df = dataset.to_pandas()

    # Keep a local CSV copy and continue from it.
    df.to_csv("yc_startup_pitches.csv", index=False)
    df = pd.read_csv("yc_startup_pitches.csv")

    # Extract text and scores
    texts = df['transcript'].values
    clarity_scores = df['clarity_score'].values
    team_scores = df['team_market_fit_score'].values
    traction_scores = df['traction_validation_score'].values

    # Split dataset: 70% training, 15% validation, 15% testing
    train_texts, temp_texts, train_clarity, temp_clarity, train_team, temp_team, train_traction, temp_traction = train_test_split(
        texts, clarity_scores, team_scores, traction_scores, test_size=0.3, random_state=42
    )
    val_texts, test_texts, val_clarity, test_clarity, val_team, test_team, val_traction, test_traction = train_test_split(
        temp_texts, temp_clarity, temp_team, temp_traction, test_size=0.5, random_state=42
    )

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

    train_dataset = PitchDataset(train_texts, train_clarity, train_team, train_traction, tokenizer)
    val_dataset = PitchDataset(val_texts, val_clarity, val_team, val_traction, tokenizer)
    test_dataset = PitchDataset(test_texts, test_clarity, test_team, test_traction, tokenizer)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)
    test_loader = DataLoader(test_dataset, batch_size=8)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = PitchEvaluationModel(pretrained_model=pretrained_name, use_gradient_checkpointing=True)
    model.to(device)

    model = train_model(model, train_loader, val_loader, device, epochs=10)

    # Reload the best validation checkpoint; map_location keeps this working
    # when the weights were saved on a different device.
    model.load_state_dict(torch.load("best_pitch_model.pt", map_location=device))

    # The test split was previously built but never used.
    print("Test set results:")
    evaluate_model(model, test_loader, device)


if __name__ == "__main__":
    main()