# Source: uploaded by yiqing111 ("Upload 2 files", commit 5868ef7, verified).
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import numpy as np
import torch.optim.lr_scheduler as lr_scheduler
from datasets import load_dataset
# 1. Dataset Preparation
class PitchDataset(Dataset):
    """Map-style dataset pairing pitch transcripts with three scores.

    Each item carries the tokenized transcript together with one zero-based
    class index per scoring dimension (clarity, team, traction).
    """

    def __init__(self, texts, clarity_scores, team_scores, traction_scores, tokenizer, max_length=512):
        """Keep references to the parallel inputs and the tokenizer settings.

        Args:
            texts: Indexable collection of transcripts; items are passed
                through ``str()`` before tokenization.
            clarity_scores: Indexable collection of clarity scores.
            team_scores: Indexable collection of team scores.
            traction_scores: Indexable collection of traction scores.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding=..., truncation=..., return_tensors=...)``.
            max_length: Length every sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts
        self.clarity_scores = clarity_scores
        self.team_scores = team_scores
        self.traction_scores = traction_scores

    def __len__(self):
        """Return the number of transcripts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize transcript ``idx`` and attach its three class labels.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            plus ``clarity_score``, ``team_score`` and ``traction_score`` as
            ``torch.long`` scalars.
        """
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output has a leading batch axis; flatten it away.
        sample = {
            field: torch.flatten(encoded[field])
            for field in ('input_ids', 'attention_mask')
        }

        # Scores are shifted down by one so that a 1-5 rating becomes a
        # 0-4 class index.
        label_columns = (
            ('clarity_score', self.clarity_scores),
            ('team_score', self.team_scores),
            ('traction_score', self.traction_scores),
        )
        for field, column in label_columns:
            sample[field] = torch.tensor(column[idx] - 1, dtype=torch.long)

        return sample
# 2. Model Definition (Using Bert model with gradient checkpointing)
class PitchEvaluationModel(nn.Module):
    """Shared transformer encoder feeding three 5-way classification heads.

    One linear head each is attached for clarity, team and traction; all
    three read the same dropout-regularized first-token representation.
    """

    def __init__(self, pretrained_model="bert-base-uncased", use_gradient_checkpointing=True):
        """Load the encoder and create the per-task heads.

        Args:
            pretrained_model: Name or path handed to
                ``AutoModel.from_pretrained``.
            use_gradient_checkpointing: When true, calls
                ``gradient_checkpointing_enable()`` on the encoder.
        """
        super().__init__()
        encoder = AutoModel.from_pretrained(pretrained_model)
        if use_gradient_checkpointing:
            encoder.gradient_checkpointing_enable()
        self.encoder = encoder

        self.dropout = nn.Dropout(0.3)

        # Every head maps the encoder's hidden size onto 5 classes.
        hidden_size = self.encoder.config.hidden_size
        num_classes = 5
        self.clarity_classifier = nn.Linear(hidden_size, num_classes)
        self.team_classifier = nn.Linear(hidden_size, num_classes)
        self.traction_classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return ``(clarity_logits, team_logits, traction_logits)``."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state (the CLS token) summarizes
        # the sequence for all three heads.
        cls_features = self.dropout(encoded.last_hidden_state[:, 0])
        return (
            self.clarity_classifier(cls_features),
            self.team_classifier(cls_features),
            self.traction_classifier(cls_features),
        )
# 3. Training Function (Includes learning rate scheduling and multi-task weighted loss)
def train_model(model, train_loader, val_loader, device, epochs=10,
                clarity_weight=1.0, team_weight=1.0, traction_weight=1.0,
                lr=2e-5, checkpoint_path="best_pitch_model.pt"):
    """Train ``model`` on the three scoring tasks and checkpoint the best epoch.

    After every epoch the model is scored on ``val_loader`` with
    ``evaluate_model``; whenever that score improves, the model's
    ``state_dict`` is written to ``checkpoint_path``.

    Args:
        model: Module returning ``(clarity_logits, team_logits,
            traction_logits)`` for ``(input_ids, attention_mask)``.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``clarity_score``, ``team_score`` and
            ``traction_score`` tensors.
        val_loader: Batches used for the per-epoch evaluation.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        clarity_weight: Multiplier of the clarity loss in the total loss.
        team_weight: Multiplier of the team loss in the total loss.
        traction_weight: Multiplier of the traction loss in the total loss.
        lr: Initial AdamW learning rate.
        checkpoint_path: Destination of the best ``state_dict``.

    Returns:
        The same ``model`` object holding the weights of the LAST epoch;
        load ``checkpoint_path`` to obtain the best-scoring weights.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    # StepLR: multiply the learning rate by 0.8 after every 2 epochs.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8)
    criterion = nn.CrossEntropyLoss()

    # Cohen's kappa can be zero or negative. Starting below its range makes
    # sure the first epoch always writes a checkpoint, so callers that load
    # ``checkpoint_path`` afterwards never hit a missing file.
    best_qwk = float("-inf")

    for epoch in range(epochs):
        model.train()  # evaluate_model() below leaves the model in eval mode
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            clarity_scores = batch['clarity_score'].to(device)
            team_scores = batch['team_score'].to(device)
            traction_scores = batch['traction_score'].to(device)

            optimizer.zero_grad()
            clarity_logits, team_logits, traction_logits = model(input_ids, attention_mask)

            # Multi-task weighted loss
            loss = (
                clarity_weight * criterion(clarity_logits, clarity_scores)
                + team_weight * criterion(team_logits, team_scores)
                + traction_weight * criterion(traction_logits, traction_scores)
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()  # Adjust learning rate once per epoch

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # Evaluate on validation set and save the best model
        qwk = evaluate_model(model, val_loader, device)
        if qwk > best_qwk:
            best_qwk = qwk
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Model saved with QWK: {best_qwk:.4f}")

    return model
# 4. Evaluation Function (Calculates QWK)
def evaluate_model(model, data_loader, device, num_classes=5):
    """Score ``model`` with quadratic weighted kappa (QWK) on every task.

    Args:
        model: Module returning ``(clarity_logits, team_logits,
            traction_logits)`` for ``(input_ids, attention_mask)``.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and the zero-based ``clarity_score``,
            ``team_score`` and ``traction_score`` label tensors.
        device: Device the input tensors are moved to.
        num_classes: Size of the rating scale; labels are compared on the
            one-based scale ``1..num_classes``.

    Returns:
        The unweighted mean of the three per-task QWK values.

    Note:
        The model is switched to eval mode and left that way.
    """
    model.eval()

    # Batch label keys, in the same order as the model's output tuple.
    task_keys = ('clarity_score', 'team_score', 'traction_score')
    predictions = {key: [] for key in task_keys}
    targets = {key: [] for key in task_keys}

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            task_logits = model(input_ids, attention_mask)

            for key, logits in zip(task_keys, task_logits):
                # Convert class indices back to the one-based rating scale.
                predictions[key].extend(torch.argmax(logits, dim=1).cpu().numpy() + 1)
                targets[key].extend(batch[key].cpu().numpy() + 1)

    # Pass the full scale explicitly. Without ``labels`` the weight matrix is
    # built from the positions of the labels that happen to occur, so a rating
    # missing from this split would shrink the penalty distances.
    scale = np.arange(1, num_classes + 1)
    clarity_qwk, team_qwk, traction_qwk = (
        cohen_kappa_score(targets[key], predictions[key], labels=scale, weights='quadratic')
        for key in task_keys
    )
    overall_qwk = (clarity_qwk + team_qwk + traction_qwk) / 3

    print("Evaluation Results:")
    print(f"Clarity QWK: {clarity_qwk:.4f}")
    print(f"Team Market Fit QWK: {team_qwk:.4f}")
    print(f"Traction QWK: {traction_qwk:.4f}")
    print(f"Overall QWK: {overall_qwk:.4f}")

    return overall_qwk
# 5. Main Function (Includes training, validation, and test set evaluation)
def main():
    """Run the pipeline: load data, train, then score the best model on test.

    Side effects: downloads the dataset and pretrained weights, writes
    ``yc_startup_pitches.csv`` and ``best_pitch_model.pt`` to the working
    directory, and prints training/evaluation metrics.
    """
    # A single name for tokenizer and encoder keeps the two from drifting
    # apart (previously the tokenizer came from a different checkpoint).
    model_name = "bert-base-uncased"
    # Path that train_model() writes its best checkpoint to by default.
    checkpoint_path = "best_pitch_model.pt"

    # Load data
    dataset = load_dataset("jasonhwan/yc-startup-pitches-with-scores", split="train")
    df = dataset.to_pandas()
    # Keep a CSV snapshot on disk and read it back.
    # NOTE(review): the read-back presumably only serves to normalize dtypes
    # through pandas' CSV inference - confirm before removing the round trip.
    df.to_csv("yc_startup_pitches.csv", index=False)
    df = pd.read_csv("yc_startup_pitches.csv")

    # Extract text and scores
    texts = df['transcript'].values
    clarity_scores = df['clarity_score'].values
    team_scores = df['team_market_fit_score'].values
    traction_scores = df['traction_validation_score'].values

    # Split dataset: 70% training, 15% validation, 15% testing
    (train_texts, temp_texts,
     train_clarity, temp_clarity,
     train_team, temp_team,
     train_traction, temp_traction) = train_test_split(
        texts, clarity_scores, team_scores, traction_scores, test_size=0.3, random_state=42
    )
    (val_texts, test_texts,
     val_clarity, test_clarity,
     val_team, test_team,
     val_traction, test_traction) = train_test_split(
        temp_texts, temp_clarity, temp_team, temp_traction, test_size=0.5, random_state=42
    )

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = PitchDataset(train_texts, train_clarity, train_team, train_traction, tokenizer)
    val_dataset = PitchDataset(val_texts, val_clarity, val_team, val_traction, tokenizer)
    test_dataset = PitchDataset(test_texts, test_clarity, test_team, test_traction, tokenizer)

    # Create DataLoaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)
    test_loader = DataLoader(test_dataset, batch_size=8)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = PitchEvaluationModel(pretrained_model=model_name, use_gradient_checkpointing=True)
    model.to(device)

    model = train_model(model, train_loader, val_loader, device, epochs=10)

    # train_model() returns the last-epoch weights; restore the best
    # validation checkpoint, mapped onto the active device.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Report held-out performance of the restored best model. The test split
    # was previously built but never evaluated.
    print("Test set performance:")
    evaluate_model(model, test_loader, device)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()