import whisper
from tempfile import NamedTemporaryFile  # unused by live code; kept for compatibility


class Transcription:
    """Transcribe a single audio source (a file path) with OpenAI Whisper."""

    def __init__(self, source):
        # Path to the audio file to transcribe; passed straight to whisper.
        # NOTE(review): presumably a filesystem path string — confirm against caller.
        self.source = source

    def transcribe(self, model):
        """Run language detection, then a full transcription of ``self.source``.

        Parameters
        ----------
        model : whisper.Whisper
            An already-loaded Whisper model (``whisper.load_model(...)``).

        Side effects
        ------------
        Populates ``self.raw_output`` / ``self.output`` (the whisper result
        dict, augmented with ``name`` and ``language`` keys) and
        ``self.segments`` (segment list with bulky ``tokens`` removed).
        """
        # Load the audio and pad/trim it to whisper's fixed context window —
        # this shortened clip is used only for language identification.
        audio = whisper.load_audio(self.source)
        audio = whisper.pad_or_trim(audio)

        # n_mels differs between model sizes (e.g. 80 vs 128 for large-v3),
        # so read it from the model's own dims instead of hard-coding it.
        mel = whisper.log_mel_spectrogram(
            audio, n_mels=model.dims.n_mels
        ).to(model.device)
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

        # Full transcription of the original (untrimmed) file.
        self.raw_output = model.transcribe(
            self.source,
            language=language,
            verbose=True,
            word_timestamps=True,
        )

        # Strip the raw token ids from every segment: they are bulky and
        # not needed by downstream consumers of the output.
        self.segments = self.raw_output['segments']
        for segment in self.segments:
            del segment['tokens']

        # BUG FIX: the original stored ``name=self.source[0]`` — for a path
        # string that is just its first character, a leftover from the old
        # ``self.audios[idx].name`` multi-file design.  Store the full path.
        self.raw_output.update(
            name=self.source,
            language=language,
        )
        self.output = self.raw_output