Spaces:

Hyeonsieun
/

Audio-to-LaTeX

Runtime error

App Files Files Community

Audio-to-LaTeX / app.py

Hyeonsieun

Update app.py

87ce87a verified about 1 year ago

raw

history blame contribute delete

5.4 kB

	import torch

	import gradio as gr
	from transformers import pipeline
	from transformers import T5ForConditionalGeneration, T5Tokenizer

	import re
	import os
	import json
	import requests
	import whisper
	from yt_dlp import YoutubeDL

	import matplotlib as plt

	#whisper_model = whisper.load_model('small')

	# Hugging Face checkpoint of the T5 model that the correction functions
	# prompt to turn spoken-formula text into LaTeX.
	path = "Hyeonsieun/NTtoGT_7epoch"
	tokenizer = T5Tokenizer.from_pretrained(path)
	model = T5ForConditionalGeneration.from_pretrained(path)


	# Speech-recognition checkpoint and the batch size passed on every call
	# to the ASR pipeline below.
	MODEL_NAME = "openai/whisper-large-v2"
	BATCH_SIZE = 8
	#FILE_LIMIT_MB = 1000

	# Shared automatic-speech-recognition pipeline; audio is processed in
	# 30-second chunks (chunk_length_s).
	pipe = pipeline(
	task="automatic-speech-recognition",
	model=MODEL_NAME,
	chunk_length_s=30,
	)


	def transcribe(inputs):
	    """Run the shared ASR pipeline on *inputs* and return the transcript.

	    Args:
	        inputs: Audio handed over by the Gradio audio component (the UI
	            passes a file path); ``None`` when nothing was submitted.

	    Returns:
	        The ``"text"`` entry of the pipeline output.

	    Raises:
	        gr.Error: If *inputs* is ``None``.
	    """
	    if inputs is None:
	        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

	    asr_output = pipe(
	        inputs,
	        batch_size=BATCH_SIZE,
	        generate_kwargs={"task": "transcribe"},
	        return_timestamps=True,
	    )
	    return asr_output["text"]

	def remove_spaces_within_dollar(text):
	    """Delete space characters inside every ``$...$`` span of *text*.

	    Spans are matched non-greedily, so each one ends at the nearest
	    following dollar sign. Text outside the spans, and a dollar sign that
	    has no partner on the same line, is returned unchanged.
	    """

	    def _squeeze(found):
	        # group(0) is the whole span, both dollar signs included.
	        return found.group(0).replace(' ', '')

	    # A callable replacement limits the change to the matched spans only.
	    return re.sub(r'\$(.*?)\$', _squeeze, text)


	def audio_correction(file):
	    """Transcribe an audio file and rewrite spoken formulas as LaTeX.

	    The transcript is split into fragments, each fragment is sent through
	    the T5 model with the translation prompt, and the decoded fragments
	    are joined before spaces inside ``$...$`` spans are removed.

	    Args:
	        file: Audio input forwarded to ``transcribe``.

	    Returns:
	        The converted transcript as a single string.

	    Raises:
	        gr.Error: Propagated from ``transcribe`` when no audio was given.
	    """
	    ASR_result = transcribe(file)
	    fragments = split_text_complex_rules_with_warning(ASR_result)
	    converted = []
	    for fragment in fragments:
	        prompt = f"translate the text pronouncing the formula to a LaTeX equation: {fragment}"
	        input_ids = tokenizer.encode(
	            prompt,
	            return_tensors='pt',
	            max_length=325,
	            padding='max_length',
	            truncation=True,
	        )
	        # Beam search with 5 beams.
	        output_ids = model.generate(
	            input_ids,
	            max_length=325,
	            num_beams=5,
	            early_stopping=True,
	        )
	        # Special tokens are kept on purpose (presumably the fine-tuned
	        # vocabulary relies on them - TODO confirm), so the wrapper tokens
	        # are removed by hand below.
	        decoded = tokenizer.decode(
	            output_ids[0],
	            skip_special_tokens=False,
	        )
	        # Strip the wrappers from every fragment. Slicing the joined text
	        # once would leave "</s><pad>" between fragments and would chop
	        # real characters whenever a wrapper is missing.
	        if decoded.startswith("<pad>"):
	            decoded = decoded[len("<pad>"):]
	        if decoded.endswith("</s>"):
	            decoded = decoded[:-len("</s>")]
	        converted.append(decoded)

	    return remove_spaces_within_dollar(''.join(converted))

	def youtubeASR(link):
	    """Download the audio of a YouTube video and return its transcript.

	    Args:
	        link: URL of the video, as typed into the text input.

	    Returns:
	        The ``"text"`` entry of the ASR pipeline output.

	    Raises:
	        gr.Error: If *link* is empty or blank.
	    """
	    # Function-scope import so the file's import block does not need to
	    # change for this fix.
	    import tempfile

	    if not link or not link.strip():
	        raise gr.Error("No YouTube link submitted! Please enter a video URL before submitting your request.")

	    # A fresh directory per request: a leftover file from an earlier
	    # download can never be picked up again, parallel requests do not
	    # overwrite each other, and the audio is deleted when we are done.
	    with tempfile.TemporaryDirectory() as work_dir:
	        out_fn = os.path.join(work_dir, 'temp1.mp3')

	        ydl_opts = {
	            'format': 'bestaudio/best',  # audio stream only
	            'outtmpl': out_fn,  # save under exactly this file name
	        }

	        with YoutubeDL(ydl_opts) as ydl:
	            ydl.download([link])

	        # The pipeline result is a dict; its "text" entry is already the
	        # transcript string and must not be indexed a second time.
	        script = pipe(
	            out_fn,
	            batch_size=BATCH_SIZE,
	            generate_kwargs={"task": "transcribe"},
	            return_timestamps=True,
	        )["text"]
	    return script

	def split_text_complex_rules_with_warning(text, max_chars=256):
	    """Split a transcript into fragments of at most *max_chars* characters.

	    The text is first cut after sentence-ending punctuation (``.``, ``?``,
	    ``!``) followed by whitespace. A sentence longer than *max_chars* is
	    cut again at commas. A fragment that is still too long is left out of
	    the result and reported through the ``logging`` module; empty
	    fragments are skipped silently.

	    Args:
	        text: Transcript to split.
	        max_chars: Maximum length of a returned fragment.

	    Returns:
	        List of stripped, non-empty fragments in their original order.
	    """
	    # Function-scope import so the file's import block does not need to
	    # change for this fix.
	    import logging

	    logger = logging.getLogger(__name__)
	    fragments = []
	    # Sentence boundaries: whitespace preceded by '.', '?' or '!'.
	    for part in re.split(r'(?<=[.?!])\s+', text):
	        if len(part) > max_chars:
	            # Too long for one fragment: fall back to comma boundaries.
	            candidates = re.split(r',\s*', part)
	        else:
	            candidates = [part]

	        for candidate in candidates:
	            trimmed = candidate.strip()
	            if not trimmed:
	                continue
	            if len(trimmed) <= max_chars:
	                fragments.append(trimmed)
	            else:
	                # Surface the dropped fragment instead of discarding the
	                # warning together with the text.
	                logger.warning(
	                    "문장 길이가 %d자를 초과합니다: %s... (길이: %d)",
	                    max_chars,
	                    trimmed[:50],
	                    len(trimmed),
	                )

	    return fragments


	def youtube_correction(link):
	    """Transcribe a YouTube video and rewrite spoken formulas as LaTeX.

	    The transcript is split into fragments, each fragment is sent through
	    the T5 model with the translation prompt, and the decoded fragments
	    are joined before spaces inside ``$...$`` spans are removed.

	    Args:
	        link: URL of the video, forwarded to ``youtubeASR``.

	    Returns:
	        The converted transcript as a single string.
	    """
	    ASR_result = youtubeASR(link)
	    fragments = split_text_complex_rules_with_warning(ASR_result)
	    converted = []
	    for fragment in fragments:
	        prompt = f"translate the text pronouncing the formula to a LaTeX equation: {fragment}"
	        input_ids = tokenizer.encode(
	            prompt,
	            return_tensors='pt',
	            max_length=325,
	            padding='max_length',
	            truncation=True,
	        )
	        # Beam search with 5 beams.
	        output_ids = model.generate(
	            input_ids,
	            max_length=325,
	            num_beams=5,
	            early_stopping=True,
	        )
	        # Special tokens are kept on purpose (presumably the fine-tuned
	        # vocabulary relies on them - TODO confirm), so the wrapper tokens
	        # are removed by hand below.
	        decoded = tokenizer.decode(
	            output_ids[0],
	            skip_special_tokens=False,
	        )
	        # Strip the wrappers from every fragment. Slicing the joined text
	        # once would leave "</s><pad>" between fragments and would chop
	        # real characters whenever a wrapper is missing.
	        if decoded.startswith("<pad>"):
	            decoded = decoded[len("<pad>"):]
	        if decoded.endswith("</s>"):
	            decoded = decoded[:-len("</s>")]
	        converted.append(decoded)

	    return remove_spaces_within_dollar(''.join(converted))


	# Root container of the app; the tabbed view is rendered inside it below.
	demo = gr.Blocks()

	# Tab 1: uploaded audio file -> converted transcript.
	file_transcribe = gr.Interface(
	    fn=audio_correction,
	    inputs=gr.components.Audio(sources="upload", type="filepath"),
	    outputs="text",
	)

	# Tab 2: YouTube link -> converted transcript.
	yt_transcribe = gr.Interface(
	    fn=youtube_correction,
	    inputs="text",
	    outputs="text",
	)

	# Both interfaces are built outside the Blocks context and only combined
	# into tabs once that context is open.
	with demo:
	    gr.TabbedInterface(
	        [file_transcribe, yt_transcribe],
	        ["Audio file", "YouTube"],
	    )

	demo.launch()