Spaces:

lshzhm
/

Video-to-Audio-and-Piano

Running

lshzhm committed 19 days ago

Commit

7d26b43

1 Parent(s): 53593c1

float32

Files changed (2) hide show

app.py CHANGED Viewed

@@ -201,13 +201,7 @@ def load(device):
     return e2tts, stft
-import copy
 e2tts, stft = load(device)
-video2roll_net = copy.deepcopy(e2tts.video2roll_net)
-e2tts = e2tts.half()
-e2tts.video2roll_net = video2roll_net
-del video2roll_net
 gc.collect()
@@ -262,7 +256,7 @@ def run(e2tts, stft, arg1, arg2, arg3, arg4, piano):
             l = mel_lengths[0]
             #cond = mel_spec.repeat(num, 1, 1)
-            cond = torch.randn(num, l, e2tts.num_channels).half()
             duration = torch.tensor([l]*num, dtype=torch.int32)
             lens = torch.tensor([l]*num, dtype=torch.int32)
             print(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3], "start")
@@ -280,7 +274,7 @@ def run(e2tts, stft, arg1, arg2, arg3, arg4, piano):
             outputs = outputs.reshape(1, -1, outputs.shape[-1])
             audio_final = e2tts.vocos.decode(outputs.transpose(-1,-2))
-            audio_final = audio_final.detach().cpu().float()
             torchaudio.save(audio_path, audio_final, sample_rate = e2tts.sampling_rate)

     return e2tts, stft
 e2tts, stft = load(device)
 gc.collect()
             l = mel_lengths[0]
             #cond = mel_spec.repeat(num, 1, 1)
+            cond = torch.randn(num, l, e2tts.num_channels)
             duration = torch.tensor([l]*num, dtype=torch.int32)
             lens = torch.tensor([l]*num, dtype=torch.int32)
             print(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3], "start")
             outputs = outputs.reshape(1, -1, outputs.shape[-1])
             audio_final = e2tts.vocos.decode(outputs.transpose(-1,-2))
+            audio_final = audio_final.detach().cpu()
             torchaudio.save(audio_path, audio_final, sample_rate = e2tts.sampling_rate)

src/e2_tts_pytorch/e2_tts_crossatt3.py CHANGED Viewed

@@ -2162,12 +2162,12 @@ class E2TTS(Module):
         batch, cond_seq_len, device = *cond.shape[:2], cond.device
         if frames is None:
-            frames_embed = torch.zeros(batch, cond_seq_len, NOTES, device=device).half()
         else:
             #### sampling settings
             train_video_encoder = True
             if train_video_encoder:
-                frames_embed = self.encode_frames(frames, cond_seq_len).half()
             else:
                 frames_embed = midis
             if frames_embed.shape[1] < cond_seq_len:

         batch, cond_seq_len, device = *cond.shape[:2], cond.device
         if frames is None:
+            frames_embed = torch.zeros(batch, cond_seq_len, NOTES, device=device)
         else:
             #### sampling settings
             train_video_encoder = True
             if train_video_encoder:
+                frames_embed = self.encode_frames(frames, cond_seq_len)
             else:
                 frames_embed = midis
             if frames_embed.shape[1] < cond_seq_len: