lshzhm committed on
Commit 1991049 · verified · 1 parent: 22398e8

Upload 141 files

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +50 -0
  2. .gitignore +8 -0
  3. LICENSE +21 -0
  4. README.md +78 -0
  5. app.py +86 -0
  6. requirements.txt +26 -0
  7. src/audeo/Midi_synth.py +165 -0
  8. src/audeo/README.md +67 -0
  9. src/audeo/Roll2MidiNet.py +139 -0
  10. src/audeo/Roll2MidiNet_enhance.py +164 -0
  11. src/audeo/Roll2Midi_dataset.py +160 -0
  12. src/audeo/Roll2Midi_dataset_tv2a_eval.py +118 -0
  13. src/audeo/Roll2Midi_evaluate.py +126 -0
  14. src/audeo/Roll2Midi_evaluate_tv2a.py +93 -0
  15. src/audeo/Roll2Midi_inference.py +100 -0
  16. src/audeo/Roll2Midi_train.py +280 -0
  17. src/audeo/Video2RollNet.py +264 -0
  18. src/audeo/Video2Roll_dataset.py +148 -0
  19. src/audeo/Video2Roll_evaluate.py +90 -0
  20. src/audeo/Video2Roll_inference.py +151 -0
  21. src/audeo/Video2Roll_solver.py +204 -0
  22. src/audeo/Video2Roll_train.py +26 -0
  23. src/audeo/Video_Id.md +30 -0
  24. src/audeo/balance_data.py +91 -0
  25. src/audeo/models/Video2Roll_50_0.4/14.pth +3 -0
  26. src/audeo/piano_coords.py +9 -0
  27. src/audeo/thumbnail_image.png +3 -0
  28. src/audeo/videomae_fintune.ipynb +0 -0
  29. src/audioldm/__init__.py +8 -0
  30. src/audioldm/__main__.py +183 -0
  31. src/audioldm/audio/__init__.py +2 -0
  32. src/audioldm/audio/audio_processing.py +100 -0
  33. src/audioldm/audio/stft.py +186 -0
  34. src/audioldm/audio/tools.py +85 -0
  35. src/audioldm/clap/__init__.py +0 -0
  36. src/audioldm/clap/encoders.py +170 -0
  37. src/audioldm/clap/open_clip/__init__.py +25 -0
  38. src/audioldm/clap/open_clip/bert.py +40 -0
  39. src/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  40. src/audioldm/clap/open_clip/factory.py +279 -0
  41. src/audioldm/clap/open_clip/feature_fusion.py +192 -0
  42. src/audioldm/clap/open_clip/htsat.py +1308 -0
  43. src/audioldm/clap/open_clip/linear_probe.py +66 -0
  44. src/audioldm/clap/open_clip/loss.py +398 -0
  45. src/audioldm/clap/open_clip/model.py +936 -0
  46. src/audioldm/clap/open_clip/model_configs/HTSAT-base.json +23 -0
  47. src/audioldm/clap/open_clip/model_configs/HTSAT-large.json +23 -0
  48. src/audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json +23 -0
  49. src/audioldm/clap/open_clip/model_configs/HTSAT-tiny.json +23 -0
  50. src/audioldm/clap/open_clip/model_configs/PANN-10.json +23 -0
.gitattributes ADDED
@@ -0,0 +1,50 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Video-to-Audio-and-Piano-HF/src/audeo/thumbnail_image.png filter=lfs diff=lfs merge=lfs -text
37
+ Video-to-Audio-and-Piano-HF/tests/piano_2h_cropped2_cuts/nwwHuxHMIpc.00000000.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ Video-to-Audio-and-Piano-HF/tests/piano_2h_cropped2_cuts/nwwHuxHMIpc.00000001.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ Video-to-Audio-and-Piano-HF/tests/scps/tango-master/data/audiocaps/train_audiocaps.json filter=lfs diff=lfs merge=lfs -text
40
+ Video-to-Audio-and-Piano-HF/tests/scps/tango-master/data/train_audioset_sl.json filter=lfs diff=lfs merge=lfs -text
41
+ Video-to-Audio-and-Piano-HF/tests/scps/tango-master/data/train_bbc_sound_effects.json filter=lfs diff=lfs merge=lfs -text
42
+ Video-to-Audio-and-Piano-HF/tests/scps/tango-master/data/train_val_audioset_sl.json filter=lfs diff=lfs merge=lfs -text
43
+ Video-to-Audio-and-Piano-HF/tests/scps/VGGSound/train.scp filter=lfs diff=lfs merge=lfs -text
44
+ Video-to-Audio-and-Piano-HF/tests/VGGSound/video/1u1orBeV4xI_000428.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ Video-to-Audio-and-Piano-HF/tests/VGGSound/video/1uCzQCdCC1U_000170.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ src/audeo/thumbnail_image.png filter=lfs diff=lfs merge=lfs -text
47
+ tests/piano_2h_cropped2_cuts/nwwHuxHMIpc.00000000.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ tests/piano_2h_cropped2_cuts/nwwHuxHMIpc.00000001.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ tests/VGGSound/video/1u1orBeV4xI_000428.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ tests/VGGSound/video/1uCzQCdCC1U_000170.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ **/__pycache__
2
+ src/audeo/data/
3
+ ckpts/
4
+ outputs/
5
+ outputs_piano/
6
+ outputs_vgg/
7
+ src/train*
8
+ src/inference3*
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Phil Wang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,78 @@
1
+ ---
2
+ title: DeepAudio-V1
3
+ emoji: 🔊
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
10
+
11
+
12
+ ## Enhance Generation Quality of Flow Matching V2A Model via Multi-Step CoT-Like Guidance and Combined Preference Optimization
13
+ ## Towards Video to Piano Music Generation with Chain-of-Perform Support Benchmarks
14
+
15
+ ## Results
16
+
17
+ **1. Results of Video-to-Audio Synthesis**
18
+
19
+ https://github.com/user-attachments/assets/d6761371-8fc2-427c-8b2b-6d2ac22a2db2
20
+
21
+ https://github.com/user-attachments/assets/50b33e54-8ba1-4fab-89d3-5a5cc4c22c9a
22
+
23
+ **2. Results of Video-to-Piano Synthesis**
24
+
25
+ https://github.com/user-attachments/assets/b6218b94-1d58-4dc5-873a-c3e8eef6cd67
26
+
27
+ https://github.com/user-attachments/assets/ebdd1d95-2d9e-4add-b61a-d181f0ae38d0
28
+
29
+
30
+ ## Installation
31
+
32
+ **1. Create a conda environment**
33
+
34
+ ```bash
35
+ conda create -n v2ap python=3.10
36
+ conda activate v2ap
37
+ ```
38
+
39
+ **2. Install requirements**
40
+
41
+ ```bash
42
+ pip install -r requirements.txt
43
+ ```
44
+
45
+
46
+ **Pretrained models**
47
+
48
+ The models are available at https://huggingface.co/lshzhm/Video-to-Audio-and-Piano/tree/main.
49
+
50
+
51
+ ## Inference
52
+
53
+ **1. Video-to-Audio inference**
54
+
55
+ ```bash
56
+ python src/inference_v2a.py
57
+ ```
58
+
59
+ **2. Video-to-Piano inference**
60
+
61
+ ```bash
62
+ python src/inference_v2p.py
63
+ ```
64
+
65
+ ## Dataset (in progress)
66
+
67
+
68
+ ## Metrics
69
+
70
+
71
+ ## Acknowledgement
72
+
73
+ - [Audeo](https://github.com/shlizee/Audeo) for video to midi prediction
74
+ - [E2TTS](https://github.com/lucidrains/e2-tts-pytorch) for CFM structure and base E2 implementation
75
+ - [FLAN-T5](https://huggingface.co/google/flan-t5-large) for FLAN-T5 text encoding
76
+ - [CLIP](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) for CLIP image encoding
77
+ - [AudioLDM Eval](https://github.com/haoheliu/audioldm_eval) for audio evaluation
78
+
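The pretrained checkpoints referenced under **Pretrained models** above can be fetched programmatically. A minimal download sketch using `huggingface_hub`; the `./ckpts` target directory is an assumption taken from `app.py`, not something the README mandates:

```python
# Minimal download sketch: fetch all pretrained checkpoints from the Hub.
# The ./ckpts layout mirrors app.py; adjust local_dir as needed.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="lshzhm/Video-to-Audio-and-Piano",
    local_dir="./ckpts",
)
print(f"Checkpoints downloaded to {local_dir}")
```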
app.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ try:
3
+ import torchaudio
4
+ except ImportError:
5
+ os.system("cd ./F5-TTS; pip install -e .")
6
+
7
+
8
+ import spaces
9
+ import logging
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+
13
+ import gradio as gr
14
+ import torch
15
+ import torchaudio
16
+
17
+ import tempfile
18
+
19
+ import requests
20
+ import shutil
21
+ import numpy as np
22
+
23
+ from huggingface_hub import hf_hub_download
24
+
25
+ model_path = "./ckpts/"
26
+
27
+ if not os.path.exists(model_path):
28
+ os.makedirs(model_path)
29
+
30
+ file_path = hf_hub_download(repo_id="lshzhm/Video-to-Audio-and-Piano", local_dir=model_path)
31
+
32
+ print(f"Model saved at: {file_path}")
33
+
34
+ log = logging.getLogger()
35
+
36
+
37
+ #@spaces.GPU(duration=120)
38
+ def video_to_audio(video: gr.Video, prompt: str, num_steps: int):
39
+
40
+
41
+ return video_save_path, video_gen
42
+
43
+
44
+ def video_to_piano(video: gr.Video, prompt: str, num_steps: int):
45
+
46
+ return video_save_path, video_gen
47
+
48
+
49
+ video_to_audio_and_speech_tab = gr.Interface(
50
+ fn=video_to_audio_and_speech,
51
+ description="""
52
+ Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
53
+ Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
54
+ """,
55
+ inputs=[
56
+ gr.Video(label="Input Video"),
57
+ gr.Text(label='Video-to-Audio Text Prompt'),
58
+ gr.Number(label='Video-to-Audio Num Steps', value=64, precision=0, minimum=1),
59
+ gr.Text(label='Video-to-Speech Transcription'),
60
+ gr.Audio(label='Video-to-Speech Speech Prompt'),
61
+ gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
62
+ gr.Number(label='Video-to-Speech Num Steps', value=64, precision=0, minimum=1),
63
+ ],
64
+ outputs=[
65
+ gr.Video(label="Video-to-Audio Output"),
66
+ gr.Video(label="Video-to-Speech Output"),
67
+ ],
68
+ cache_examples=False,
69
+ title='Video-to-Audio-and-Speech',
70
+ examples=[
71
+ [
72
+ './tests/VGGSound/video/1u1orBeV4xI_000428.mp4',
73
+ '',
74
+ 64,
75
+ ],
76
+ [
77
+ './tests/VGGSound/video/1uCzQCdCC1U_000170.mp4',
78
+ '',
79
+ 64,
80
+ ],
81
+ ])
82
+
83
+
84
+ if __name__ == "__main__":
85
+ gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).launch()
86
+
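The `video_to_audio` and `video_to_piano` handlers in `app.py` above are still placeholders, and the interface is bound to an undefined `video_to_audio_and_speech`. Below is a minimal sketch of how the two handlers could be wired as separate Gradio tabs; the stub bodies and the input/output layout are assumptions, not the final app:

```python
import gradio as gr

# Hypothetical stand-ins for the placeholder handlers in app.py;
# real implementations would run the V2A / V2P models and return the result video.
def video_to_audio(video, prompt, num_steps):
    return video  # placeholder: echo the input video

def video_to_piano(video, prompt, num_steps):
    return video  # placeholder: echo the input video

def make_tab(fn, title):
    # One gr.Interface per task, mirroring the inputs used in app.py.
    return gr.Interface(
        fn=fn,
        inputs=[
            gr.Video(label="Input Video"),
            gr.Text(label="Text Prompt"),
            gr.Number(label="Num Steps", value=64, precision=0, minimum=1),
        ],
        outputs=gr.Video(label=f"{title} Output"),
        title=title,
    )

if __name__ == "__main__":
    gr.TabbedInterface(
        [make_tab(video_to_audio, "Video-to-Audio"),
         make_tab(video_to_piano, "Video-to-Piano")],
        ["Video-to-Audio", "Video-to-Piano"],
    ).launch()
```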
requirements.txt ADDED
@@ -0,0 +1,26 @@
1
+ accelerate==0.34.2
2
+ beartype==0.18.5
3
+ einops==0.8.0
4
+ einx==0.3.0
5
+ ema-pytorch==0.6.2
6
+ g2p-en==2.1.0
7
+ jaxtyping==0.2.34
8
+ loguru==0.7.2
9
+ tensorboard==2.18.0
10
+ torch==2.4.1
11
+ torchaudio==2.4.1
12
+ torchdiffeq==0.2.4
13
+ torchlibrosa==0.1.0
14
+ torchmetrics==1.6.1
15
+ torchvision==0.19.1
16
+ numpy==1.23.5
17
+ tqdm==4.66.5
18
+ vocos==0.1.0
19
+ x-transformers==1.37.4
20
+ transformers==4.46.0
21
+ moviepy==1.0.3
22
+ jieba==0.42.1
23
+ pypinyin==0.44.0
24
+ progressbar==2.5
25
+ datasets==3.0.1
26
+ matplotlib==3.9.2
src/audeo/Midi_synth.py ADDED
@@ -0,0 +1,165 @@
1
+ import os
2
+ import numpy as np
3
+ os.environ["LD_PRELOAD"] = "/usr/lib/x86_64-linux-gnu/libffi.so.7"
4
+ import pretty_midi
5
+ import glob
6
+ import librosa
7
+ import soundfile as sf
8
+
9
+ # Synthesizing Audio using Fluid Synth
10
+ class MIDISynth():
11
+ def __init__(self, out_folder, video_name, instrument, midi=True):
12
+ self.video_name = video_name
13
+ # synthesize midi or roll
14
+ self.midi = False
15
+ # synthesized output dir, change to your own path
16
+ self.syn_dir = '/ailab-train/speech/shansizhe/audeo/data/Midi_Synth/training/'
17
+ self.min_key = 15
18
+ self.max_key = 65
19
+ self.frame = 50
20
+ self.piano_keys = 88
21
+ if self.midi:
22
+ self.midi_out_folder = out_folder + video_name
23
+ self.syn_dir = self.syn_dir + 'w_Roll2Midi/'
24
+ self.process_midi()
25
+ else:
26
+ self.est_roll_folder = out_folder + video_name
27
+ self.syn_dir = self.syn_dir + 'wo_Roll2Midi/'
28
+ self.process_roll()
29
+ self.spf = 0.04 # second per frame
30
+ self.sample_rate = 16000
31
+ self.ins = instrument
32
+
33
+ def process_roll(self):
34
+ self.wo_Roll2Midi_data = []
35
+ self.est_roll_files = glob.glob(self.est_roll_folder + '/*.npz')
36
+ self.est_roll_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
37
+
38
+ # Use the Roll prediction for Synthesis
39
+ print("need to process {0} files".format(len(self.est_roll_files)))
40
+ for i in range(len(self.est_roll_files)):
41
+ with np.load(self.est_roll_files[i]) as data:
42
+ est_roll = data['roll']
43
+ if est_roll.shape[0] != self.frame:
44
+ target = np.zeros((self.frame, self.piano_keys))
45
+ target[:est_roll.shape[0], :] = est_roll
46
+ est_roll = target
47
+ est_roll = np.where(est_roll > 0, 1, 0)
48
+ self.wo_Roll2Midi_data.append(est_roll)
49
+ self.complete_wo_Roll2Midi_midi = np.concatenate(self.wo_Roll2Midi_data)
50
+ print("Without Roll2MidiNet, the Roll result has shape:", self.complete_wo_Roll2Midi_midi.shape)
51
+ # compute onsets and offsets
52
+ onset = np.zeros(self.complete_wo_Roll2Midi_midi.shape)
53
+ offset = np.zeros(self.complete_wo_Roll2Midi_midi.shape)
54
+ for j in range(self.complete_wo_Roll2Midi_midi.shape[0]):
55
+ if j != 0:
56
+ onset[j][np.setdiff1d(self.complete_wo_Roll2Midi_midi[j].nonzero(),
57
+ self.complete_wo_Roll2Midi_midi[j - 1].nonzero())] = 1
58
+ offset[j][np.setdiff1d(self.complete_wo_Roll2Midi_midi[j - 1].nonzero(),
59
+ self.complete_wo_Roll2Midi_midi[j].nonzero())] = -1
60
+ else:
61
+ onset[j][self.complete_wo_Roll2Midi_midi[j].nonzero()] = 1
62
+ onset += offset
63
+ self.complete_wo_Roll2Midi_onset = onset.T
64
+ print("Without Roll2MidiNet, the onset has shape:", self.complete_wo_Roll2Midi_onset.shape)
65
+
66
+ def process_midi(self):
67
+ self.w_Roll2Midi_data = []
68
+ self.infer_out_files = glob.glob(self.midi_out_folder + '/*.npz')
69
+ self.infer_out_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
70
+
71
+ # Use the Midi prediction for Synthesis
72
+ for i in range(len(self.infer_out_files)):
73
+ with np.load(self.infer_out_files[i]) as data:
74
+ est_midi = data['midi']
75
+ target = np.zeros((self.frame, self.piano_keys))
76
+ target[:est_midi.shape[0], self.min_key:self.max_key+1] = est_midi
77
+ est_midi = target
78
+ est_midi = np.where(est_midi > 0, 1, 0)
79
+ self.w_Roll2Midi_data.append(est_midi)
80
+ self.complete_w_Roll2Midi_midi = np.concatenate(self.w_Roll2Midi_data)
81
+ print("With Roll2MidiNet Midi, the Midi result has shape:", self.complete_w_Roll2Midi_midi.shape)
82
+ # compute onsets and offsets
83
+ onset = np.zeros(self.complete_w_Roll2Midi_midi.shape)
84
+ offset = np.zeros(self.complete_w_Roll2Midi_midi.shape)
85
+ for j in range(self.complete_w_Roll2Midi_midi.shape[0]):
86
+ if j != 0:
87
+ onset[j][np.setdiff1d(self.complete_w_Roll2Midi_midi[j].nonzero(),
88
+ self.complete_w_Roll2Midi_midi[j - 1].nonzero())] = 1
89
+ offset[j][np.setdiff1d(self.complete_w_Roll2Midi_midi[j - 1].nonzero(),
90
+ self.complete_w_Roll2Midi_midi[j].nonzero())] = -1
91
+ else:
92
+ onset[j][self.complete_w_Roll2Midi_midi[j].nonzero()] = 1
93
+ onset += offset
94
+ self.complete_w_Roll2Midi_onset = onset.T
95
+ print("With Roll2MidiNet, the onset has shape:", self.complete_w_Roll2Midi_onset.shape)
96
+
97
+ def GetNote(self):
98
+ if self.midi:
99
+ self.w_Roll2Midi_notes = {}
100
+ for i in range(self.complete_w_Roll2Midi_onset.shape[0]):
101
+ tmp = self.complete_w_Roll2Midi_onset[i]
102
+ start = np.where(tmp == 1)[0]
103
+ end = np.where(tmp == -1)[0]
104
+ if len(start) != len(end):
105
+ end = np.append(end, tmp.shape)
106
+ merged_list = [(start[i], end[i]) for i in range(0, len(start))]
107
+ # 21 is the lowest piano key in the Midi note number (Midi has 128 notes)
108
+ self.w_Roll2Midi_notes[21 + i] = merged_list
109
+ else:
110
+ self.wo_Roll2Midi_notes = {}
111
+ for i in range(self.complete_wo_Roll2Midi_onset.shape[0]):
112
+ tmp = self.complete_wo_Roll2Midi_onset[i]
113
+ start = np.where(tmp==1)[0]
114
+ end = np.where(tmp==-1)[0]
115
+ if len(start)!=len(end):
116
+ end = np.append(end, tmp.shape)
117
+ merged_list = [(start[i], end[i]) for i in range(0, len(start))]
118
+ self.wo_Roll2Midi_notes[21 + i] = merged_list
119
+
120
+
121
+
122
+ def Synthesize(self):
123
+ if self.midi:
124
+ wav = self.generate_midi(self.w_Roll2Midi_notes, self.ins)
125
+ path = self.create_output_dir()
126
+ out_file = path + f'/Midi-{self.video_name}-{self.ins}.wav'
127
+ #librosa.output.write_wav(out_file, wav, sr=self.sample_rate)
128
+ sf.write(out_file, wav, self.sample_rate)
129
+ else:
130
+ wav = self.generate_midi(self.wo_Roll2Midi_notes, self.ins)
131
+ path = self.create_output_dir()
132
+ out_file = path + f'/Roll-{self.video_name}-{self.ins}.wav'
133
+ #librosa.output.write_wav(out_file, wav, sr=self.sample_rate)
134
+ sf.write(out_file, wav, self.sample_rate)
135
+
136
+ def generate_midi(self, notes, ins):
137
+ pm = pretty_midi.PrettyMIDI(initial_tempo=80)
138
+ piano_program = pretty_midi.instrument_name_to_program(ins) #Acoustic Grand Piano
139
+ piano = pretty_midi.Instrument(program=piano_program)
140
+ for key in list(notes.keys()):
141
+ values = notes[key]
142
+ for i in range(len(values)):
143
+ start, end = values[i]
144
+ note = pretty_midi.Note(velocity=100, pitch=key, start=start * self.spf, end=end * self.spf)
145
+ piano.notes.append(note)
146
+ pm.instruments.append(piano)
147
+ wav = pm.fluidsynth(fs=16000)
148
+ return wav
149
+
150
+ def create_output_dir(self):
151
+ synth_out_dir = os.path.join(self.syn_dir, self.video_name)
152
+ os.makedirs(synth_out_dir, exist_ok=True)
153
+ return synth_out_dir
154
+
155
+ if __name__ == "__main__":
156
+ # could select any instrument available in Midi
157
+ instrument = 'Acoustic Grand Piano'
158
+ for i in [1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27]:
159
+ video_name = f'{i}'
160
+ #print(video_name)
161
+ Midi_out_folder = '/ailab-train/speech/shansizhe/audeo/data/estimate_Roll/training/'# Generated Midi output folder, change to your own path
162
+ Synth = MIDISynth(Midi_out_folder, video_name, instrument)
163
+ Synth.GetNote()
164
+ Synth.Synthesize()
165
+
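The onset/offset loops in `process_roll` and `process_midi` above mark +1 where a key turns on and -1 where it turns off between consecutive frames. A small NumPy sketch of the same idea on a toy roll (frames x keys), for illustration only:

```python
import numpy as np

# Toy piano roll: 5 frames x 3 keys, 1 = key pressed in that frame.
roll = np.array([[0, 1, 0],
                 [1, 1, 0],
                 [1, 0, 0],
                 [0, 0, 1],
                 [0, 0, 1]])

onset = np.zeros_like(roll)
offset = np.zeros_like(roll)
for j in range(roll.shape[0]):
    if j == 0:
        onset[j][roll[j].nonzero()] = 1
    else:
        # keys active now but not in the previous frame -> note starts
        onset[j][np.setdiff1d(roll[j].nonzero(), roll[j - 1].nonzero())] = 1
        # keys active before but not now -> note ends
        offset[j][np.setdiff1d(roll[j - 1].nonzero(), roll[j].nonzero())] = -1
onset += offset          # +1 = note start, -1 = note end, per key and frame
print(onset.T)           # keys x frames, the orientation GetNote() consumes
```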
src/audeo/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # Audeo
2
+
3
+ ## Introduction
4
+ This repository contains the code for the paper **"Audeo: Audio Generation for a Silent Performance Video"**, which is available [here](https://proceedings.neurips.cc/paper/2020/file/227f6afd3b7f89b96c4bb91f95d50f6d-Paper.pdf), published at NeurIPS 2020. More samples can be found on our [project webpage](http://faculty.washington.edu/shlizee/audeo/) and [Youtube Video](https://www.youtube.com/watch?v=8rS3VgjG7_c).
5
+
6
+ [![Alt text](https://img.youtube.com/vi/8rS3VgjG7_c/0.jpg)](https://www.youtube.com/watch?v=8rS3VgjG7_c)
7
+
8
+ ## Abstract
9
+ We present a novel system that gets as an input, video frames of a musician playing the piano, and generates the music for that video. The generation of music from
10
+ visual cues is a challenging problem and it is not clear whether it is an attainable goal at all. Our main aim in this work is to explore the plausibility of such a
11
+ transformation and to identify cues and components able to carry the association of sounds with visual events. To achieve the transformation we built a full pipeline
12
+ named ‘Audeo’ containing three components. We first translate the video frames of the keyboard and the musician hand movements into raw mechanical musical
13
+ symbolic representation Piano-Roll (Roll) for each video frame which represents the keys pressed at each time step. We then adapt the Roll to be amenable for audio
14
+ synthesis by including temporal correlations. This step turns out to be critical for meaningful audio generation. In the last step, we implement Midi synthesizers
15
+ to generate realistic music. Audeo converts video to audio smoothly and clearly with only a few setup constraints. We evaluate Audeo on piano performance videos
16
+ collected from Youtube and obtain that their generated music is of reasonable audio quality and can be successfully recognized with high precision by popular
17
+ music identification software.
18
+
19
+ ## Data
20
+ We use Youtube Channel videos recorded by [Paul Barton](https://www.youtube.com/user/PaulBartonPiano) to evaluate the Audeo pipeline. For **Pseudo Midi Evaluation**, we use 24 videos of Bach Well-Tempered Clavier Book One (WTC B1). The testing set contains the first 3 Prelude and Fugue performances of Bach Well-Tempered Clavier Book Two (WTC B2). The Youtube Video Ids can be found [here](https://github.com/shlizee/Audeo/blob/master/Video_Id.md). For **Audio Evaluation**, we use 35 videos from WTC B2 (24 Prelude and Fugue pairs and their 11 variants), 8 videos from WTC B1 Variants, and 9 videos from other composers. Since we cannot host the videos due to copyright issues, you need to download the videos yourself.
21
+
22
+ All videos are set at a frame rate of 25 fps and an audio sampling rate of 16 kHz. The **Pseudo GT Midi** is obtained via the [Onsets and Frames framework (OF)](https://github.com/magenta/magenta/tree/master/magenta/models/onsets_frames_transcription). We process all videos, keep only the full keyboard, and remove all frames that do not contribute to the piano performance (e.g., logos, black screens, etc.). The **cropped piano coordinates** can be found [here](https://github.com/shlizee/Audeo/blob/master/piano_coords.py) (the order is the same as in the **Video_Id** file). We trim the initial silent section up to the first frame in which the first key is pressed, to align the video, the Pseudo GT Midi, and the audio. All silent frames inside each performance are kept.
23
+
24
+ For your convenience, we provide the following folders/files in [Google Drive](https://drive.google.com/drive/folders/1w9wsZM-tPPUVqwdpsefEkrDgkN3kfg7G?usp=sharing):
25
+ - **input_images**: examples of how the image data should look.
26
+ - **labels**: training and testing labels for Video2Roll Net. Each folder contains a **pkl** file for one video. The labels are dictionaries where the **key** is the **frame number** and the **value** is an 88-dim vector. See **Video2Roll_dataset.py** for more details.
27
+ - **OF_midi_files**: the original Pseudo ground truth midi files obtained from **Onsets and Frames Framework**.
28
+ - **midi**: we process the Pseudo GT Midi files into 2D matrices (piano keys x time) down-sampled to 25 fps. Then, for each video, we divide the matrix into multiple 2-second (50-frame) segments. For example, **253-303.npz** contains the 2D matrix from frame 253 to frame 302.
29
+ - **estimate_Roll**: the **Roll** predictions obtained from **Video2Roll Net**. Same format as the **midi**. You can directly use them for training **Roll2Midi Net**.
30
+ - **Roll2Midi_results**: the **Midi** predictions obtained from **Roll2Midi Net**. Same format as the **midi** and **estimate_Roll**. Ready for **Midi Synth**.
31
+ - **Midi_Synth**: synthesized audio from **Roll2Midi_results**.
32
+ - **Video2Roll_models**: contains the pre-trained **Video2RollNet.pth**.
33
+ - **Roll2Midi_models**: contains the pre-trained **Roll2Midi Net**.
34
+
35
+ ## How to Use
36
+ - Video2Roll Net
37
+ 1. Please check the **Video2Roll_dataset.py** and make sure you satisfy the data formats.
38
+ 2. Run **Video2Roll_train.py** for training.
39
+ 3. Run **Video2Roll_evaluate.py** for evaluation.
40
+ 4. Run **Video2Roll_inference.py** to generate **Roll** predictions.
41
+ - Roll2Midi Net
42
+ 1. Run **Roll2Midi_train.py** for training.
43
+ 2. Run **Roll2Midi_evaluate.py** for evaluation.
44
+ 3. Run **Roll2Midi_inference.py** to generate **Midi** predictions.
45
+ - Midi Synth
46
+ 1. Run **Midi_synth.py** to use **Fluid Synth** to synthesize audio.
47
+
48
+ ## Requirements
49
+ - Pytorch >= 1.6
50
+ - Python 3
51
+ - numpy 1.19
52
+ - scikit-learn 0.22.1
53
+ - librosa 0.7.1
54
+ - pretty-midi 0.2.8
55
+
56
+ ## Citation
57
+
58
+ Please cite ["Audeo: Audio Generation for a Silent Performance Video"](https://proceedings.neurips.cc/paper/2020/file/227f6afd3b7f89b96c4bb91f95d50f6d-Paper.pdf) when you use this code:
59
+ ```
60
+ @article{su2020audeo,
61
+ title={Audeo: Audio generation for a silent performance video},
62
+ author={Su, Kun and Liu, Xiulong and Shlizerman, Eli},
63
+ journal={Advances in Neural Information Processing Systems},
64
+ volume={33},
65
+ year={2020}
66
+ }
67
+ ```
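A minimal sketch of chaining the three stages listed under **How to Use** above from Python, assuming the hard-coded data paths inside each script have already been edited to point at your own data:

```python
import subprocess

# Audeo pipeline: video frames -> Roll -> Midi -> synthesized audio.
for script in ("Video2Roll_inference.py",   # video frames to Roll predictions
               "Roll2Midi_inference.py",    # Roll to Midi predictions
               "Midi_synth.py"):            # Midi to audio via FluidSynth
    subprocess.run(["python", f"src/audeo/{script}"], check=True)
```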
src/audeo/Roll2MidiNet.py ADDED
@@ -0,0 +1,139 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+ ##############################
5
+ # U-NET
6
+ ##############################
7
+ class UNetDown(nn.Module):
8
+ def __init__(self, in_size, out_size, normalize=True, dropout=0.0):
9
+ super(UNetDown, self).__init__()
10
+ model = [nn.Conv2d(in_size, out_size, 3, stride=1, padding=1, bias=False)]
11
+ if normalize:
12
+ model.append(nn.BatchNorm2d(out_size, 0.8))
13
+ model.append(nn.LeakyReLU(0.2))
14
+ if dropout:
15
+ model.append(nn.Dropout(dropout))
16
+
17
+ self.model = nn.Sequential(*model)
18
+
19
+ def forward(self, x):
20
+ return self.model(x)
21
+
22
+
23
+ class UNetUp(nn.Module):
24
+ def __init__(self, in_size, out_size, dropout=0.0):
25
+ super(UNetUp, self).__init__()
26
+ model = [
27
+ nn.ConvTranspose2d(in_size, out_size, 3, stride=1, padding=1, bias=False),
28
+ nn.BatchNorm2d(out_size, 0.8),
29
+ nn.ReLU(inplace=True),
30
+ ]
31
+ if dropout:
32
+ model.append(nn.Dropout(dropout))
33
+
34
+ self.model = nn.Sequential(*model)
35
+
36
+ def forward(self, x, skip_input):
37
+ x = self.model(x)
38
+ out = torch.cat((x, skip_input), 1)
39
+ return out
40
+
41
+
42
+ class Generator(nn.Module):
43
+ def __init__(self, input_shape):
44
+ super(Generator, self).__init__()
45
+ channels, _ , _ = input_shape
46
+ self.down1 = UNetDown(channels, 64, normalize=False)
47
+ self.down2 = UNetDown(64, 128)
48
+ self.down3 = UNetDown(128, 256, dropout=0.5)
49
+ self.down4 = UNetDown(256, 512, dropout=0.5)
50
+ self.down5 = UNetDown(512, 1024, dropout=0.5)
51
+ self.down6 = UNetDown(1024, 1024, dropout=0.5)
52
+
53
+ self.up1 = UNetUp(1024, 512, dropout=0.5)
54
+ self.up2 = UNetUp(1024+512, 256, dropout=0.5)
55
+ self.up3 = UNetUp(512+256, 128, dropout=0.5)
56
+ self.up4 = UNetUp(256+128, 64)
57
+ self.up5 = UNetUp(128+64, 16)
58
+ self.conv1d = nn.Conv2d(80, 1, kernel_size=1)
59
+
60
+ def forward(self, x):
61
+ # U-Net generator with skip connections from encoder to decoder
62
+ d1 = self.down1(x)
63
+
64
+ d2 = self.down2(d1)
65
+
66
+ d3 = self.down3(d2)
67
+
68
+ d4 = self.down4(d3)
69
+
70
+ d5 = self.down5(d4)
71
+
72
+ d6 = self.down6(d5)
73
+
74
+ u1 = self.up1(d6, d5)
75
+
76
+ u2 = self.up2(u1, d4)
77
+
78
+ u3 = self.up3(u2, d3)
79
+
80
+ u4 = self.up4(u3, d2)
81
+
82
+ u5 = self.up5(u4, d1)
83
+
84
+ out = self.conv1d(u5)
85
+
86
+ out = F.sigmoid(out)
87
+ return out
88
+
89
+
90
+ class Discriminator(nn.Module):
91
+ def __init__(self, input_shape):
92
+ super(Discriminator, self).__init__()
93
+
94
+ channels, height, width = input_shape #1 51 50
95
+
96
+ # Calculate output of image discriminator (PatchGAN)
97
+ patch_h, patch_w = int(height / 2 ** 3)+1, int(width / 2 ** 3)+1
98
+ self.output_shape = (1, patch_h, patch_w)
99
+
100
+ def discriminator_block(in_filters, out_filters, stride, normalize):
101
+ """Returns layers of each discriminator block"""
102
+ layers = [nn.Conv2d(in_filters, out_filters, 3, stride, 1)]
103
+ if normalize:
104
+ layers.append(nn.InstanceNorm2d(out_filters))
105
+ layers.append(nn.LeakyReLU(0.2, inplace=True))
106
+ return layers
107
+
108
+ layers = []
109
+ in_filters = channels
110
+ for out_filters, stride, normalize in [(64, 2, False), (128, 2, True), (256, 2, True), (512, 1, True)]:
111
+ layers.extend(discriminator_block(in_filters, out_filters, stride, normalize))
112
+ in_filters = out_filters
113
+
114
+ layers.append(nn.Conv2d(out_filters, 1, 3, 1, 1))
115
+
116
+ self.model = nn.Sequential(*layers)
117
+
118
+ def forward(self, img):
119
+ return self.model(img)
120
+
121
+ def weights_init_normal(m):
122
+ classname = m.__class__.__name__
123
+ if classname.find("Conv") != -1:
124
+ torch.nn.init.normal_(m.weight.data, 0.0, 0.02)
125
+ elif classname.find("BatchNorm2d") != -1:
126
+ torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
127
+ torch.nn.init.constant_(m.bias.data, 0.0)
128
+
129
+ if __name__ == "__main__":
130
+ input_shape = (1,51, 100)
131
+ gnet = Generator(input_shape)
132
+ dnet = Discriminator(input_shape)
133
+ print(dnet.output_shape)
134
+ imgs = torch.rand((64,1,51,100))
135
+ gen = gnet(imgs)
136
+ print(gen.shape)
137
+ dis = dnet(gen)
138
+ print(dis.shape)
139
+
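`Roll2MidiNet.py` above only defines the U-Net `Generator` and the PatchGAN `Discriminator`; the actual optimization lives in `Roll2Midi_train.py`, which is not part of this 50-file view. The sketch below is a generic pix2pix-style training step using these two modules, shown as an assumption about how they fit together rather than a copy of the training script:

```python
import torch
import torch.nn as nn
from Roll2MidiNet import Generator, Discriminator

input_shape = (1, 51, 100)                  # channels, keys, frames (as in __main__)
G, D = Generator(input_shape), Discriminator(input_shape)
adv_loss, rec_loss = nn.BCEWithLogitsLoss(), nn.L1Loss()
opt_G = torch.optim.Adam(G.parameters(), lr=2e-4)
opt_D = torch.optim.Adam(D.parameters(), lr=2e-4)

roll = torch.rand(8, *input_shape)                          # Roll logits (input)
gt_midi = torch.randint(0, 2, (8, *input_shape)).float()    # target Midi
valid = torch.ones((8, *D.output_shape))                    # PatchGAN "real" labels
fake = torch.zeros((8, *D.output_shape))                    # PatchGAN "fake" labels

# Generator step: fool the discriminator and stay close to the target Midi.
opt_G.zero_grad()
gen_midi = G(roll)
g_loss = adv_loss(D(gen_midi), valid) + 100 * rec_loss(gen_midi, gt_midi)
g_loss.backward()
opt_G.step()

# Discriminator step: separate ground-truth Midi from generated Midi.
opt_D.zero_grad()
d_loss = 0.5 * (adv_loss(D(gt_midi), valid) + adv_loss(D(gen_midi.detach()), fake))
d_loss.backward()
opt_D.step()
```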
src/audeo/Roll2MidiNet_enhance.py ADDED
@@ -0,0 +1,164 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ import torch
4
+ ##############################
5
+ # U-NET
6
+ ##############################
7
+ class UNetDown(nn.Module):
8
+ def __init__(self, in_size, out_size, normalize=True, dropout=0.0):
9
+ super(UNetDown, self).__init__()
10
+ model = [nn.Conv2d(in_size, out_size, 3, stride=1, padding=1, bias=False)]
11
+ if normalize:
12
+ model.append(nn.BatchNorm2d(out_size, 0.8))
13
+ model.append(nn.LeakyReLU(0.2))
14
+ if dropout:
15
+ model.append(nn.Dropout(dropout))
16
+
17
+ self.model = nn.Sequential(*model)
18
+
19
+ def forward(self, x):
20
+ return self.model(x)
21
+
22
+
23
+ class UNetUp(nn.Module):
24
+ def __init__(self, in_size, out_size, dropout=0.0):
25
+ super(UNetUp, self).__init__()
26
+ model = [
27
+ nn.ConvTranspose2d(in_size, out_size, 3, stride=1, padding=1, bias=False),
28
+ nn.BatchNorm2d(out_size, 0.8),
29
+ nn.ReLU(inplace=True),
30
+ ]
31
+ if dropout:
32
+ model.append(nn.Dropout(dropout))
33
+
34
+ self.model = nn.Sequential(*model)
35
+
36
+ def forward(self, x, skip_input):
37
+ x = self.model(x)
38
+ out = torch.cat((x, skip_input), 1)
39
+ return out
40
+
41
+ class AttentionGate(nn.Module):
42
+ def __init__(self, in_channels, g_channels, out_channels):
43
+ super(AttentionGate, self).__init__()
44
+ self.theta_x = nn.Conv2d(in_channels, out_channels, kernel_size=1)
45
+ self.phi_g = nn.Conv2d(g_channels, out_channels, kernel_size=1)
46
+ self.psi = nn.Conv2d(out_channels, 1, kernel_size=1)
47
+ self.sigmoid = nn.Sigmoid()
48
+
49
+ def forward(self, x, g):
50
+ theta_x = self.theta_x(x)
51
+ phi_g = self.phi_g(g)
52
+ f = theta_x + phi_g
53
+ f = self.psi(f)
54
+ alpha = self.sigmoid(f)
55
+ return x * alpha
56
+
57
+ class Generator(nn.Module):
58
+ def __init__(self, input_shape):
59
+ super(Generator, self).__init__()
60
+ channels, _ , _ = input_shape
61
+ self.down1 = UNetDown(channels, 64, normalize=False)
62
+ self.down2 = UNetDown(64, 128)
63
+ self.down3 = UNetDown(128, 256, dropout=0.5)
64
+ self.down4 = UNetDown(256, 512, dropout=0.5)
65
+ self.down5 = UNetDown(512, 1024, dropout=0.5)
66
+ self.down6 = UNetDown(1024, 1024, dropout=0.5)
67
+
68
+ # Attention Gates
69
+ self.att1 = AttentionGate(2048, 1024, 512)
70
+ self.att2 = AttentionGate(1024, 512, 256)
71
+ self.att3 = AttentionGate(512, 256, 128)
72
+ self.att4 = AttentionGate(256, 128, 64)
73
+
74
+ self.up1 = UNetUp(1024, 1024, dropout=0.5)
75
+ self.up2 = UNetUp(2048, 512, dropout=0.5)
76
+ self.up3 = UNetUp(1024, 256, dropout=0.5)
77
+ self.up4 = UNetUp(512, 128)
78
+ self.up5 = UNetUp(256, 64)
79
+ self.conv1d = nn.Conv2d(128, 1, kernel_size=1)
80
+
81
+ def forward(self, x):
82
+ # U-Net generator with skip connections from encoder to decoder
83
+ d1 = self.down1(x)
84
+
85
+ d2 = self.down2(d1)
86
+
87
+ d3 = self.down3(d2)
88
+
89
+ d4 = self.down4(d3)
90
+
91
+ d5 = self.down5(d4)
92
+
93
+ d6 = self.down6(d5)
94
+
95
+ u1 = self.up1(d6, d5)
96
+ u1 = self.att1(u1, d5)
97
+
98
+ u2 = self.up2(u1, d4)
99
+ u2 = self.att2(u2, d4)
100
+
101
+ u3 = self.up3(u2, d3)
102
+ u3 = self.att3(u3, d3)
103
+
104
+ u4 = self.up4(u3, d2)
105
+ u4 = self.att4(u4, d2)
106
+
107
+ u5 = self.up5(u4, d1)
108
+
109
+ out = self.conv1d(u5)
110
+
111
+ out = F.sigmoid(out)
112
+ return out
113
+
114
+
115
+ class Discriminator(nn.Module):
116
+ def __init__(self, input_shape):
117
+ super(Discriminator, self).__init__()
118
+
119
+ channels, height, width = input_shape #1 51 50
120
+
121
+ # Calculate output of image discriminator (PatchGAN)
122
+ patch_h, patch_w = int(height / 2 ** 3)+1, int(width / 2 ** 3)+1
123
+ self.output_shape = (1, patch_h, patch_w)
124
+
125
+ def discriminator_block(in_filters, out_filters, stride, normalize):
126
+ """Returns layers of each discriminator block"""
127
+ layers = [nn.Conv2d(in_filters, out_filters, 3, stride, 1)]
128
+ if normalize:
129
+ layers.append(nn.InstanceNorm2d(out_filters))
130
+ layers.append(nn.LeakyReLU(0.2, inplace=True))
131
+ return layers
132
+
133
+ layers = []
134
+ in_filters = channels
135
+ for out_filters, stride, normalize in [(64, 2, False), (128, 2, True), (256, 2, True), (512, 1, True)]:
136
+ layers.extend(discriminator_block(in_filters, out_filters, stride, normalize))
137
+ in_filters = out_filters
138
+
139
+ layers.append(nn.Conv2d(out_filters, 1, 3, 1, 1))
140
+
141
+ self.model = nn.Sequential(*layers)
142
+
143
+ def forward(self, img):
144
+ return self.model(img)
145
+
146
+ def weights_init_normal(m):
147
+ classname = m.__class__.__name__
148
+ if classname.find("Conv") != -1:
149
+ torch.nn.init.normal_(m.weight.data, 0.0, 0.02)
150
+ elif classname.find("BatchNorm2d") != -1:
151
+ torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
152
+ torch.nn.init.constant_(m.bias.data, 0.0)
153
+
154
+ if __name__ == "__main__":
155
+ input_shape = (1,51, 100)
156
+ gnet = Generator(input_shape)
157
+ dnet = Discriminator(input_shape)
158
+ print(dnet.output_shape)
159
+ imgs = torch.rand((64,1,51,100))
160
+ gen = gnet(imgs)
161
+ print(gen.shape)
162
+ dis = dnet(gen)
163
+ print(dis.shape)
164
+
src/audeo/Roll2Midi_dataset.py ADDED
@@ -0,0 +1,160 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import torch
4
+ from torch.utils.data import Dataset,DataLoader
5
+ import glob
6
+ print(torch.cuda.current_device())
7
+ DEFAULT_DEVICE = 'cuda'
8
+
9
+ torch.cuda.set_device(0)
10
+
11
+ frames = 50 #2 seconds
12
+
13
+ min_key = 15
14
+ max_key = 65
15
+
16
+ class Roll2MidiDataset(Dataset):
17
+ def __init__(self, path='/ailab-train/speech/shansizhe/audeo/data/midi_npz', est_roll_path='/ailab-train/speech/shansizhe/audeo/data/estimate_Roll_exp3',
18
+ train=True, device=DEFAULT_DEVICE):
19
+ self.path = path
20
+ self.est_roll_path = est_roll_path
21
+ self.device = device
22
+ self.train = train
23
+ self.load_data()
24
+ def __getitem__(self, index):
25
+ if self.train:
26
+ gt, roll = self.final_data['train'][index]
27
+ else:
28
+ gt, roll = self.final_data['test'][index]
29
+ gt_ = gt.T.float().to(self.device)
30
+ roll_ = roll.T.float().to(self.device)
31
+ return torch.unsqueeze(gt_, dim=0), torch.unsqueeze(torch.sigmoid(roll_), dim=0)
32
+
33
+ def __len__(self):
34
+ if self.train:
35
+ return len(self.final_data['train'])
36
+ else:
37
+ return len(self.final_data['test'])
38
+
39
+ def load_data(self):
40
+ self.files = []
41
+ self.labels = []
42
+
43
+ # ground truth midi dir
44
+ path = self.path
45
+ #print(path)
46
+ train_gt_folders = glob.glob(path + '/training/*')
47
+ train_gt_folders.sort(key=lambda x: int(x.split('/')[-1]))
48
+ print(train_gt_folders)
49
+ test_gt_folders = glob.glob(path + '/testing/*')
50
+ test_gt_folders.sort(key=lambda x: int(x.split('/')[-1]))
51
+ print(test_gt_folders)
52
+
53
+ # Roll predictions dir
54
+ train_roll_folder = glob.glob(self.est_roll_path + '/training/*')
55
+ train_roll_folder.sort(key=lambda x: int(x.split('/')[-1]))
56
+ print(train_roll_folder)
57
+ test_roll_folder = glob.glob(self.est_roll_path + '/testing/*')
58
+ test_roll_folder.sort(key=lambda x: int(x.split('/')[-1]))
59
+ print(test_roll_folder)
60
+
61
+ # self.folders: dictionary
62
+ # key: train/test, values: list of tuples [(ground truth midi folder name, roll prediction folder name)]
63
+ self.folders = {}
64
+ self.folders['train'] = [(train_gt_folders[i], train_roll_folder[i]) for i in range(len(train_gt_folders))]
65
+ print(self.folders['train'])
66
+ self.folders['test'] = [(test_gt_folders[i], test_roll_folder[i]) for i in range(len(test_gt_folders))]
67
+ print(self.folders['test'])
68
+
69
+ # self.data: dictionary
70
+ # key: train/test, value:list of tuples [(2 sec ground truth Midi, 2 sec Roll prediction logits)]
71
+ self.data = {}
72
+ self.data['train'] = []
73
+ self.data['test'] = []
74
+
75
+ # self.final_data: similar to the data, but concat two continuous 2 sec Roll prediction (4 seconds, 100 frames)
76
+ self.final_data = {}
77
+ self.final_data['train'] = []
78
+ self.final_data['test'] = []
79
+
80
+ # load training data
81
+ for train_gt_folder, est_roll_folder in self.folders['train']:
82
+ gt_files = glob.glob(train_gt_folder + '/*.npz')
83
+ gt_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0].split('_')[1]))
84
+ est_roll_files = glob.glob(est_roll_folder + '/*.npz')
85
+ est_roll_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
86
+ print("have the same files of training gt and est roll:", len(gt_files) == len(est_roll_files))
87
+ for i in range(len(gt_files)):
88
+ with np.load(gt_files[i]) as data:
89
+ gt = data['midi'][:, min_key:max_key + 1]
90
+ if gt.shape[0] != frames:
91
+ target = np.zeros((frames, max_key-min_key+1))
92
+ target[:gt.shape[0], :] = gt
93
+ gt = target
94
+ gt = np.where(gt > 0, 1, 0)
95
+ with np.load(est_roll_files[i]) as data:
96
+ est_roll_logit = data['logit'][:, min_key:max_key + 1]
97
+ if est_roll_logit.shape[0] != frames:
98
+ target = np.zeros((frames, max_key-min_key+1))
99
+ target[:est_roll_logit.shape[0], :] = est_roll_logit
100
+ est_roll_logit = target
101
+ self.data['train'].append((torch.from_numpy(gt), torch.from_numpy(est_roll_logit)))
102
+ # make 4 sec data
103
+ for i in range(len(self.data['train'])):
104
+ if i + 1 < len(self.data['train']):
105
+ one_gt, one_roll = self.data['train'][i]
106
+ two_gt, two_roll = self.data['train'][i + 1]
107
+ final_gt = torch.cat([one_gt, two_gt], dim=0)
108
+ final_roll = torch.cat([one_roll, two_roll], dim=0)
109
+ self.final_data['train'].append((final_gt, final_roll))
110
+
111
+ print("total number of training data:", len(self.final_data['train']))
112
+
113
+ # load testing data
114
+ for test_gt_folder, est_roll_folder in self.folders['test']:
115
+ gt_files = glob.glob(test_gt_folder + '/*.npz')
116
+ gt_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0].split('_')[1]))
117
+ est_roll_files = glob.glob(est_roll_folder + '/*.npz')
118
+ est_roll_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
119
+ print("have the same files of testing midi and roll:", len(gt_files) == len(est_roll_files))
120
+ for i in range(len(gt_files)):
121
+ with np.load(gt_files[i]) as data:
122
+ gt = data['midi'][:, min_key:max_key + 1]
123
+ if gt.shape[0] != frames:
124
+ target = np.zeros((frames, max_key-min_key+1))
125
+ target[:gt.shape[0], :] = gt
126
+ gt = target
127
+ gt = np.where(gt > 0, 1, 0)
128
+ with np.load(est_roll_files[i]) as data:
129
+ est_roll = data['logit'][:, min_key:max_key + 1] # data['midi']
130
+ if est_roll.shape[0] != frames:
131
+ target = np.zeros((frames, max_key-min_key+1))
132
+ target[:est_roll.shape[0], :] = est_roll
133
+ est_roll = target
134
+ self.data['test'].append((torch.from_numpy(gt), torch.from_numpy(est_roll)))
135
+ for i in range(0, len(self.data['test']), 2):
136
+ if i + 1 < len(self.data['test']):
137
+ one_gt, one_roll = self.data['test'][i]
138
+ two_gt, two_roll = self.data['test'][i + 1]
139
+ final_gt = torch.cat([one_gt, two_gt], dim=0)
140
+ final_roll = torch.cat([one_roll, two_roll], dim=0)
141
+ self.final_data['test'].append((final_gt, final_roll))
142
+
143
+ print("total number of testing data:", len(self.final_data['test']))
144
+
145
+
146
+
147
+ if __name__ == "__main__":
148
+ dataset = Roll2MidiDataset()
149
+ gt,midi = dataset.__getitem__(0)
150
+ print(gt.shape)
151
+ print(midi.shape)
152
+ fig, (ax1,ax2,ax3) = plt.subplots(1, 3)
153
+ ax1.imshow(gt.cpu().numpy().squeeze(), plt.cm.gray)
154
+ ax2.imshow(midi.cpu().numpy().squeeze(), plt.cm.gray)
155
+ plt.show()
156
+ data_loader = DataLoader(dataset, batch_size=64)
157
+ for i,data in enumerate(data_loader):
158
+ gts,midis = data
159
+ break
160
+
src/audeo/Roll2Midi_dataset_tv2a_eval.py ADDED
@@ -0,0 +1,118 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import torch
4
+ from torch.utils.data import Dataset,DataLoader
5
+ import glob
6
+ print(torch.cuda.current_device())
7
+ DEFAULT_DEVICE = 'cuda'
8
+
9
+ torch.cuda.set_device(0)
10
+
11
+ frames = 50 #2 seconds
12
+
13
+ min_key = 15
14
+ max_key = 65
15
+
16
+ class Roll2MidiDataset(Dataset):
17
+ def __init__(self, path='/ailab-train/speech/shansizhe/audeo/data/tv2a_piano3_4000_pkl_npz/gt/npz/', est_roll_path='/ailab-train/speech/shansizhe/audeo/data/tv2a_piano3_4000_pkl_npz/v2a/npz/',
18
+ train=True, device=DEFAULT_DEVICE):
19
+ self.path = path
20
+ self.est_roll_path = est_roll_path
21
+ self.device = device
22
+ self.train = train
23
+ self.load_data()
24
+ def __getitem__(self, index):
25
+ if self.train:
26
+ gt, roll = self.final_data['train'][index]
27
+ else:
28
+ gt, roll = self.final_data['test'][index]
29
+ gt_ = gt.T.float().to(self.device)
30
+ roll_ = roll.T.float().to(self.device)
31
+ return torch.unsqueeze(gt_, dim=0), torch.unsqueeze(roll_, dim=0)
32
+
33
+ def __len__(self):
34
+ if self.train:
35
+ return len(self.final_data['train'])
36
+ else:
37
+ return len(self.final_data['test'])
38
+
39
+ def load_data(self):
40
+ self.files = []
41
+ self.labels = []
42
+
43
+ # ground truth midi dir
44
+ path = self.path
45
+ #print(path)
46
+ train_gt_folders = glob.glob(path + '/*')
47
+ train_gt_folders.sort(key=lambda x: x.split('/')[-1].split('__')[-1])
48
+ print(train_gt_folders)
49
+
50
+
51
+ # Roll predictions dir
52
+ train_roll_folder = glob.glob(self.est_roll_path + '/*')
53
+ train_roll_folder.sort(key=lambda x: x.split('/')[-1].split('__')[-1])
54
+ print(train_roll_folder)
55
+
56
+ # self.folders: dictionary
57
+ # key: train/test, values: list of tuples [(ground truth midi folder name, roll prediction folder name)]
58
+ self.folders = {}
59
+ self.folders['train'] = [(train_gt_folders[i], train_roll_folder[i]) for i in range(len(train_gt_folders))]
60
+ print(self.folders['train'])
61
+
62
+ # self.data: dictionary
63
+ # key: train/test, value:list of tuples [(2 sec ground truth Midi, 2 sec Roll prediction logits)]
64
+ self.data = {}
65
+ self.data['train'] = []
66
+
67
+ # self.final_data: similar to the data, but concat two continuous 2 sec Roll prediction (4 seconds, 100 frames)
68
+ self.final_data = {}
69
+ self.final_data['train'] = []
70
+
71
+ # load training data
72
+ for train_gt_folder, est_roll_folder in self.folders['train']:
73
+ gt_files = glob.glob(train_gt_folder + '/*.npz')
74
+ gt_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
75
+ est_roll_files = glob.glob(est_roll_folder + '/*.npz')
76
+ est_roll_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
77
+ print("have the same files of training gt and est roll:", len(gt_files) == len(est_roll_files))
78
+ for i in range(len(gt_files)):
79
+ with np.load(gt_files[i]) as data:
80
+ gt = data['midi'][:, min_key:max_key + 1]
81
+ if gt.shape[0] != frames:
82
+ target = np.zeros((frames, max_key-min_key+1))
83
+ target[:gt.shape[0], :] = gt
84
+ gt = target
85
+ gt = np.where(gt > 0, 1, 0)
86
+ with np.load(est_roll_files[i]) as data:
87
+ est_roll_logit = data['midi'][:, min_key:max_key + 1]
88
+ if est_roll_logit.shape[0] != frames:
89
+ target = np.zeros((frames, max_key-min_key+1))
90
+ target[:est_roll_logit.shape[0], :] = est_roll_logit
91
+ est_roll_logit = target
92
+ est_roll_logit = np.where(est_roll_logit > 0, 1, 0)
93
+ self.data['train'].append((torch.from_numpy(gt), torch.from_numpy(est_roll_logit)))
94
+ # make 4 sec data
95
+ for i in range(len(self.data['train'])):
96
+ if i + 1 < len(self.data['train']):
97
+ one_gt, one_roll = self.data['train'][i]
98
+ two_gt, two_roll = self.data['train'][i + 1]
99
+ final_gt = torch.cat([one_gt, two_gt], dim=0)
100
+ final_roll = torch.cat([one_roll, two_roll], dim=0)
101
+ self.final_data['train'].append((final_gt, final_roll))
102
+
103
+ print("total number of training data:", len(self.final_data['train']))
104
+
105
+ if __name__ == "__main__":
106
+ dataset = Roll2MidiDataset()
107
+ gt,midi = dataset.__getitem__(0)
108
+ print(gt.shape)
109
+ print(midi.shape)
110
+ fig, (ax1,ax2,ax3) = plt.subplots(1, 3)
111
+ ax1.imshow(gt.cpu().numpy().squeeze(), plt.cm.gray)
112
+ ax2.imshow(midi.cpu().numpy().squeeze(), plt.cm.gray)
113
+ plt.show()
114
+ data_loader = DataLoader(dataset, batch_size=64)
115
+ for i,data in enumerate(data_loader):
116
+ gts,midis = data
117
+ break
118
+
src/audeo/Roll2Midi_evaluate.py ADDED
@@ -0,0 +1,126 @@
1
+ import os
2
+ import json
3
+ from Roll2Midi_dataset import Roll2MidiDataset
4
+ from sklearn import metrics
5
+ import torch.utils.data as utils
6
+ import torch
7
+ from Roll2MidiNet_enhance import Generator
8
+ from torch.autograd import Variable
9
+ import numpy as np
10
+ from sklearn.metrics import _classification
11
+ cuda = torch.device("cuda")
12
+ Tensor = torch.cuda.FloatTensor
13
+ def process_data():
14
+ test_dataset = Roll2MidiDataset(train=False)
15
+ test_loader = utils.DataLoader(test_dataset, batch_size=16)
16
+ return test_loader
17
+
18
+ def test(generator, test_loader):
19
+ all_label = []
20
+ all_pred_label = []
21
+ all_pred_label_ = []
22
+ with torch.no_grad():
23
+ generator.eval()
24
+ for idx, data in enumerate(test_loader):
25
+ gt, roll = data
26
+ # Adversarial ground truths
27
+ gt = gt.type(Tensor)
28
+ roll = roll.type(Tensor)
29
+
30
+ real = Variable(gt)
31
+ roll_ = Variable(roll)
32
+ gen_imgs = generator(roll_)
33
+
34
+ pred_label = gen_imgs >= 0.4
35
+ numpy_label = gt.cpu().detach().numpy().astype(int) # B,1, 51, 50
36
+ numpy_label = np.transpose(numpy_label.squeeze(), (0, 2, 1)) # B,50,51
37
+ numpy_label = np.reshape(numpy_label, (-1, 51))
38
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
39
+ numpy_pre_label = np.transpose(numpy_pre_label.squeeze(), (0, 2, 1)) #B,50,51
40
+ numpy_pre_label = np.reshape(numpy_pre_label, (-1, 51))
41
+ all_label.append(numpy_label)
42
+ all_pred_label.append(numpy_pre_label)
43
+
44
+ pred_label_ = gen_imgs >= 0.5
45
+ numpy_pre_label_ = pred_label_.cpu().detach().numpy().astype(int)
46
+ numpy_pre_label_ = np.transpose(numpy_pre_label_.squeeze(), (0, 2, 1)) # B,50,51
47
+ numpy_pre_label_ = np.reshape(numpy_pre_label_, (-1, 51))
48
+ all_pred_label_.append(numpy_pre_label_)
49
+
50
+ all_label = np.vstack(all_label)
51
+ all_pred_label = np.vstack(all_pred_label)
52
+ labels = _classification._check_set_wise_labels(all_label, all_pred_label, labels=None, pos_label=1,
53
+ average='samples')
54
+ MCM = metrics.multilabel_confusion_matrix(all_label, all_pred_label, sample_weight=None, labels=labels,
55
+ samplewise=True)
56
+ tp_sum = MCM[:, 1, 1]
57
+ fp_sum = MCM[:, 0, 1]
58
+ fn_sum = MCM[:, 1, 0]
59
+ # tn_sum = MCM[:, 0, 0]
60
+ accuracy = _prf_divide(tp_sum, tp_sum + fp_sum + fn_sum, zero_division=1)
61
+ accuracy = np.average(accuracy)
62
+ all_precision = metrics.precision_score(all_label, all_pred_label, average='samples', zero_division=1)
63
+ all_recall = metrics.recall_score(all_label, all_pred_label, average='samples', zero_division=1)
64
+ all_f1_score = metrics.f1_score(all_label, all_pred_label, average='samples', zero_division=1)
65
+ print(
66
+ "Threshold 0.4, avg precision:{0:.3f} | avg recall:{1:.3f} | avg acc:{2:.3f} | f1 score:{3:.3f}".format(
67
+ all_precision, all_recall, accuracy, all_f1_score))
68
+
69
+ all_pred_label_ = np.vstack(all_pred_label_)
70
+ labels = _classification._check_set_wise_labels(all_label, all_pred_label_, labels=None, pos_label=1,
71
+ average='samples')
72
+ MCM = metrics.multilabel_confusion_matrix(all_label, all_pred_label_, sample_weight=None, labels=labels,
73
+ samplewise=True)
74
+ tp_sum = MCM[:, 1, 1]
75
+ fp_sum = MCM[:, 0, 1]
76
+ fn_sum = MCM[:, 1, 0]
77
+ # tn_sum = MCM[:, 0, 0]
78
+ accuracy = _prf_divide(tp_sum, tp_sum + fp_sum + fn_sum, zero_division=1)
79
+ accuracy = np.average(accuracy)
80
+ all_precision = metrics.precision_score(all_label, all_pred_label_, average='samples', zero_division=1)
81
+ all_recall = metrics.recall_score(all_label, all_pred_label_, average='samples', zero_division=1)
82
+ all_f1_score = metrics.f1_score(all_label, all_pred_label_, average='samples', zero_division=1)
83
+ print(
84
+ "Threshold 0.5, avg precision:{0:.3f} | avg recall:{1:.3f} | avg acc:{2:.3f} | f1 score:{3:.3f}".format(
85
+ all_precision, all_recall,accuracy, all_f1_score))
86
+ return
87
+
88
+ def _prf_divide(numerator, denominator, zero_division="warn"):
89
+ """Performs division and handles divide-by-zero.
90
+ On zero-division, sets the corresponding result elements equal to
91
+ 0 or 1 (according to ``zero_division``). Plus, if
92
+ ``zero_division != "warn"`` raises a warning.
93
+ The metric, modifier and average arguments are used only for determining
94
+ an appropriate warning.
95
+ """
96
+ mask = denominator == 0.0
97
+ denominator = denominator.copy()
98
+ denominator[mask] = 1 # avoid infs/nans
99
+ result = numerator / denominator
100
+
101
+ if not np.any(mask):
102
+ return result
103
+
104
+ # if ``zero_division=1``, set those with denominator == 0 equal to 1
105
+ result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0
106
+
107
+ # the user will be removing warnings if zero_division is set to something
108
+ # different than its default value. If we are computing only f-score
109
+ # the warning will be raised only if precision and recall are ill-defined
110
+ if zero_division != "warn":
111
+ return result
112
+
113
+ if __name__ == "__main__":
114
+ est_midi_folder = '/ailab-train/speech/shansizhe/audeo/data/estimate_Roll_exp3/testing'
115
+ exp_dir = "/ailab-train/speech/shansizhe/audeo/Correct_Roll2Midi_experiments/Roll2MidiNet_4_ep14_enhance"
116
+ with open(os.path.join(exp_dir,'hyperparams.json'), 'r') as hpfile:
117
+ hp = json.load(hpfile)
118
+ print(hp['best_loss'])
119
+ print(hp['best_epoch'])
120
+ checkpoints = 'checkpoint-best.tar'
121
+ checkpoint = torch.load(os.path.join(exp_dir, checkpoints))
122
+ test_loader = process_data()
123
+ input_shape = (1, 51, 100)
124
+ model = Generator(input_shape).cuda()
125
+ model.load_state_dict(checkpoint['state_dict_G'])
126
+ test(model, test_loader)
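The per-sample "accuracy" derived from the multilabel confusion matrix above is tp / (tp + fp + fn), i.e. the Jaccard index. A short sketch computing the same set of metrics on toy data with the public scikit-learn API instead of the private `_classification` helpers:

```python
import numpy as np
from sklearn import metrics

# Toy multilabel predictions: 4 samples x 51 keys, as in the evaluator.
rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=(4, 51))
y_pred = rng.integers(0, 2, size=(4, 51))

precision = metrics.precision_score(y_true, y_pred, average="samples", zero_division=1)
recall = metrics.recall_score(y_true, y_pred, average="samples", zero_division=1)
f1 = metrics.f1_score(y_true, y_pred, average="samples", zero_division=1)
# tp / (tp + fp + fn) per sample is exactly the Jaccard score.
accuracy = metrics.jaccard_score(y_true, y_pred, average="samples")
print(f"P={precision:.3f} R={recall:.3f} Acc={accuracy:.3f} F1={f1:.3f}")
```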
src/audeo/Roll2Midi_evaluate_tv2a.py ADDED
@@ -0,0 +1,93 @@
1
+ import os
2
+ import json
3
+ from Roll2Midi_dataset_tv2a_eval import Roll2MidiDataset
4
+ from sklearn import metrics
5
+ import torch.utils.data as utils
6
+ import torch
7
+ from Roll2MidiNet import Generator
8
+ from torch.autograd import Variable
9
+ import numpy as np
10
+ from sklearn.metrics import _classification
11
+ cuda = torch.device("cuda")
12
+ Tensor = torch.cuda.FloatTensor
13
+ def process_data():
14
+ test_dataset = Roll2MidiDataset(train=True)
15
+ test_loader = utils.DataLoader(test_dataset, batch_size=16)
16
+ return test_loader
17
+
18
+ def test(test_loader):
19
+ all_label = []
20
+ all_pred_label = []
21
+ all_pred_label_ = []
22
+ with torch.no_grad():
23
+ #generator.eval()
24
+ for idx, data in enumerate(test_loader):
25
+ gt, roll = data
26
+ # Adversarial ground truths
27
+ gt = gt.type(Tensor)
28
+ roll = roll.type(Tensor)
29
+
30
+ real = Variable(gt)
31
+ roll_ = Variable(roll)
32
+ #gen_imgs = generator(roll_)
33
+
34
+ #pred_label = gen_imgs >= 0.4
35
+ numpy_label = gt.cpu().detach().numpy().astype(int) # B,1, 51, 50
36
+ numpy_label = np.transpose(numpy_label.squeeze(), (0, 2, 1)) # B,50,51
37
+ numpy_label = np.reshape(numpy_label, (-1, 51))
38
+ numpy_pre_label = roll.cpu().detach().numpy().astype(int)
39
+ numpy_pre_label = np.transpose(numpy_pre_label.squeeze(), (0, 2, 1)) #B,50,51
40
+ numpy_pre_label = np.reshape(numpy_pre_label, (-1, 51))
41
+ all_label.append(numpy_label)
42
+ all_pred_label.append(numpy_pre_label)
43
+
44
+ all_label = np.vstack(all_label)
45
+ all_pred_label = np.vstack(all_pred_label)
46
+ labels = _classification._check_set_wise_labels(all_label, all_pred_label, labels=None, pos_label=1,
47
+ average='samples')
48
+ MCM = metrics.multilabel_confusion_matrix(all_label, all_pred_label, sample_weight=None, labels=labels,
49
+ samplewise=True)
50
+ tp_sum = MCM[:, 1, 1]
51
+ fp_sum = MCM[:, 0, 1]
52
+ fn_sum = MCM[:, 1, 0]
53
+ # tn_sum = MCM[:, 0, 0]
54
+ accuracy = _prf_divide(tp_sum, tp_sum + fp_sum + fn_sum, zero_division=1)
55
+ accuracy = np.average(accuracy)
56
+ all_precision = metrics.precision_score(all_label, all_pred_label, average='weighted', zero_division=1)
57
+ all_recall = metrics.recall_score(all_label, all_pred_label, average='weighted', zero_division=1)
58
+ all_f1_score = metrics.f1_score(all_label, all_pred_label, average='weighted', zero_division=1)
59
+ print(
60
+ "avg precision:{0:.3f} | avg recall:{1:.3f} | avg acc:{2:.3f} | f1 score:{3:.3f}".format(
61
+ all_precision, all_recall, accuracy, all_f1_score))
62
+
63
+ return
64
+
65
+ def _prf_divide(numerator, denominator, zero_division="warn"):
66
+ """Performs division and handles divide-by-zero.
67
+ On zero-division, sets the corresponding result elements equal to
68
+ 0 or 1 (according to ``zero_division``). Plus, if
69
+ ``zero_division != "warn"`` raises a warning.
70
+ The metric, modifier and average arguments are used only for determining
71
+ an appropriate warning.
72
+ """
73
+ mask = denominator == 0.0
74
+ denominator = denominator.copy()
75
+ denominator[mask] = 1 # avoid infs/nans
76
+ result = numerator / denominator
77
+
78
+ if not np.any(mask):
79
+ return result
80
+
81
+ # if ``zero_division=1``, set those with denominator == 0 equal to 1
82
+ result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0
83
+
84
+ # the user will be removing warnings if zero_division is set to something
85
+ # different than its default value. If we are computing only f-score
86
+ # the warning will be raised only if precision and recall are ill-defined
87
+ if zero_division != "warn":
88
+ return result
89
+
90
+ if __name__ == "__main__":
91
+ #est_midi_folder = '/ailab-train/speech/shansizhe/audeo/data/estimate_Roll/testing'
92
+ test_loader = process_data()
93
+ test(test_loader)
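A note on the accuracy reported above: it is computed sample-wise from the multilabel confusion matrix as tp / (tp + fp + fn), with the same zero_division=1 convention as _prf_divide. A minimal sketch on toy labels (the arrays below are made up, not repository data):
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

y_true = np.array([[1, 0, 1], [0, 1, 0]])   # two samples, three keys
y_pred = np.array([[1, 0, 0], [0, 1, 1]])
MCM = multilabel_confusion_matrix(y_true, y_pred, samplewise=True)  # shape (n_samples, 2, 2)
tp, fp, fn = MCM[:, 1, 1], MCM[:, 0, 1], MCM[:, 1, 0]
den = tp + fp + fn
acc = np.where(den == 0, 1.0, tp / np.maximum(den, 1))  # zero_division=1 behaviour
print(acc.mean())  # 0.5 here: each toy sample scores 1/2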
src/audeo/Roll2Midi_inference.py ADDED
@@ -0,0 +1,100 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ import torch
5
+ import glob
6
+ from Roll2MidiNet import Generator
7
+ from torch.autograd import Variable
8
+ torch.cuda.set_device(0)
9
+ cuda = torch.device("cuda")
10
+ print(torch.cuda.current_device())
11
+ Tensor = torch.cuda.FloatTensor
12
+ class Midi_Generation():
13
+ def __init__(self, checkpoint, exp_dir, est_roll_folder, video_name):
14
+ # model dir
15
+ self.exp_dir = exp_dir
16
+ # load model checkpoint
17
+ self.checkpoint = torch.load(os.path.join(exp_dir,checkpoint))
18
+ # the video name
19
+ self.video_name = video_name
20
+ # the Roll prediction folder
21
+ self.est_roll_folder = est_roll_folder + video_name
22
+ # Midi output dir
23
+ self.infer_out_dir = '/ailab-train/speech/shansizhe/audeo/data/Roll2Midi_results/training/'
24
+
25
+ self.min_key = 15
26
+ self.max_key = 65
27
+ self.frame = 50
28
+ self.process_est_roll(self.est_roll_folder)
29
+
30
+ def process_est_roll(self, est_roll_folder):
31
+ self.data = []
32
+ self.final_data = []
33
+ self.est_roll_files = glob.glob(est_roll_folder + '/*.npz')
34
+ self.est_roll_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0]))
35
+ print("need to infer {0} files".format(len(est_roll_folder)))
36
+ for i in range(len(self.est_roll_files)):
37
+ with np.load(self.est_roll_files[i]) as data:
38
+ est_roll = data['logit'][:,self.min_key:self.max_key+1]
39
+ if est_roll.shape[0] != self.frame:
40
+ target = np.zeros((self.frame, self.max_key-self.min_key+1))
41
+ target[:est_roll.shape[0], :] = est_roll
42
+ est_roll = target
43
+ self.data.append(torch.from_numpy(est_roll))
44
+ for i in range(0,len(self.data), 2):
45
+ if i + 1 < len(self.data):
46
+ one_roll = self.data[i]
47
+ two_roll = self.data[i+1]
48
+ final_roll = torch.cat([one_roll, two_roll], dim=0)
49
+ self.final_data.append(final_roll)
50
+
51
+ def inference(self):
52
+ input_shape = (1, self.max_key-self.min_key+1, 2*self.frame)
53
+ model = Generator(input_shape).cuda()
54
+ model.load_state_dict(self.checkpoint['state_dict_G'])
55
+ test_results = []
56
+ print('Inferencing MIDI......')
57
+ for i, data in enumerate(self.final_data):
58
+ roll = torch.unsqueeze(torch.unsqueeze(torch.sigmoid(data.T.float().cuda()), dim=0), dim=0)
59
+ print("piece ", i)
60
+ with torch.no_grad():
61
+ model.eval()
62
+ roll = roll.type(Tensor)
63
+ roll_ = Variable(roll)
64
+ gen_img = model(roll_)
65
+ gen_img = gen_img >= 0.5
66
+
67
+ numpy_pre_label = gen_img.cpu().detach().numpy().astype(int) # 1,1,88,100
68
+ numpy_pre_label = np.transpose(numpy_pre_label.squeeze(), (1, 0)) # 100,88
69
+
70
+ test_results.append(numpy_pre_label[:self.frame, :])
71
+ test_results.append(numpy_pre_label[self.frame:, :])
72
+ midi_out_dir = self.create_output_dir()
73
+ for i in range(len(test_results)):
74
+ print(self.est_roll_files[i])
75
+ idx = self.est_roll_files[i].split("/")[-1].split(".")[0].split("-")
76
+ idx1 = int(idx[0])
77
+ idx2 = int(idx[1])
78
+ print(idx1, idx2)
79
+ np.savez(midi_out_dir+f'/{idx1}-{idx2}.npz', midi=test_results[i])
80
+
81
+ def create_output_dir(self):
82
+ midi_out_dir = os.path.join(self.infer_out_dir, self.video_name)
83
+ os.makedirs(midi_out_dir, exist_ok=True)
84
+ return midi_out_dir
85
+
86
+ if __name__ == "__main__":
87
+ # example for generating the Midi output from training Roll predictions
88
+ est_roll_folder = '/ailab-train/speech/shansizhe/audeo/data/estimate_Roll/training/'
89
+ exp_dir = "/ailab-train/speech/shansizhe/audeo/Correct_Roll2Midi_experiments/Roll2MidiNet_1"
90
+ with open(os.path.join(exp_dir,'hyperparams.json'), 'r') as hpfile:
91
+ hp = json.load(hpfile)
92
+ print("the best loss:", hp['best_loss'])
93
+ print("the best epoch:", hp['best_epoch'])
94
+
95
+ checkpoints = 'checkpoint-{}.tar'.format(hp['best_epoch'])
96
+ for i in [1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27]:
97
+ video_name = f'{i}'
98
+ generator = Midi_Generation(checkpoints, exp_dir, est_roll_folder, video_name)
99
+ generator.inference()
100
+
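The inference class above concatenates two consecutive 50-frame Roll chunks into a single (1, 1, 51, 100) generator input and then splits the generated Midi back into two 50-frame halves. A shape-only sketch with dummy tensors (the generator is replaced by an identity stand-in here, so the values are meaningless):
import torch

frame, keys = 50, 51
chunks = [torch.rand(frame, keys) for _ in range(4)]          # four 2-second Roll chunks
pairs = [torch.cat([chunks[i], chunks[i + 1]], dim=0)         # (100, 51): a 4-second window
         for i in range(0, len(chunks) - 1, 2)]
for p in pairs:
    x = p.T.unsqueeze(0).unsqueeze(0)                         # (1, 1, 51, 100) generator input layout
    out = x                                                   # stand-in for Generator(x)
    first, second = out.squeeze().T[:frame], out.squeeze().T[frame:]
    print(first.shape, second.shape)                          # torch.Size([50, 51]) twice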
src/audeo/Roll2Midi_train.py ADDED
@@ -0,0 +1,280 @@
1
+ import os
2
+ import torch
3
+ import torch.optim as optim
4
+ import numpy as np
5
+ from torchvision.utils import save_image
6
+ import json
7
+ import torch.utils.data as utils
8
+ from Roll2MidiNet_enhance import Generator, Discriminator,weights_init_normal
9
+ from Roll2Midi_dataset import Roll2MidiDataset
10
+ from torch.autograd import Variable
11
+ from sklearn import metrics
12
+ from tqdm import tqdm
13
+ from torch.utils.tensorboard import SummaryWriter
14
+
15
+ torch.cuda.set_device(0)
16
+ cuda = torch.device("cuda")
17
+ print(torch.cuda.current_device())
18
+ Tensor = torch.cuda.FloatTensor
19
+
20
+ class hyperparams(object):
21
+ def __init__(self):
22
+ self.train_epoch = 200
23
+ self.test_freq = 1
24
+ self.exp_name = 'Roll2MidiNet_4_ep14_enhance'
25
+
26
+ self.channels = 1
27
+ self.h = 51 #input Piano key ranges
28
+ self.w = 100 # 4 seconds, 100 frames predictions
29
+
30
+ self.iter_train_g_loss = []
31
+ self.iter_train_d_loss = []
32
+
33
+ self.iter_test_g_loss = []
34
+ self.iter_test_d_loss = []
35
+
36
+ self.g_loss_history = []
37
+ self.d_loss_history = []
38
+
39
+ self.test_g_loss_history = []
40
+ self.test_d_loss_history = []
41
+ self.best_loss = 1e10
42
+ self.best_epoch = 0
43
+
44
+ def process_data():
45
+ train_dataset = Roll2MidiDataset(train=True)
46
+ train_loader = utils.DataLoader(train_dataset, batch_size=16, shuffle=True)
47
+ test_dataset = Roll2MidiDataset(train=False)
48
+ test_loader = utils.DataLoader(test_dataset, batch_size=16)
49
+ return train_loader, test_loader
50
+
51
+ def train(generator, discriminator, epoch, train_loader, optimizer_G, optimizer_D,
52
+ scheduler, adversarial_loss, iter_train_g_loss, iter_train_d_loss):
53
+ generator.train()
54
+ discriminator.train()
55
+ train_g_loss = 0
56
+ train_d_loss = 0
57
+ for batch_idx, data in tqdm(enumerate(train_loader)):
58
+ gt, roll = data
59
+ # Adversarial ground truths
60
+ valid = Variable(Tensor(gt.shape[0], *discriminator.output_shape).fill_(1.0), requires_grad=False)
61
+ fake = Variable(Tensor(gt.shape[0], *discriminator.output_shape).fill_(0.0), requires_grad=False)
62
+ gt = gt.type(Tensor)
63
+ roll = roll.type(Tensor)
64
+
65
+ real = Variable(gt)
66
+ roll_ = Variable(roll)
67
+
68
+ # -----------------
69
+ # Train Generator
70
+ # -----------------
71
+
72
+ optimizer_G.zero_grad()
73
+
74
+ # Generate a batch of images
75
+ gen_imgs = generator(roll_)
76
+
77
+ # Loss measures generator's ability to fool the discriminator
78
+ g_loss = 0.001*adversarial_loss(discriminator(gen_imgs), valid) + 0.999*adversarial_loss(gen_imgs, gt)
79
+
80
+ g_loss.backward()
81
+
82
+ iter_train_g_loss.append(g_loss.item())
83
+ train_g_loss += g_loss
84
+
85
+ optimizer_G.step()
86
+
87
+ # ---------------------
88
+ # Train Discriminator
89
+ # ---------------------
90
+
91
+ optimizer_D.zero_grad()
92
+
93
+ # Measure discriminator's ability to classify real from generated samples
94
+ real_loss = adversarial_loss(discriminator(real), valid)
95
+ fake_loss = adversarial_loss(discriminator(gen_imgs.detach()), fake)
96
+ d_loss = 0.5 * (real_loss + fake_loss)
97
+
98
+ d_loss.backward()
99
+
100
+ iter_train_d_loss.append(d_loss.item())
101
+ train_d_loss += d_loss
102
+
103
+ optimizer_D.step()
104
+
105
+ if batch_idx % 2 == 0:
106
+ print('Train Epoch: {0} [{1}/{2} ({3:.0f}%)]\t g Loss: {4:.6f} | d Loss: {5:.6f}'.format(epoch, batch_idx * roll.shape[0],
107
+ len(train_loader.dataset),
108
+ 100. * batch_idx / len(train_loader),
109
+ g_loss.item() / roll.shape[0], d_loss.item() / roll.shape[0]))
110
+ scheduler.step(train_g_loss / len(train_loader.dataset))
111
+ print('====> Epoch: {} Average g loss: {:.4f} | d loss: {:.4f}'.format(epoch, train_g_loss / len(train_loader.dataset), train_d_loss / len(train_loader.dataset)))
112
+ return train_g_loss / len(train_loader.dataset),train_d_loss / len(train_loader.dataset)
113
+
114
+ def test(generator, discriminator, epoch, test_loader, adversarial_loss,
115
+ iter_test_g_loss,iter_test_d_loss):
116
+ all_label = []
117
+ all_pred_label = []
118
+ all_pred_label_ = []
119
+ with torch.no_grad():
120
+ generator.eval()
121
+ discriminator.eval()
122
+ test_g_loss = 0
123
+ test_d_loss = 0
124
+ for idx, data in enumerate(test_loader):
125
+ gt, roll = data
126
+ # Adversarial ground truths
127
+ valid = Variable(Tensor(gt.shape[0], *discriminator.output_shape).fill_(1.0), requires_grad=False)
128
+ fake = Variable(Tensor(gt.shape[0], *discriminator.output_shape).fill_(0.0), requires_grad=False)
129
+ gt = gt.type(Tensor)
130
+ roll = roll.type(Tensor)
131
+
132
+ real = Variable(gt)
133
+ roll_ = Variable(roll)
134
+ gen_imgs = generator(roll_)
135
+
136
+ # Loss measures generator's ability to fool the discriminator
137
+ g_loss = adversarial_loss(gen_imgs, gt)
138
+
139
+ iter_test_g_loss.append(g_loss.item())
140
+ test_g_loss += g_loss
141
+
142
+ # Measure discriminator's ability to classify real from generated samples
143
+ real_loss = adversarial_loss(discriminator(real), valid)
144
+ fake_loss = adversarial_loss(discriminator(gen_imgs.detach()), fake)
145
+ d_loss = 0.5 * (real_loss + fake_loss)
146
+
147
+ iter_test_d_loss.append(d_loss.item())
148
+ test_d_loss += d_loss
149
+
150
+ pred_label = gen_imgs >= 0.4
151
+ numpy_label = gt.cpu().detach().numpy().astype(int) # B,1,51, 50
152
+ numpy_label = np.transpose(numpy_label.squeeze(), (0, 2, 1)) # B,50,51
153
+
154
+ numpy_label = np.reshape(numpy_label, (-1, 51))
155
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
156
+ numpy_pre_label = np.transpose(numpy_pre_label.squeeze(), (0, 2, 1)) #B,50,51
157
+ numpy_pre_label = np.reshape(numpy_pre_label, (-1, 51))
158
+ all_label.append(numpy_label)
159
+ all_pred_label.append(numpy_pre_label)
160
+
161
+ pred_label_ = gen_imgs >= 0.5
162
+ numpy_pre_label_ = pred_label_.cpu().detach().numpy().astype(int)
163
+ numpy_pre_label_ = np.transpose(numpy_pre_label_.squeeze(), (0, 2, 1)) # B,50,51
164
+ numpy_pre_label_ = np.reshape(numpy_pre_label_, (-1, 51))
165
+ all_pred_label_.append(numpy_pre_label_)
166
+
167
+
168
+ test_g_loss /= len(test_loader.dataset)
169
+ test_d_loss /= len(test_loader.dataset)
170
+
171
+ writer = SummaryWriter(log_dir='/ailab-train/speech/shansizhe/audeo/log/roll2midi/exp4_enhance')
172
+
173
+ # scheduler.step(test_loss)
174
+ print('====> Test set g loss: {:.4f} | d loss: {:.4f}'.format(test_g_loss, test_d_loss))
175
+
176
+ all_label = np.vstack(all_label)
177
+ all_pred_label = np.vstack(all_pred_label)
178
+ all_precision = metrics.precision_score(all_label, all_pred_label, average='samples', zero_division=1)
179
+ all_recall = metrics.recall_score(all_label, all_pred_label, average='samples', zero_division=1)
180
+ all_f1_score = metrics.f1_score(all_label, all_pred_label, average='samples', zero_division=1)
181
+ print(
182
+ "Threshold 0.4, epoch {0} avg precision:{1:.3f} | avg recall:{2:.3f} | f1 score:{3:.3f}".format(
183
+ epoch, all_precision, all_recall, all_f1_score))
184
+
185
+ writer.add_scalar('g_loss', test_g_loss, epoch)
186
+ writer.add_scalar('d_loss', test_d_loss, epoch)
187
+ writer.add_scalar('loss', test_d_loss + test_g_loss, epoch)
188
+ writer.add_scalar('Precision/t=0.4', all_precision, epoch)
189
+ writer.add_scalar('Recall/t=0.4', all_recall, epoch)
190
+ writer.add_scalar('F1_score/t=0.4', all_f1_score, epoch)
191
+
192
+ all_pred_label_ = np.vstack(all_pred_label_)
193
+ all_precision = metrics.precision_score(all_label, all_pred_label_, average='samples', zero_division=1)
194
+ all_recall = metrics.recall_score(all_label, all_pred_label_, average='samples', zero_division=1)
195
+ all_f1_score = metrics.f1_score(all_label, all_pred_label_, average='samples', zero_division=1)
196
+ print(
197
+ "Threshold 0.5, epoch {0} avg precision:{1:.3f} | avg recall:{2:.3f} | f1 score:{3:.3f}".format(
198
+ epoch, all_precision, all_recall, all_f1_score))
199
+
200
+ writer.add_scalar('Precision/t=0.5', all_precision, epoch)
201
+ writer.add_scalar('Recall/t=0.5', all_recall, epoch)
202
+ writer.add_scalar('F1_score/t=0.5', all_f1_score, epoch)
203
+
204
+ return test_g_loss, test_d_loss
205
+
206
+
207
+ def main():
208
+ hp = hyperparams()
209
+
210
+ try:
211
+ # the dir to save the Roll2Midi model
212
+ exp_root = "/ailab-train/speech/shansizhe/audeo/Correct_Roll2Midi_experiments"
213
+ os.makedirs(exp_root, exist_ok=True)
214
+ except FileExistsError:
215
+ pass
216
+
217
+ exp_dir = os.path.join(exp_root, hp.exp_name)
218
+ os.makedirs(exp_dir, exist_ok=True)
219
+ input_shape = (hp.channels, hp.h, hp.w)
220
+ # Loss function
221
+ adversarial_loss = torch.nn.MSELoss()
222
+
223
+ generator = Generator(input_shape)
224
+ discriminator = Discriminator(input_shape)
225
+
226
+ # Initialize weights
227
+ generator.apply(weights_init_normal)
228
+ discriminator.apply(weights_init_normal)
229
+
230
+ generator.cuda()
231
+ discriminator.cuda()
232
+ optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.5*1e-3, betas=(0.9, 0.999))
233
+ optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.5*1e-3, betas=(0.9, 0.999))
234
+
235
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_G, 'min', patience=2)
236
+ train_loader, test_loader = process_data()
237
+ print ('start training')
238
+ for epoch in tqdm(range(hp.train_epoch)):
239
+ # training loop
240
+ g_loss, d_loss = train(generator, discriminator, epoch, train_loader, optimizer_G, optimizer_D,
241
+ scheduler, adversarial_loss, hp.iter_train_g_loss, hp.iter_train_d_loss)
242
+ hp.g_loss_history.append(g_loss.item())
243
+ hp.d_loss_history.append(d_loss.item())
244
+
245
+ # test
246
+ if epoch % hp.test_freq == 0:
247
+ test_g_loss,test_d_loss = test(generator, discriminator, epoch, test_loader, adversarial_loss,
248
+ hp.iter_test_g_loss, hp.iter_test_d_loss)
249
+ hp.test_g_loss_history.append(test_g_loss.item())
250
+ hp.test_d_loss_history.append(test_d_loss.item())
251
+
252
+ max_checkpoints = 5
253
+ # save a checkpoint after every epoch
254
+ torch.save({'epoch': epoch + 1,
255
+ 'state_dict_G': generator.state_dict(),
256
+ 'optimizer_G': optimizer_G.state_dict(),
257
+ 'state_dict_D': discriminator.state_dict(),
258
+ 'optimizer_D': optimizer_D.state_dict()},
259
+ os.path.join(exp_dir, 'checkpoint-{}.tar'.format(str(epoch + 1))))
260
+
261
+ # if the number of saved checkpoints exceeds the maximum, delete the oldest one
262
+ saved_checkpoints = sorted(os.listdir(exp_dir))
263
+ saved_checkpoints = [f for f in saved_checkpoints if f != 'checkpoint-best.tar']
264
+ if len(saved_checkpoints) > max_checkpoints:
265
+ oldest_checkpoint = saved_checkpoints[0]
266
+ os.remove(os.path.join(exp_dir, oldest_checkpoint))
267
+
268
+ if test_g_loss + test_d_loss < hp.best_loss:
269
+ torch.save({'epoch': epoch + 1, 'state_dict_G': generator.state_dict(),
270
+ 'optimizer_G': optimizer_G.state_dict(),
271
+ 'state_dict_D': discriminator.state_dict(),
272
+ 'optimizer_D': optimizer_D.state_dict()},
273
+ os.path.join(exp_dir, 'checkpoint-best.tar'))
274
+ hp.best_loss = test_g_loss.item()+test_d_loss.item()
275
+ hp.best_epoch = epoch + 1
276
+ with open(os.path.join(exp_dir, 'hyperparams.json'), 'w') as outfile:
277
+ json.dump(hp.__dict__, outfile)
278
+
279
+ if __name__ == "__main__":
280
+ main()
src/audeo/Video2RollNet.py ADDED
@@ -0,0 +1,264 @@
1
+ import torch.nn as nn
2
+ import math
3
+ import torch.nn.functional as F
4
+ import torch
5
+
6
+ __all__ = ['ResNet', 'resnet18']
7
+
8
+
9
+ def conv3x3(in_planes, out_planes, stride=1):
10
+ """3x3 convolution with padding"""
11
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
12
+ padding=1, bias=False)
13
+
14
+ class FTB(nn.Module):
15
+ def __init__(self,in_planes, out_planes=512, stride=1):
16
+ super(FTB,self).__init__()
17
+ self.conv0 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=1,bias=False)
18
+ self.conv1 = conv3x3(out_planes, out_planes, stride)
19
+ self.bn1 = nn.BatchNorm2d(out_planes)
20
+ self.relu = nn.ReLU(inplace=True)
21
+ self.conv2 = conv3x3(out_planes, out_planes)
22
+ self.avgpool1 = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
23
+ self.avgpool2 = nn.AvgPool2d(kernel_size=(3, 3), stride=1)
24
+ def forward(self, x, avg=True):
25
+ x1 = self.conv0(x)
26
+ residual = x1
27
+ out = self.conv1(x1)
28
+ out = self.bn1(out)
29
+ out = self.relu(out)
30
+ out = self.conv2(out)
31
+ out += residual
32
+ if avg:
33
+ out = self.avgpool1(out)
34
+ else:
35
+ out = self.avgpool2(out)
36
+ return out
37
+
38
+ class FRB(nn.Module):
39
+ def __init__(self,in_planes1,in_planes2):
40
+ super(FRB,self).__init__()
41
+ self.fc1 = nn.Linear(in_planes1+in_planes2, in_planes2)
42
+ self.relu = nn.ReLU(inplace=True)
43
+ self.fc2 = nn.Linear(in_planes2, in_planes2)
44
+ def forward(self, xl, xh):
45
+ xc = torch.cat([xl,xh],dim=1)
46
+ zc = F.avg_pool2d(xc, kernel_size=xc.size()[2:]) # C x 1 x 1
47
+ zc = torch.flatten(zc, 1)
48
+ out = self.fc1(zc)
49
+ out = self.relu(out)
50
+ out = self.fc2(out)
51
+ zc_ = F.sigmoid(out)
52
+ zc_ = torch.unsqueeze(zc_,dim=2)
53
+ zc_ = zc_.repeat(1, 1, xl.shape[2] * xl.shape[3]).view(-1,xl.shape[1],xl.shape[2],xl.shape[3])
54
+ xl_ = zc_ * xl #n,c,h,w
55
+ return xl_
56
+
57
+ class BasicBlock(nn.Module):
58
+ expansion = 1
59
+
60
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
61
+ super(BasicBlock, self).__init__()
62
+ self.conv1 = conv3x3(inplanes, planes, stride)
63
+ self.bn1 = nn.BatchNorm2d(planes)
64
+ self.relu = nn.ReLU(inplace=True)
65
+ self.conv2 = conv3x3(planes, planes)
66
+ self.bn2 = nn.BatchNorm2d(planes)
67
+ self.downsample = downsample
68
+ self.stride = stride
69
+
70
+ def forward(self, x):
71
+ residual = x
72
+
73
+ out = self.conv1(x)
74
+ out = self.bn1(out)
75
+ out = self.relu(out)
76
+
77
+ out = self.conv2(out)
78
+ out = self.bn2(out)
79
+
80
+ if self.downsample is not None:
81
+ residual = self.downsample(x)
82
+
83
+ out += residual
84
+ out = self.relu(out)
85
+
86
+ return out
87
+
88
+
89
+ class Bottleneck(nn.Module):
90
+ expansion = 4
91
+
92
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
93
+ super(Bottleneck, self).__init__()
94
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
95
+ self.bn1 = nn.BatchNorm2d(planes)
96
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,padding=1, bias=False)
97
+ self.bn2 = nn.BatchNorm2d(planes)
98
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
99
+ self.bn3 = nn.BatchNorm2d(planes * 4)
100
+ self.relu = nn.ReLU(inplace=True)
101
+ self.downsample = downsample
102
+ self.stride = stride
103
+
104
+ def forward(self, x):
105
+ residual = x
106
+
107
+ out = self.conv1(x)
108
+ out = self.bn1(out)
109
+ out = self.relu(out)
110
+
111
+ out = self.conv2(out)
112
+ out = self.bn2(out)
113
+ out = self.relu(out)
114
+
115
+ out = self.conv3(out)
116
+ out = self.bn3(out)
117
+
118
+ if self.downsample is not None:
119
+ residual = self.downsample(x)
120
+
121
+ out += residual
122
+ out = self.relu(out)
123
+
124
+ return out
125
+
126
+
127
+ class ResNet(nn.Module):
128
+
129
+ def __init__(self, block, layers, top_channel_nums=2048, reduced_channel_nums=256, num_classes=51, scale=1):
130
+ self.inplanes = 64
131
+ super(ResNet, self).__init__()
132
+ self.conv1 = nn.Conv2d(5, 64, kernel_size=(11, 11), stride=(2, 2), padding=(4, 4),bias=False)
133
+ self.bn1 = nn.BatchNorm2d(64)
134
+ self.relu1 = nn.ReLU(inplace=True)
135
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
136
+ self.layer1 = self._make_layer(block, 64, layers[0])
137
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
138
+
139
+ self.FTB2_1 = FTB(128, 128)
140
+ self.FTB2_2 = FTB(128, 128)
141
+ self.FRB2 = FRB(128, 128)
142
+
143
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
144
+
145
+ self.FTB3 = FTB(256, 128)
146
+ self.FRB3 = FRB(128, 128)
147
+
148
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
149
+
150
+ self.FTB4 = FTB(512, 128)
151
+ self.FRB4 = FRB(64, 128)
152
+
153
+
154
+ #FPN PARTS
155
+ # Top layer
156
+ self.toplayer = nn.Conv2d(top_channel_nums, reduced_channel_nums, kernel_size=1, stride=1, padding=0) # Reduce channels,
157
+ self.toplayer_bn = nn.BatchNorm2d(reduced_channel_nums)
158
+ self.toplayer_relu = nn.ReLU(inplace=True)
159
+
160
+ self.conv2 = nn.Conv2d(128, 128, kernel_size=1)
161
+ self.fc = nn.Linear(128, num_classes)
162
+
163
+ for m in self.modules():
164
+ if isinstance(m, nn.Conv2d):
165
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
166
+ m.weight.data.normal_(0, math.sqrt(2. / n))
167
+ elif isinstance(m, nn.BatchNorm2d):
168
+ m.weight.data.fill_(1)
169
+ m.bias.data.zero_()
170
+
171
+ def _make_layer(self, block, planes, blocks, stride=1):
172
+ downsample = None
173
+ if stride != 1 or self.inplanes != planes * block.expansion:
174
+ downsample = nn.Sequential(
175
+ nn.Conv2d(self.inplanes, planes * block.expansion,
176
+ kernel_size=1, stride=stride, bias=False),
177
+ nn.BatchNorm2d(planes * block.expansion),
178
+ )
179
+
180
+ layers = []
181
+ layers.append(block(self.inplanes, planes, stride, downsample))
182
+ self.inplanes = planes * block.expansion
183
+ for i in range(1, blocks):
184
+ layers.append(block(self.inplanes, planes))
185
+
186
+ return nn.Sequential(*layers)
187
+
188
+ def _upsample(self, x, y, scale=1):
189
+ _, _, H, W = y.size()
190
+ return F.upsample(x, size=(H // scale, W // scale), mode='bilinear')
191
+
192
+ def _upsample_add(self, x, y):
193
+ _, _, H, W = y.size()
194
+ return F.upsample(x, size=(H, W), mode='bilinear') + y
195
+
196
+ def forward(self, x):
197
+ h = x
198
+ h = self.conv1(h)
199
+ h = self.bn1(h)
200
+ h = self.relu1(h)
201
+ h = self.maxpool(h)
202
+
203
+ h = self.layer1(h)
204
+ x1 = h
205
+
206
+ h = self.layer2(h)
207
+ x2 = h
208
+
209
+ h = self.layer3(h)
210
+
211
+ x3 = h
212
+
213
+ h = self.layer4(h)
214
+ x4 = h
215
+
216
+ # Top-down
217
+ x5 = self.toplayer(x4)
218
+ x5 = self.toplayer_relu(self.toplayer_bn(x5))
219
+
220
+ x2_ = self.FTB2_1(x2)
221
+
222
+ x2_ = self.FTB2_2(x2_)
223
+
224
+ x3_ = self.FTB3(x3)
225
+
226
+ x4_ = self.FTB4(x4, avg=False)
227
+
228
+ p4 = self.FRB4(x4_, x5)
229
+
230
+ p3 = self.FRB3(x3_, p4)
231
+
232
+ p2 = self.FRB2(x2_, p3)
233
+
234
+ out1 = p2*p3
235
+
236
+ out1_ = F.softmax(out1.view(*out1.size()[:2], -1),dim=2).view_as(out1)
237
+
238
+ out2 = out1_*p4
239
+
240
+ out2 = self.conv2(out2)
241
+
242
+ out = out2 + p4
243
+
244
+ out = F.avg_pool2d(out, kernel_size=out.size()[2:])
245
+
246
+ out = torch.flatten(out, 1)
247
+
248
+ out = self.fc(out)
249
+
250
+ return out
251
+
252
+
253
+ def resnet18(**kwargs):
254
+ """Constructs a ResNet-18 model.
255
+ """
256
+ model = ResNet(BasicBlock, layers=[2, 2, 2, 2], top_channel_nums=512, reduced_channel_nums=64, **kwargs)
257
+ return model
258
+
259
+ if __name__ == "__main__":
260
+ net = resnet18()
261
+ print(net)
262
+ imgs = torch.rand((2, 5, 100,900))
263
+ logits = net(imgs)
264
+ print(logits.shape)
src/audeo/Video2Roll_dataset.py ADDED
@@ -0,0 +1,148 @@
1
+ import numpy as np
2
+ import glob
3
+ import matplotlib.pyplot as plt
4
+ from PIL import Image
5
+ from torch.utils.data import Dataset, DataLoader
6
+ import torchvision.transforms as transforms
7
+ import torch
8
+ from balance_data import MultilabelBalancedRandomSampler
9
+ # Resize all input images to 1 x 100 x 900
10
+ transform = transforms.Compose([lambda x: x.resize((900,100)),
11
+ lambda x: np.reshape(x,(100,900,1)),
12
+ lambda x: np.transpose(x,[2,0,1]),
13
+ lambda x: x/255.])
14
+
15
+ class Video2RollDataset(Dataset):
16
+ def __init__(self, img_root='./data/frame',label_root='./data/label', transform = transform, subset='train', device='cuda'):
17
+ self.img_root = img_root #images root dir
18
+ self.label_root = label_root #labels root dir
19
+ self.transform = transform
20
+ self.subset = subset
21
+ # the minimum and maximum Piano Key values in the data, depending on the data stats
22
+ self.min_key = 15 #3
23
+ self.max_key = 65 #79
24
+ self.device = device
25
+ self.load_data()
26
+
27
+ def __getitem__(self,index):
28
+ if self.subset=='train':
29
+ input_file_list, label = self.data['train'][index]
30
+ else:
31
+ input_file_list, label = self.data['test'][index]
32
+ input_img_list = []
33
+ # 5 consecutive frames, set binary
34
+ for input_file in input_file_list:
35
+ input_img = Image.open(input_file).convert('L')
36
+ binarr = np.array(input_img)
37
+ input_img = Image.fromarray(binarr.astype(np.uint8))
38
+ input_img_list.append(input_img)
39
+
40
+ new_input_img_list = []
41
+ for input_img in input_img_list:
42
+ new_input_img_list.append(self.transform(input_img))
43
+ # stack 5 consecutive frames
44
+ final_input_img = np.concatenate(new_input_img_list)
45
+ torch_input_img = torch.from_numpy(final_input_img).float().to(self.device)
46
+ torch_label = torch.from_numpy(label).float().to(self.device)
47
+
48
+ return torch_input_img, torch_label
49
+ def __len__(self):
50
+ if self.subset == 'train':
51
+ # return 20000
52
+ return len(self.data['train'])
53
+ else:
54
+ return len(self.data['test'])
55
+
56
+ def load_data(self):
57
+ # self.folders: dictionary
58
+ # key: train/test, values: list of tuples [(video_i_image_folder, video_i_label_folder)]
59
+ self.folders = {}
60
+
61
+ train_img_folder = glob.glob(self.img_root+'/training/*')
62
+ train_img_folder.sort(key=lambda x:int(x.split('/')[-1]))
63
+ test_img_folder = glob.glob(self.img_root+'/testing/*')
64
+ test_img_folder.sort(key=lambda x:int(x.split('/')[-1]))
65
+ train_label_folder = glob.glob(self.label_root+'/training/*')
66
+ train_label_folder.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))
67
+ test_label_folder = glob.glob(self.label_root+'/testing/*')
68
+ test_label_folder.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))
69
+
70
+ self.folders['train'] = [(train_img_folder[i],train_label_folder[i]) for i in range(len(train_img_folder))]
71
+ print(self.folders['train'])
72
+ self.folders['test'] = [(test_img_folder[i],test_label_folder[i]) for i in range(len(test_img_folder))]
73
+ print(self.folders['test'])
74
+
75
+ # self.data: dictionary
76
+ # key: train/test, value: list of tuples [([frame_{i-2, i+2}_image_filename], frame_i_label)]
77
+ self.data = {}
78
+ self.data['train'] = []
79
+ self.data['test'] = []
80
+ self.train_labels = []
81
+ count_zero = 0
82
+ # load train data
83
+ for img_folder, label_file in self.folders['train']:
84
+ # each folder contains all image frames of one video, format: frame{number}.jpg
85
+ img_files = glob.glob(img_folder + '/*.jpg')
86
+ img_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0][5:]))
87
+ # label is a pkl file. The key is frame number, value is the label vector of 88 dim
88
+ labels = np.load(label_file, allow_pickle=True)
89
+ for i, file in enumerate(img_files):
90
+ key = int(file.split('/')[-1].split('.')[0][5:])
91
+ label = labels[key]
92
+ # count the number of frames that no key is activate
93
+ if not np.any(label):
94
+ count_zero += 1
95
+ # continue
96
+ new_label = label[self.min_key:self.max_key + 1]
97
+ if i >= 2 and i<len(img_files)-2:
98
+ file_list = [img_files[i-2], img_files[i-1], file, img_files[i+1],img_files[i+2]]
99
+ else:
100
+ continue
101
+ self.data['train'].append((file_list, new_label))
102
+ self.train_labels.append(new_label)
103
+ print("number of all zero label in training:", count_zero)
104
+ self.train_labels = np.asarray(self.train_labels)
105
+ count_zero = 0
106
+
107
+ # load test data
108
+ for img_folder, label_file in self.folders['test']:
109
+ img_files = glob.glob(img_folder + '/*.jpg')
110
+ img_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0][5:]))
111
+ labels = np.load(label_file, allow_pickle=True)
112
+ for i, file in enumerate(img_files):
113
+ key = int(file.split('/')[-1].split('.')[0][5:])
114
+ label = labels[key]
115
+ if not np.any(label):
116
+ count_zero += 1
117
+ # continue
118
+ new_label = label[self.min_key:self.max_key + 1]
119
+ if i >= 2 and i<len(img_files)-2:
120
+ file_list = [img_files[i-2], img_files[i-1], file, img_files[i+1],img_files[i+2]]
121
+ else:
122
+ continue
123
+ self.data['test'].append((file_list, new_label))
124
+ print("number of all zero label in testing:", count_zero)
125
+
126
+
127
+ print("length of training data:",len(self.data['train']))
128
+ print("length of testing data:",len(self.data['test']))
129
+
130
+ if __name__ == "__main__":
131
+ dataset = Video2RollDataset(subset='train')
132
+
133
+ # g,h = dataset.__getitem__(200)
134
+ # print(g.shape)
135
+ # print(torch.nonzero(h))
136
+ train_sampler = MultilabelBalancedRandomSampler(dataset.train_labels)
137
+ train_loader = DataLoader(dataset, batch_size=64,sampler=train_sampler)
138
+ for i, data in enumerate(train_loader):
139
+ print(i)
140
+ imgs,label = data
141
+ print(label.shape)
142
+ # fig, (ax1) = plt.subplots(1)
143
+ # ax1.imshow(label.cpu().numpy().T, plt.cm.gray)
144
+ # plt.show()
145
+ # print(torch.nonzero(label, as_tuple=True))
146
+ print(torch.unique(torch.nonzero(label)[:,1]))
147
+ if i==3:
148
+ break
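For reference, the transform at the top of this dataset maps each grayscale frame to a (1, 100, 900) array in [0, 1], and five consecutive frames are stacked into the (5, 100, 900) network input. A self-contained sketch on a synthetic frame (the image below is random noise, not repository data):
import numpy as np
from PIL import Image

frame = Image.fromarray(np.random.randint(0, 255, (120, 1000), dtype=np.uint8), mode='L')
x = frame.resize((900, 100))                                  # PIL size is (width, height)
x = np.transpose(np.reshape(np.array(x), (100, 900, 1)), [2, 0, 1]) / 255.0
stacked = np.concatenate([x] * 5)                             # pretend these are 5 neighbouring frames
print(stacked.shape)                                          # (5, 100, 900)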
src/audeo/Video2Roll_evaluate.py ADDED
@@ -0,0 +1,90 @@
1
+ import Video2RollNet
2
+ import os
3
+ import glob
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torchvision.transforms as transforms
7
+ from Video2Roll_dataset import Video2RollDataset
8
+ from torch.utils.data import DataLoader
9
+ import torch
10
+ import time
11
+ from sklearn import metrics
12
+ from sklearn.metrics import _classification
13
+ import torch.nn as nn
14
+ def validate(net, criterion, test_loader):
15
+ epoch_loss = 0
16
+ count = 0
17
+ all_pred_label = []
18
+ all_label = []
19
+ with torch.no_grad():
20
+ for i, data in enumerate(test_loader):
21
+ imgs, label = data
22
+ logits = net(imgs)
23
+ loss = criterion(logits, label)
24
+ pred_label = torch.sigmoid(logits) >= 0.4
25
+ numpy_label = label.cpu().detach().numpy().astype(int)
26
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
27
+ all_label.append(numpy_label)
28
+ all_pred_label.append(numpy_pre_label)
29
+ epoch_loss += loss.item()
30
+ count += 1
31
+ all_label = np.vstack(all_label)
32
+ all_pred_label = np.vstack(all_pred_label)
33
+ labels = _classification._check_set_wise_labels(all_label, all_pred_label,labels=None, pos_label=1, average='samples')
34
+ MCM = metrics.multilabel_confusion_matrix(all_label, all_pred_label,sample_weight=None, labels=labels, samplewise=True)
35
+ tp_sum = MCM[:, 1, 1]
36
+ fp_sum = MCM[:, 0, 1]
37
+ fn_sum = MCM[:, 1, 0]
38
+ # tn_sum = MCM[:, 0, 0]
39
+ accuracy = _prf_divide(tp_sum, tp_sum+fp_sum+fn_sum, zero_division=1)
40
+ accuracy = np.average(accuracy)
41
+ all_precision = metrics.precision_score(all_label, all_pred_label, average='samples', zero_division=1)
42
+ all_recall = metrics.recall_score(all_label, all_pred_label, average='samples', zero_division=1)
43
+ all_f1_score = metrics.f1_score(all_label, all_pred_label, average='samples', zero_division=1)
44
+ return epoch_loss/count, all_precision, all_recall, accuracy, all_f1_score
45
+
46
+
47
+ def _prf_divide(numerator, denominator, zero_division="warn"):
48
+ """Performs division and handles divide-by-zero.
49
+ On zero-division, sets the corresponding result elements equal to
50
+ 0 or 1 (according to ``zero_division``). Plus, if
51
+ ``zero_division != "warn"`` raises a warning.
52
+ The metric, modifier and average arguments are used only for determining
53
+ an appropriate warning.
54
+ """
55
+ mask = denominator == 0.0
56
+ denominator = denominator.copy()
57
+ denominator[mask] = 1 # avoid infs/nans
58
+ result = numerator / denominator
59
+
60
+ if not np.any(mask):
61
+ return result
62
+
63
+ # if ``zero_division=1``, set those with denominator == 0 equal to 1
64
+ result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0
65
+
66
+ # the user will be removing warnings if zero_division is set to something
67
+ # different than its default value. If we are computing only f-score
68
+ # the warning will be raised only if precision and recall are ill-defined
69
+ if zero_division != "warn":
70
+ return result
71
+
72
+ if __name__ == "__main__":
73
+ model_path = './models/Video2Roll_50_0.4/14.pth'
74
+ device = torch.device('cuda')
75
+ net = Video2RollNet.resnet18()
76
+ # net = torch.nn.DataParallel(net)
77
+ net.cuda()
78
+ net.load_state_dict(torch.load(model_path))
79
+ print(net)
80
+ test_dataset = Video2RollDataset(subset='test')
81
+ test_data_loader = DataLoader(test_dataset, batch_size=64)
82
+ net.eval()
83
+ criterion=nn.BCEWithLogitsLoss()
84
+ val_avg_loss, val_avg_precision, val_avg_recall, val_avg_acc, val_fscore = validate(net, criterion, test_data_loader)
85
+ epoch = 0
86
+ print('-' * 85)
87
+ print(
88
+ "epoch {0} validation loss:{1:.3f} | avg precision:{2:.3f} | avg recall:{3:.3f} | avg acc:{4:.3f} | f1 score:{5:.3f}".format(
89
+ epoch + 1, val_avg_loss, val_avg_precision, val_avg_recall, val_avg_acc, val_fscore))
90
+ print('-' * 85)
src/audeo/Video2Roll_inference.py ADDED
@@ -0,0 +1,151 @@
1
+ import Video2RollNet
2
+ import os
3
+ import glob
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torchvision.transforms as transforms
7
+ import torch
8
+ transform = transforms.Compose([lambda x: x.resize((900,100)),
9
+ lambda x: np.reshape(x,(100,900,1)),
10
+ lambda x: np.transpose(x,[2,0,1]),
11
+ lambda x: x/255.])
12
+
13
+ # video images root dir, change to your path
14
+ img_root='./data/frame'
15
+ # labels root dir, change to your path
16
+ label_root='./data/label'
17
+ # midi ground truth root dir, change to your path
18
+ midi_root = './data/midi_npz'
19
+ # Roll prediction output, change to your path
20
+ #est_roll_root = '/ailab-train/speech/shansizhe/audeo/data/estimate_Roll_exp3/'
21
+
22
+ # the range of Piano keys (maximum is 88), depending on your data
23
+ min_key = 15
24
+ max_key = 65
25
+
26
+ def load_data(img_folder, label_file, midi_folder):
27
+ img_files = glob.glob(img_folder + '/*.jpg')
28
+ img_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0][5:]))
29
+ labels = np.load(label_file, allow_pickle=True)
30
+ # Midi info for every video is divided into multiple npz files
31
+ # each npz contains 2 seconds (50 frames) Midi information
32
+ # format: frame_{i}-frame_{i+50}.npz
33
+ midi_files = glob.glob(midi_folder + '/*.npz')
34
+ midi_files.sort(key=lambda x: int(x.split('/')[-1].split('.')[0].split('-')[0].split('_')[1]))
35
+ intervals = []
36
+ for file in midi_files:
37
+ interval = file.split('/')[-1].split('.')[0].split('-')
38
+ start = int(interval[0].split('_')[1])
39
+ end = int(interval[1].split('_')[1])
40
+ intervals.append([start, end])
41
+ data = []
42
+ for i, file in enumerate(img_files):
43
+ key = int(file.split('/')[-1].split('.')[0][5:])
44
+ label = np.where(labels[key] > 0, 1, 0)
45
+ new_label = label[min_key:max_key + 1]
46
+ if i >= 2 and i < len(img_files) - 2:
47
+ file_list = [img_files[i - 2], img_files[i - 1], file, img_files[i + 1], img_files[i + 2]]
48
+ elif i < 2:
49
+ file_list = [file, file, file, img_files[i + 1], img_files[i + 2]]
50
+ else:
51
+ file_list = [img_files[i - 2], img_files[i - 1], file, file, file]
52
+ data.append((file_list, new_label))
53
+ print("data", i, file, file_list, new_label)
54
+ return intervals, data
55
+
56
+ # infer 2 seconds every time
57
+ def inference(net, intervals, data, est_roll_folder):
58
+ net.eval()
59
+ i = 0
60
+ for interval in intervals:
61
+ start, end = interval
62
+ print("infer interval {0} - {1}".format(start, end))
63
+ save_est_roll = []
64
+ save_est_logit = []
65
+ infer_data = data[i:i+50]
66
+ for frame in infer_data:
67
+ file_list, label = frame
68
+ torch_input_img, torch_label = torch_preprocess(file_list, label)
69
+ logits = net(torch.unsqueeze(torch_input_img,dim=0))
70
+ print("####", torch_input_img.shape, torch_label.shape, logits.shape)
71
+ pred_label = torch.sigmoid(logits) >= 0.4
72
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
73
+ numpy_logit = logits.cpu().detach().numpy()
74
+ save_est_roll.append(numpy_pre_label)
75
+ save_est_logit.append(numpy_logit)
76
+ # Roll prediction
77
+ target = np.zeros((50, 88))
78
+ target[:, min_key:max_key+1] = np.asarray(save_est_roll).squeeze()
79
+ save_est_roll = target
80
+ # Logit
81
+ target_ = np.zeros((50, 88))
82
+ target_[:, min_key:max_key + 1] = np.asarray(save_est_logit).squeeze()
83
+ save_est_logit = target_
84
+ # save both Roll predictions and logits as npz files
85
+ np.savez(f'{est_roll_folder}/' + str(start) + '-' + str(end) + '.npz', logit=save_est_logit, roll=save_est_roll)
86
+ i = i+50
87
+
88
+ def torch_preprocess(input_file_list, label):
89
+ input_img_list = []
90
+ for input_file in input_file_list:
91
+ input_img = Image.open(input_file).convert('L')
92
+ binarr = np.array(input_img)
93
+ input_img = Image.fromarray(binarr.astype(np.uint8))
94
+ input_img_list.append(input_img)
95
+ new_input_img_list = []
96
+ for input_img in input_img_list:
97
+ new_input_img_list.append(transform(input_img))
98
+ final_input_img = np.concatenate(new_input_img_list)
99
+ torch_input_img = torch.from_numpy(final_input_img).float().cuda()
100
+ torch_label = torch.from_numpy(label).float().cuda()
101
+ return torch_input_img, torch_label
102
+
103
+
104
+ if __name__ == "__main__":
105
+ model_path = './models/Video2Roll_50_0.4/14.pth' # change to your path
106
+ device = torch.device('cuda')
107
+ net = Video2RollNet.resnet18()
108
+ net.cuda()
109
+ net.load_state_dict(torch.load(model_path))
110
+
111
+ #training_data = [True,False]
112
+ training_data = [False]
113
+ # infer Roll predictions
114
+ folders = {}
115
+
116
+ train_img_folder = glob.glob(img_root +'/training/*')
117
+ train_img_folder.sort(key=lambda x:int(x.split('/')[-1]))
118
+ test_img_folder = glob.glob(img_root +'/testing/*')
119
+ test_img_folder.sort(key=lambda x:int(x.split('/')[-1]))
120
+ train_label_folder = glob.glob(label_root +'/training/*')
121
+ train_label_folder.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))
122
+ test_label_folder = glob.glob(label_root +'/testing/*')
123
+ test_label_folder.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))
124
+ train_midi_folder = glob.glob(midi_root +'/training/*')
125
+ train_midi_folder.sort(key=lambda x:int(x.split('/')[-1]))
126
+ test_midi_folder = glob.glob(midi_root +'/testing/*')
127
+ test_midi_folder.sort(key=lambda x:int(x.split('/')[-1]))
128
+
129
+ folders['train'] = [(train_img_folder[i],train_label_folder[i],train_midi_folder[i]) for i in range(len(train_img_folder))]
130
+ print(folders['train'])
131
+ folders['test'] = [(test_img_folder[i],test_label_folder[i],test_midi_folder[i]) for i in range(len(test_img_folder))]
132
+ print(folders['test'])
133
+ for item in training_data:
134
+ if item:
135
+ for img_folder, label_file, midi_folder in folders['train']:
136
+ est_roll_folder = midi_folder.replace('midi_npz','estimate_Roll_exp4')
137
+ #/ailab-train/speech/shansizhe/audeo/data/midi_npz/testing/2
138
+ print("save file in:", est_roll_folder)
139
+ os.makedirs(est_roll_folder, exist_ok=True)
140
+ intervals, data = load_data(img_folder, label_file, midi_folder)
141
+ print("starting inference--------------------")
142
+ inference(net,intervals, data, est_roll_folder)
143
+ else:
144
+ for img_folder, label_file, midi_folder in folders['test']:
145
+ est_roll_folder = midi_folder.replace('midi_npz','estimate_Roll_exp4')
146
+ print("save file in:", est_roll_folder)
147
+ os.makedirs(est_roll_folder, exist_ok=True)
148
+ intervals, data = load_data(img_folder, label_file, midi_folder)
149
+ print("starting inference--------------------")
150
+ inference(net, intervals, data, est_roll_folder)
151
+
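The midi npz naming scheme described in load_data above (frame_{i}-frame_{i+50}.npz, one file per 2 seconds / 50 frames) is what drives the 50-frame inference windows. A small sketch of the interval parsing on a hypothetical filename (the path below is made up):
fname = '/some/midi_npz/training/1/frame_100-frame_150.npz'   # hypothetical path
interval = fname.split('/')[-1].split('.')[0].split('-')      # ['frame_100', 'frame_150']
start = int(interval[0].split('_')[1])
end = int(interval[1].split('_')[1])
print(start, end)                                             # 100 150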
src/audeo/Video2Roll_solver.py ADDED
@@ -0,0 +1,204 @@
1
+ import time
2
+ import numpy as np
3
+ import torch
4
+ from sklearn import metrics
5
+ from sklearn.metrics import _classification
6
+ from torch.utils.tensorboard import SummaryWriter
7
+ from tqdm import tqdm
8
+ import os
9
+
10
+ class Solver(object):
11
+
12
+ def __init__(self, data_loader, test_data_loader, model, criterion, optimizer, lr_scheduler, epochs):
13
+ self.save_model_path = '/ailab-train/speech/shansizhe/audeo/models/Video2Roll_50_0.4/' # change to your path
14
+ self.test_loader = test_data_loader
15
+ self.data_loader = data_loader
16
+ self.net = model
17
+ self.criterion = criterion
18
+ self.optimizer = optimizer
19
+ self.lr_scheduler = lr_scheduler
20
+ # Training config
21
+ self.epochs = epochs
22
+ # logging
23
+ self.step = 0
24
+ self.global_step = 0
25
+ self.writer = SummaryWriter(log_dir='/ailab-train/speech/shansizhe/audeo/log/50_0.4/')
26
+ # visualizing loss using visdom
27
+ self.tr_loss = torch.Tensor(self.epochs)
28
+ self.val_loss = torch.zeros(self.epochs)
29
+ self.visdom = False
30
+ self.visdom_epoch = 1
31
+ self.visdom_id = 'key classification'
32
+ if self.visdom:
33
+ from visdom import Visdom
34
+ self.vis = Visdom(env=self.visdom_id)
35
+ self.vis_opts = dict(title=self.visdom_id,
36
+ ylabel='Loss', xlabel='Epoch',
37
+ legend=['train loss', 'val loss'])
38
+ self.vis_window = None
39
+ self.vis_epochs = torch.arange(1, self.epochs + 1)
40
+
41
+ def train(self):
42
+ # Train model multi-epoches
43
+ pre_val_loss = 1e4
44
+ for epoch in tqdm(range(self.epochs)):
45
+ print("Training...")
46
+ self.net.train() # Turn on BatchNorm & Dropout
47
+ start = time.time()
48
+ # training loop
49
+ tr_avg_loss, tr_avg_precision, tr_avg_recall = self.train_loop()
50
+
51
+ # evaluate
52
+ self.net.eval()
53
+ val_avg_loss, val_avg_precision, val_avg_recall, val_avg_acc, val_fscore = self.validate()
54
+ print('-' * 85)
55
+ print('Train Summary | Epoch {0} | Time {1:.2f}s | '
56
+ 'Train Loss {2:.3f}'.format(
57
+ epoch+1, time.time() - start, tr_avg_loss, tr_avg_precision, tr_avg_recall))
58
+ print("epoch {0} validation loss:{1:.3f} | avg precision:{2:.3f} | avg recall:{3:.3f} | avg acc:{4:.3f} | f1 score:{5:.3f}".format(
59
+ epoch+1, val_avg_loss, val_avg_precision, val_avg_recall, val_avg_acc, val_fscore))
60
+ print('-' * 85)
61
+
62
+ # Log metrics to TensorBoard
63
+ self.writer.add_scalar('Loss/train', tr_avg_loss, epoch)
64
+ self.writer.add_scalar('Precision/train', tr_avg_precision, epoch)
65
+ self.writer.add_scalar('Recall/train', tr_avg_recall, epoch)
66
+ self.writer.add_scalar('Loss/val', val_avg_loss, epoch)
67
+ self.writer.add_scalar('Precision/val', val_avg_precision, epoch)
68
+ self.writer.add_scalar('Recall/val', val_avg_recall, epoch)
69
+ self.writer.add_scalar('Accuracy/val', val_avg_acc, epoch)
70
+ self.writer.add_scalar('F1_score/val', val_fscore, epoch)
71
+
72
+ os.makedirs(self.save_model_path, exist_ok=True)
73
+ model_save_path = f"{self.save_model_path}{epoch}.pth"
74
+ torch.save(self.net.state_dict(), model_save_path)
75
+ if val_avg_loss < pre_val_loss:
76
+ pre_val_loss = val_avg_loss
77
+ torch.save(self.net.state_dict(), f"{self.save_model_path}best.pth")
78
+ # Save model each epoch
79
+ self.val_loss[epoch] = val_avg_loss
80
+ self.tr_loss[epoch] = tr_avg_loss
81
+
82
+ # visualizing loss using visdom
83
+ if self.visdom:
84
+ x_axis = self.vis_epochs[0:epoch + 1]
85
+ # train_y_axis = self.tr_loss[0:epoch+1]
86
+ # val_x_axis = self.vis_epochs[0:epoch+1:10]
87
+ # val_y_axis = self.val_loss[0:epoch//10+1]
88
+ y_axis = torch.stack(
89
+ (self.tr_loss[0:epoch + 1], self.val_loss[0:epoch + 1]), dim=1)
90
+ if self.vis_window is None:
91
+ self.vis_window = self.vis.line(
92
+ X=x_axis,
93
+ Y=y_axis,
94
+ opts=self.vis_opts,
95
+ )
96
+ else:
97
+ self.vis.line(
98
+ X=x_axis.unsqueeze(0).expand(y_axis.size(
99
+ 1), x_axis.size(0)).transpose(0, 1), # Visdom fix
100
+ Y=y_axis,
101
+ win=self.vis_window,
102
+ update='replace',
103
+ )
104
+
105
+ def train_loop(self):
106
+ data_loader = self.data_loader
107
+ epoch_loss = 0
108
+ epoch_precision = 0
109
+ epoch_recall = 0
110
+ count = 0
111
+ start = time.time()
112
+
113
+ for i, data in tqdm(enumerate(data_loader)):
114
+ imgs, label = data
115
+ logits = self.net(imgs)
116
+ loss = self.criterion(logits,label)
117
+ # set the threshold of the logits
118
+ pred_label = torch.sigmoid(logits) >= 0.4
119
+ numpy_label = label.cpu().detach().numpy().astype(int)
120
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
121
+
122
+ precision = metrics.precision_score(numpy_label,numpy_pre_label, average='samples', zero_division=1)
123
+ recall = metrics.recall_score(numpy_label,numpy_pre_label, average='samples', zero_division=1)
124
+
125
+ self.writer.add_scalar('loss/step', loss, self.global_step)
126
+ self.writer.add_scalar('precision/step', precision, self.global_step)
127
+ self.writer.add_scalar('recall/step', recall, self.global_step)
128
+
129
+ if self.global_step % 100 == 0:
130
+ end = time.time()
131
+ print(
132
+ "step {0} loss:{1:.4f} | precision:{2:.3f} | recall:{3:.3f} | time:{4:.2f}".format(self.global_step, loss.item(), precision,
133
+ recall,end - start))
134
+ start = end
135
+
136
+ epoch_precision += precision
137
+ epoch_recall += recall
138
+ epoch_loss += loss.item()
139
+ self.optimizer.zero_grad()
140
+ loss.backward()
141
+ self.optimizer.step()
142
+ count += 1
143
+ self.global_step += 1
144
+ self.lr_scheduler.step(epoch_loss / count)
145
+ return epoch_loss/count, epoch_precision/count, epoch_recall/count
146
+
147
+ def validate(self):
148
+ epoch_loss = 0
149
+ count = 0
150
+ all_pred_label = []
151
+ all_label = []
152
+ with torch.no_grad():
153
+ for i, data in enumerate(self.test_loader):
154
+ imgs, label = data
155
+ logits = self.net(imgs)
156
+ loss = self.criterion(logits, label)
157
+ pred_label = torch.sigmoid(logits) >= 0.4
158
+ numpy_label = label.cpu().detach().numpy().astype(int)
159
+ numpy_pre_label = pred_label.cpu().detach().numpy().astype(int)
160
+ all_label.append(numpy_label)
161
+ all_pred_label.append(numpy_pre_label)
162
+ epoch_loss += loss.item()
163
+ count += 1
164
+ all_label = np.vstack(all_label)
165
+ all_pred_label = np.vstack(all_pred_label)
166
+ labels = _classification._check_set_wise_labels(all_label, all_pred_label,labels=None, pos_label=1, average='samples')
167
+ MCM = metrics.multilabel_confusion_matrix(all_label, all_pred_label,sample_weight=None, labels=labels, samplewise=True)
168
+ tp_sum = MCM[:, 1, 1]
169
+ fp_sum = MCM[:, 0, 1]
170
+ fn_sum = MCM[:, 1, 0]
171
+ # tn_sum = MCM[:, 0, 0]
172
+ accuracy = _prf_divide(tp_sum, tp_sum+fp_sum+fn_sum, zero_division=1)
173
+ accuracy = np.average(accuracy)
174
+ all_precision = metrics.precision_score(all_label, all_pred_label, average='samples', zero_division=1)
175
+ all_recall = metrics.recall_score(all_label, all_pred_label, average='samples', zero_division=1)
176
+ all_f1_score = metrics.f1_score(all_label, all_pred_label, average='samples', zero_division=1)
177
+ return epoch_loss/count, all_precision, all_recall, accuracy, all_f1_score
178
+
179
+
180
+ def _prf_divide(numerator, denominator, zero_division="warn"):
181
+ """Performs division and handles divide-by-zero.
182
+ On zero-division, sets the corresponding result elements equal to
183
+ 0 or 1 (according to ``zero_division``). Plus, if
184
+ ``zero_division != "warn"`` raises a warning.
185
+ The metric, modifier and average arguments are used only for determining
186
+ an appropriate warning.
187
+ """
188
+ mask = denominator == 0.0
189
+ denominator = denominator.copy()
190
+ denominator[mask] = 1 # avoid infs/nans
191
+ result = numerator / denominator
192
+
193
+ if not np.any(mask):
194
+ return result
195
+
196
+ # if ``zero_division=1``, set those with denominator == 0 equal to 1
197
+ result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0
198
+
199
+ # the user will be removing warnings if zero_division is set to something
200
+ # different than its default value. If we are computing only f-score
201
+ # the warning will be raised only if precision and recall are ill-defined
202
+ if zero_division != "warn":
203
+ return result
204
+
src/audeo/Video2Roll_train.py ADDED
@@ -0,0 +1,26 @@
1
+ from Video2Roll_dataset import Video2RollDataset
2
+ from torch.utils.data import DataLoader
3
+ import torch
4
+ from torch import optim
5
+
6
+ import Video2RollNet
7
+
8
+ from Video2Roll_solver import Solver
9
+ import torch.nn as nn
10
+ from balance_data import MultilabelBalancedRandomSampler
11
+
12
+ if __name__ == "__main__":
13
+ train_dataset = Video2RollDataset(subset='train')
14
+ train_sampler = MultilabelBalancedRandomSampler(train_dataset.train_labels)
15
+ train_data_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
16
+ test_dataset = Video2RollDataset(subset='test')
17
+ test_data_loader = DataLoader(test_dataset, batch_size=64)
18
+ device = torch.device('cuda:6')
19
+
20
+ net = Video2RollNet.resnet18()
21
+ net.cuda()
22
+ optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
23
+ criterion = nn.BCEWithLogitsLoss()
24
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
25
+ solver = Solver(train_data_loader, test_data_loader, net, criterion, optimizer, scheduler, epochs=50)
26
+ solver.train()
src/audeo/Video_Id.md ADDED
@@ -0,0 +1,30 @@
1
+ # Training
2
+ - https://youtu.be/_3qnL9ddHuw
3
+ - https://youtu.be/HB8-w5CvMls
4
+ - https://youtu.be/vGdV4mJhaKU
5
+ - https://youtu.be/W5lOLZsjOp8
6
+ - https://youtu.be/vHi3_k4XOrA
7
+ - https://youtu.be/PIS76X17Mf8
8
+ - https://youtu.be/DMdJLEGrUrg
9
+ - https://youtu.be/xXwCryMItHs
10
+ - https://youtu.be/49dCBsIGsgY
11
+ - https://youtu.be/OZVMVVQPPPI
12
+ - https://youtu.be/cAnmwgC-JRw
13
+ - https://youtu.be/w77mBaWOOh0
14
+ - https://youtu.be/MGMxImcYhiI
15
+ - https://youtu.be/WqFyqbD9VEQ
16
+ - https://youtu.be/V0P_2QG84MM
17
+ - https://youtu.be/1eEcy3MgqxA
18
+ - https://youtu.be/GH-kkZQQ8G8
19
+ - https://youtu.be/Kk58v56rD0s
20
+ - https://youtu.be/WWqRR7RZGXw
21
+ - https://youtu.be/ouhp7O3Sz8M
22
+ - https://youtu.be/U0v4CckNE68
23
+ - https://youtu.be/VaqWF70DjYs
24
+ - https://youtu.be/m2yadhLP8H8
25
+ - https://youtu.be/wRJlm0lCyoI
26
+
27
+ # Testing
28
+ - https://youtu.be/u5nBBJndN3I
29
+ - https://youtu.be/nwwHuxHMIpc
30
+ - https://youtu.be/ra1jf2nzJPg
src/audeo/balance_data.py ADDED
@@ -0,0 +1,91 @@
1
+ import random
2
+ import numpy as np
3
+ import torch
4
+ from torch.utils.data.sampler import Sampler
5
+ # torch.cuda.set_device(1)
6
+
7
+ class MultilabelBalancedRandomSampler(Sampler):
8
+ """
9
+ MultilabelBalancedRandomSampler: Given a multilabel dataset of length n_samples and
10
+ number of classes n_classes, samples from the data with equal probability per class
11
+ effectively oversampling minority classes and undersampling majority classes at the
12
+ same time. Note that using this sampler does not guarantee that the distribution of
13
+ classes in the output samples will be uniform, since the dataset is multilabel and
14
+ sampling is based on a single class. This does however guarantee that all classes
15
+ will have at least batch_size / n_classes samples as batch_size approaches infinity
16
+ """
17
+
18
+ def __init__(self, labels, indices=None, class_choice="random"):
19
+ """
20
+ Parameters:
21
+ -----------
22
+ labels: a multi-hot encoding numpy array of shape (n_samples, n_classes)
23
+ indices: an arbitrary-length 1-dimensional numpy array representing a list
24
+ of indices to sample only from.
25
+ class_choice: a string indicating how class will be selected for every
26
+ sample.
27
+ "random": class is chosen uniformly at random.
28
+ "cycle": the sampler cycles through the classes sequentially.
29
+ """
30
+ self.labels = labels
31
+ self.indices = indices
32
+ if self.indices is None:
33
+ self.indices = range(len(labels))
34
+ self.map = []
35
+ for class_ in range(self.labels.shape[1]):
36
+ lst = np.where(self.labels[:, class_] == 1)[0]
37
+ lst = lst[np.isin(lst, self.indices)]
38
+ self.map.append(lst)
39
+ all_zero = []
40
+ for row in range(self.labels.shape[0]):
41
+ if not np.any(labels[row]):
42
+ all_zero.append(row)
43
+
44
+ print("all zero sample number is: ",len(all_zero))
45
+ self.map.append(all_zero)
46
+ print("counting-----")
47
+ for i in range(len(self.map)):
48
+ print("class {0} has {1} samples:".format(i,len(self.map[i])))
49
+
50
+ assert class_choice in ["random", "cycle"]
51
+ self.class_choice = class_choice
52
+ self.current_class = 0
53
+
54
+ def __iter__(self):
55
+ self.count = 0
56
+ return self
57
+
58
+ def __next__(self):
59
+ # if self.count >= len(self.indices):
60
+ if self.count >= 20000:
61
+ raise StopIteration
62
+ self.count += 1
63
+ return self.sample()
64
+
65
+ def sample(self):
66
+ if self.class_choice == "random":
67
+ class_ = random.randint(0, self.labels.shape[1])  # upper bound is inclusive, covering the extra all-zero "class" appended to self.map
68
+ # print(class_)
69
+ elif self.class_choice == "cycle":
70
+ class_ = self.current_class
71
+ self.current_class = (self.current_class + 1) % self.labels.shape[1]
72
+ class_indices = self.map[class_]
73
+ return np.random.choice(class_indices)
74
+
75
+ def __len__(self):
76
+ return 20000
77
+ # return len(self.indices)
78
+
79
+ # if __name__ == "__main__":
80
+ # train_dataset = Video2RollDataset(subset='train')
81
+ # train_sampler = MultilabelBalancedRandomSampler(train_dataset.train_labels)
82
+ # train_data_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
83
+ # for i, data in enumerate(train_data_loader):
84
+ # print(i)
85
+ # imgs,label,ref_imgs,rng = data
86
+ # print(torch.unique(torch.nonzero(label)[:,1]))
87
+ # for j in range(len(label)):
88
+ # if label[j].sum()==0:
89
+ # print("yes")
90
+ # if i == 1:
91
+ # break
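A minimal usage sketch of the sampler above, mirroring the commented-out block at the end of the file. The multi-hot labels, dummy features, and shapes below are illustrative stand-ins, not the real Video2RollDataset:

# Hypothetical usage of MultilabelBalancedRandomSampler with stand-in data.
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

labels = (np.random.rand(1000, 51) > 0.95).astype(np.int64)  # illustrative multi-hot labels
features = torch.randn(1000, 8)                              # dummy input features
dataset = TensorDataset(features, torch.from_numpy(labels))

sampler = MultilabelBalancedRandomSampler(labels, class_choice="random")
loader = DataLoader(dataset, batch_size=64, sampler=sampler)

for i, (x, y) in enumerate(loader):
    # Rare classes appear far more often per batch than under plain shuffling.
    if i == 1:
        break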
src/audeo/models/Video2Roll_50_0.4/14.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e46b8dcf33cb6bf953fe09326edb0bbdcf06b697f64a6f448e3baa42bd822c
3
+ size 50945493
src/audeo/piano_coords.py ADDED
@@ -0,0 +1,9 @@
1
+ # upper_left_x, upper_left_y, lower_right_x, lower_right_y
2
+ train_piano_coords = [(68,674,1869,863), (38,680,1882,875), (42,678,1870,874), (42,678,1870,874),
3
+ (44,670,1876,865), (35,678,1875,869), (30,451,1249,583), (28,454,1254,584),
4
+ (39,678,1886,881), (33,671,1886,860), (29,446,1252,576), (26,447,1252,577),
5
+ (42,673,1879,871), (43,669,1870,869), (45,675,1864,870), (53,674,1868,860),
6
+ (51,679,1866,866), (51,674,1861,861), (48,674,1878,861), (45,671,1879,870),
7
+ (50,671,1879,866), (54,670,1864,863), (50,670,1870,867), (43,673,1882,869)]
8
+
9
+ test_piano_coords = [(41,679,1880,881), (43,675,1883,875), (40,671,1879,871)]
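For orientation only, a sketch of how one of these bounding boxes could crop the keyboard strip out of a frame. The dummy 1080p frame and the NumPy slicing are assumptions for illustration, not code from this repo:

# Hypothetical crop using the first training tuple; coordinate order follows
# the comment above: (upper_left_x, upper_left_y, lower_right_x, lower_right_y).
import numpy as np

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)  # dummy video frame
x1, y1, x2, y2 = train_piano_coords[0]
keyboard = frame[y1:y2, x1:x2]                     # rows = y range, cols = x range
print(keyboard.shape)                              # (189, 1801, 3) for (68, 674, 1869, 863)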
src/audeo/thumbnail_image.png ADDED

Git LFS Details

  • SHA256: edbba8fb9a0d6b1ca69c09482a88556882a0a99c6e34c5c4b5d39a1472fdb64b
  • Pointer size: 131 Bytes
  • Size of remote file: 125 kB
src/audeo/videomae_fintune.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/audioldm/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .ldm import LatentDiffusion
2
+ from .utils import seed_everything, save_wave, get_time, get_duration
3
+ from .pipeline import *
4
+
5
+
6
+
7
+
8
+
src/audioldm/__main__.py ADDED
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/python3
2
+ import os
3
+ from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
4
+ import argparse
5
+
6
+ CACHE_DIR = os.getenv(
7
+ "AUDIOLDM_CACHE_DIR",
8
+ os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
9
+
10
+ parser = argparse.ArgumentParser()
11
+
12
+ parser.add_argument(
13
+ "--mode",
14
+ type=str,
15
+ required=False,
16
+ default="generation",
17
+ help="generation: text-to-audio generation; transfer: style transfer",
18
+ choices=["generation", "transfer"]
19
+ )
20
+
21
+ parser.add_argument(
22
+ "-t",
23
+ "--text",
24
+ type=str,
25
+ required=False,
26
+ default="",
27
+ help="Text prompt to the model for audio generation",
28
+ )
29
+
30
+ parser.add_argument(
31
+ "-f",
32
+ "--file_path",
33
+ type=str,
34
+ required=False,
35
+ default=None,
36
+ help="(--mode transfer): Original audio file for style transfer; Or (--mode generation): the guidance audio file for generating simialr audio",
37
+ )
38
+
39
+ parser.add_argument(
40
+ "--transfer_strength",
41
+ type=float,
42
+ required=False,
43
+ default=0.5,
44
+ help="A value between 0 and 1. 0 means original audio without transfer, 1 means completely transfer to the audio indicated by text",
45
+ )
46
+
47
+ parser.add_argument(
48
+ "-s",
49
+ "--save_path",
50
+ type=str,
51
+ required=False,
52
+ help="The path to save model output",
53
+ default="./output",
54
+ )
55
+
56
+ parser.add_argument(
57
+ "--model_name",
58
+ type=str,
59
+ required=False,
60
+ help="The checkpoint you gonna use",
61
+ default="audioldm-s-full",
62
+ choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
63
+ )
64
+
65
+ parser.add_argument(
66
+ "-ckpt",
67
+ "--ckpt_path",
68
+ type=str,
69
+ required=False,
70
+ help="The path to the pretrained .ckpt model",
71
+ default=None,
72
+ )
73
+
74
+ parser.add_argument(
75
+ "-b",
76
+ "--batchsize",
77
+ type=int,
78
+ required=False,
79
+ default=1,
80
+ help="Generate how many samples at the same time",
81
+ )
82
+
83
+ parser.add_argument(
84
+ "--ddim_steps",
85
+ type=int,
86
+ required=False,
87
+ default=200,
88
+ help="The sampling step for DDIM",
89
+ )
90
+
91
+ parser.add_argument(
92
+ "-gs",
93
+ "--guidance_scale",
94
+ type=float,
95
+ required=False,
96
+ default=2.5,
97
+ help="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)",
98
+ )
99
+
100
+ parser.add_argument(
101
+ "-dur",
102
+ "--duration",
103
+ type=float,
104
+ required=False,
105
+ default=10.0,
106
+ help="The duration of the samples",
107
+ )
108
+
109
+ parser.add_argument(
110
+ "-n",
111
+ "--n_candidate_gen_per_text",
112
+ type=int,
113
+ required=False,
114
+ default=3,
115
+ help="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
116
+ )
117
+
118
+ parser.add_argument(
119
+ "--seed",
120
+ type=int,
121
+ required=False,
122
+ default=42,
123
+ help="Change this value (any integer number) will lead to a different generation result.",
124
+ )
125
+
126
+ args = parser.parse_args()
127
+
128
+ if(args.ckpt_path is not None):
129
+ print("Warning: ckpt_path has no effect after version 0.0.20.")
130
+
131
+ assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
132
+
133
+ mode = args.mode
134
+ if(mode == "generation" and args.file_path is not None):
135
+ mode = "generation_audio_to_audio"
136
+ if(len(args.text) > 0):
137
+ print("Warning: You have specified the --file_path. --text will be ignored")
138
+ args.text = ""
139
+
140
+ save_path = os.path.join(args.save_path, mode)
141
+
142
+ if(args.file_path is not None):
143
+ save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
144
+
145
+ text = args.text
146
+ random_seed = args.seed
147
+ duration = args.duration
148
+ guidance_scale = args.guidance_scale
149
+ n_candidate_gen_per_text = args.n_candidate_gen_per_text
150
+
151
+ os.makedirs(save_path, exist_ok=True)
152
+ audioldm = build_model(model_name=args.model_name)
153
+
154
+ if(args.mode == "generation"):
155
+ waveform = text_to_audio(
156
+ audioldm,
157
+ text,
158
+ args.file_path,
159
+ random_seed,
160
+ duration=duration,
161
+ guidance_scale=guidance_scale,
162
+ ddim_steps=args.ddim_steps,
163
+ n_candidate_gen_per_text=n_candidate_gen_per_text,
164
+ batchsize=args.batchsize,
165
+ )
166
+
167
+ elif(args.mode == "transfer"):
168
+ assert args.file_path is not None
169
+ assert os.path.exists(args.file_path), "The original audio file \'%s\' for style transfer does not exist." % args.file_path
170
+ waveform = style_transfer(
171
+ audioldm,
172
+ text,
173
+ args.file_path,
174
+ args.transfer_strength,
175
+ random_seed,
176
+ duration=duration,
177
+ guidance_scale=guidance_scale,
178
+ ddim_steps=args.ddim_steps,
179
+ batchsize=args.batchsize,
180
+ )
181
+ waveform = waveform[:,None,:]
182
+
183
+ save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
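For reference, the generation branch of this script can also be driven programmatically. This is only a sketch based on the calls visible above; the function signatures are assumed to be exactly those used in this file, and the prompt and output path are placeholders:

# Hypothetical programmatic use of the text-to-audio path from this script.
from audioldm import build_model, text_to_audio, save_wave, get_time

model = build_model(model_name="audioldm-s-full")
waveform = text_to_audio(
    model,
    "a gentle piano melody",   # placeholder prompt
    None,                      # no guidance audio file
    42,                        # random seed
    duration=10.0,             # must be a multiple of 2.5, per the assertion above
    guidance_scale=2.5,
    ddim_steps=200,
    n_candidate_gen_per_text=3,
    batchsize=1,
)
save_wave(waveform, "./output/generation", name="%s_%s" % (get_time(), "piano"))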
src/audioldm/audio/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .tools import wav_to_fbank, read_wav_file
2
+ from .stft import TacotronSTFT
src/audioldm/audio/audio_processing.py ADDED
@@ -0,0 +1,100 @@
1
+ import torch
2
+ import numpy as np
3
+ import librosa.util as librosa_util
4
+ from scipy.signal import get_window
5
+
6
+
7
+ def window_sumsquare(
8
+ window,
9
+ n_frames,
10
+ hop_length,
11
+ win_length,
12
+ n_fft,
13
+ dtype=np.float32,
14
+ norm=None,
15
+ ):
16
+ """
17
+ # from librosa 0.6
18
+ Compute the sum-square envelope of a window function at a given hop length.
19
+
20
+ This is used to estimate modulation effects induced by windowing
21
+ observations in short-time fourier transforms.
22
+
23
+ Parameters
24
+ ----------
25
+ window : string, tuple, number, callable, or list-like
26
+ Window specification, as in `get_window`
27
+
28
+ n_frames : int > 0
29
+ The number of analysis frames
30
+
31
+ hop_length : int > 0
32
+ The number of samples to advance between frames
33
+
34
+ win_length : [optional]
35
+ The length of the window function. By default, this matches `n_fft`.
36
+
37
+ n_fft : int > 0
38
+ The length of each analysis frame.
39
+
40
+ dtype : np.dtype
41
+ The data type of the output
42
+
43
+ Returns
44
+ -------
45
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
46
+ The sum-squared envelope of the window function
47
+ """
48
+ if win_length is None:
49
+ win_length = n_fft
50
+
51
+ n = n_fft + hop_length * (n_frames - 1)
52
+ x = np.zeros(n, dtype=dtype)
53
+
54
+ # Compute the squared window at the desired length
55
+ win_sq = get_window(window, win_length, fftbins=True)
56
+ win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
57
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
58
+
59
+ # Fill the envelope
60
+ for i in range(n_frames):
61
+ sample = i * hop_length
62
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
63
+ return x
64
+
65
+
66
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
67
+ """
68
+ PARAMS
69
+ ------
70
+ magnitudes: spectrogram magnitudes
71
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
72
+ """
73
+
74
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
75
+ angles = angles.astype(np.float32)
76
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
77
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
78
+
79
+ for i in range(n_iters):
80
+ _, angles = stft_fn.transform(signal)
81
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
82
+ return signal
83
+
84
+
85
+ def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
86
+ """
87
+ PARAMS
88
+ ------
89
+ C: compression factor
90
+ """
91
+ return normalize_fun(torch.clamp(x, min=clip_val) * C)
92
+
93
+
94
+ def dynamic_range_decompression(x, C=1):
95
+ """
96
+ PARAMS
97
+ ------
98
+ C: compression factor used to compress
99
+ """
100
+ return torch.exp(x) / C
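A quick round trip through the two dynamic-range helpers defined in this module, to make the log compression explicit. The input values are illustrative only:

# Illustrative round trip: compression is log(clamp(x, min=clip_val) * C),
# decompression is exp(x) / C, so they invert each other for x >= clip_val.
import torch

x = torch.tensor([1e-7, 0.01, 0.5, 1.0])
compressed = dynamic_range_compression(x)         # log-domain values
restored = dynamic_range_decompression(compressed)
print(restored)  # matches x except the first entry, which was clamped to 1e-5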
src/audioldm/audio/stft.py ADDED
@@ -0,0 +1,186 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+ from scipy.signal import get_window
5
+ from librosa.util import pad_center, tiny
6
+ from librosa.filters import mel as librosa_mel_fn
7
+
8
+ from audioldm.audio.audio_processing import (
9
+ dynamic_range_compression,
10
+ dynamic_range_decompression,
11
+ window_sumsquare,
12
+ )
13
+
14
+
15
+ class STFT(torch.nn.Module):
16
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
17
+
18
+ def __init__(self, filter_length, hop_length, win_length, window="hann"):
19
+ super(STFT, self).__init__()
20
+ self.filter_length = filter_length
21
+ self.hop_length = hop_length
22
+ self.win_length = win_length
23
+ self.window = window
24
+ self.forward_transform = None
25
+ scale = self.filter_length / self.hop_length
26
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
27
+
28
+ cutoff = int((self.filter_length / 2 + 1))
29
+ fourier_basis = np.vstack(
30
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
31
+ )
32
+
33
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
34
+ inverse_basis = torch.FloatTensor(
35
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :]
36
+ )
37
+
38
+ if window is not None:
39
+ assert filter_length >= win_length
40
+ # get window and zero center pad it to filter_length
41
+ fft_window = get_window(window, win_length, fftbins=True)
42
+ fft_window = pad_center(fft_window, filter_length)
43
+ fft_window = torch.from_numpy(fft_window).float()
44
+
45
+ # window the bases
46
+ forward_basis *= fft_window
47
+ inverse_basis *= fft_window
48
+
49
+ self.register_buffer("forward_basis", forward_basis.float())
50
+ self.register_buffer("inverse_basis", inverse_basis.float())
51
+
52
+ def transform(self, input_data):
53
+ device = self.forward_basis.device
54
+ input_data = input_data.to(device)
55
+
56
+ num_batches = input_data.size(0)
57
+ num_samples = input_data.size(1)
58
+
59
+ self.num_samples = num_samples
60
+
61
+ # similar to librosa, reflect-pad the input
62
+ input_data = input_data.view(num_batches, 1, num_samples)
63
+ input_data = F.pad(
64
+ input_data.unsqueeze(1),
65
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
66
+ mode="reflect",
67
+ )
68
+ input_data = input_data.squeeze(1)
69
+
70
+ forward_transform = F.conv1d(
71
+ input_data,
72
+ torch.autograd.Variable(self.forward_basis, requires_grad=False),
73
+ stride=self.hop_length,
74
+ padding=0,
75
+ )#.cpu()
76
+
77
+ cutoff = int((self.filter_length / 2) + 1)
78
+ real_part = forward_transform[:, :cutoff, :]
79
+ imag_part = forward_transform[:, cutoff:, :]
80
+
81
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
82
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
83
+
84
+ return magnitude, phase
85
+
86
+ def inverse(self, magnitude, phase):
87
+ device = self.forward_basis.device
88
+ magnitude, phase = magnitude.to(device), phase.to(device)
89
+
90
+ recombine_magnitude_phase = torch.cat(
91
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
92
+ )
93
+
94
+ inverse_transform = F.conv_transpose1d(
95
+ recombine_magnitude_phase,
96
+ torch.autograd.Variable(self.inverse_basis, requires_grad=False),
97
+ stride=self.hop_length,
98
+ padding=0,
99
+ )
100
+
101
+ if self.window is not None:
102
+ window_sum = window_sumsquare(
103
+ self.window,
104
+ magnitude.size(-1),
105
+ hop_length=self.hop_length,
106
+ win_length=self.win_length,
107
+ n_fft=self.filter_length,
108
+ dtype=np.float32,
109
+ )
110
+ # remove modulation effects
111
+ approx_nonzero_indices = torch.from_numpy(
112
+ np.where(window_sum > tiny(window_sum))[0]
113
+ )
114
+ window_sum = torch.autograd.Variable(
115
+ torch.from_numpy(window_sum), requires_grad=False
116
+ )
117
+ window_sum = window_sum
118
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
119
+ approx_nonzero_indices
120
+ ]
121
+
122
+ # scale by hop ratio
123
+ inverse_transform *= float(self.filter_length) / self.hop_length
124
+
125
+ inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
126
+ inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
127
+
128
+ return inverse_transform
129
+
130
+ def forward(self, input_data):
131
+ self.magnitude, self.phase = self.transform(input_data)
132
+ reconstruction = self.inverse(self.magnitude, self.phase)
133
+ return reconstruction
134
+
135
+
136
+ class TacotronSTFT(torch.nn.Module):
137
+ def __init__(
138
+ self,
139
+ filter_length,
140
+ hop_length,
141
+ win_length,
142
+ n_mel_channels,
143
+ sampling_rate,
144
+ mel_fmin,
145
+ mel_fmax,
146
+ ):
147
+ super(TacotronSTFT, self).__init__()
148
+ self.n_mel_channels = n_mel_channels
149
+ self.sampling_rate = sampling_rate
150
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
151
+ mel_basis = librosa_mel_fn(
152
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
153
+ )
154
+ mel_basis = torch.from_numpy(mel_basis).float()
155
+ self.register_buffer("mel_basis", mel_basis)
156
+
157
+ def spectral_normalize(self, magnitudes, normalize_fun):
158
+ output = dynamic_range_compression(magnitudes, normalize_fun)
159
+ return output
160
+
161
+ def spectral_de_normalize(self, magnitudes):
162
+ output = dynamic_range_decompression(magnitudes)
163
+ return output
164
+
165
+ def mel_spectrogram(self, y, normalize_fun=torch.log):
166
+ """Computes mel-spectrograms from a batch of waves
167
+ PARAMS
168
+ ------
169
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
170
+
171
+ RETURNS
172
+ -------
173
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
174
+ """
175
+ assert torch.min(y.data) >= -1, torch.min(y.data)
176
+ assert torch.max(y.data) <= 1, torch.max(y.data)
177
+
178
+ magnitudes, phases = self.stft_fn.transform(y)
179
+ magnitudes = magnitudes.data
180
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
181
+ mel_output = self.spectral_normalize(mel_output, normalize_fun)
182
+ energy = torch.norm(magnitudes, dim=1)
183
+
184
+ log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
185
+
186
+ return mel_output, log_magnitudes, energy
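A minimal sketch of driving TacotronSTFT on a dummy waveform. The filterbank settings below are common AudioLDM-style values (16 kHz audio, 64 mel bins) and are assumptions, not values read from a config in this file; it also assumes the librosa version this code targets accepts positional mel-filter arguments:

# Hypothetical usage with assumed STFT/mel parameters.
import torch

stft = TacotronSTFT(
    filter_length=1024,
    hop_length=160,
    win_length=1024,
    n_mel_channels=64,
    sampling_rate=16000,
    mel_fmin=0,
    mel_fmax=8000,
)
wav = torch.rand(1, 16000 * 2) * 2 - 1            # 2 s of audio in [-1, 1]
mel, log_mag, energy = stft.mel_spectrogram(wav)
print(mel.shape)                                  # (1, 64, n_frames)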
src/audioldm/audio/tools.py ADDED
@@ -0,0 +1,85 @@
1
+ import torch
2
+ import numpy as np
3
+ import torchaudio
4
+
5
+
6
+ def get_mel_from_wav(audio, _stft):
7
+ audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
8
+ audio = torch.autograd.Variable(audio, requires_grad=False)
9
+ melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
10
+ melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
11
+ log_magnitudes_stft = (
12
+ torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
13
+ )
14
+ energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
15
+ return melspec, log_magnitudes_stft, energy
16
+
17
+
18
+ def _pad_spec(fbank, target_length=1024):
19
+ n_frames = fbank.shape[0]
20
+ p = target_length - n_frames
21
+ # cut and pad
22
+ if p > 0:
23
+ m = torch.nn.ZeroPad2d((0, 0, 0, p))
24
+ fbank = m(fbank)
25
+ elif p < 0:
26
+ fbank = fbank[0:target_length, :]
27
+
28
+ if fbank.size(-1) % 2 != 0:
29
+ fbank = fbank[..., :-1]
30
+
31
+ return fbank
32
+
33
+
34
+ def pad_wav(waveform, segment_length):
35
+ waveform_length = waveform.shape[-1]
36
+ assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
37
+ if segment_length is None or waveform_length == segment_length:
38
+ return waveform
39
+ elif waveform_length > segment_length:
40
+ return waveform[:segment_length]
41
+ elif waveform_length < segment_length:
42
+ temp_wav = np.zeros((1, segment_length))
43
+ temp_wav[:, :waveform_length] = waveform
44
+ return temp_wav
45
+
46
+ def normalize_wav(waveform):
47
+ waveform = waveform - np.mean(waveform)
48
+ waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
49
+ return waveform * 0.5
50
+
51
+
52
+ def read_wav_file(filename, segment_length):
53
+ # waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
54
+ waveform, sr = torchaudio.load(filename) # Faster!!!
55
+ waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
56
+ waveform = waveform.numpy()[0, ...]
57
+ waveform = normalize_wav(waveform)
58
+ waveform = waveform[None, ...]
59
+ waveform = pad_wav(waveform, segment_length)
60
+
61
+ waveform = waveform / np.max(np.abs(waveform))
62
+ waveform = 0.5 * waveform
63
+
64
+ return waveform
65
+
66
+
67
+ def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
68
+ assert fn_STFT is not None
69
+
70
+ # mixup
71
+ waveform = read_wav_file(filename, target_length * 160) # hop size is 160
72
+
73
+ waveform = waveform[0, ...]
74
+ waveform = torch.FloatTensor(waveform)
75
+
76
+ fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
77
+
78
+ fbank = torch.FloatTensor(fbank.T)
79
+ log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)
80
+
81
+ fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
82
+ log_magnitudes_stft, target_length
83
+ )
84
+
85
+ return fbank, log_magnitudes_stft, waveform
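Putting the pieces together, wav_to_fbank expects an already-built TacotronSTFT. A sketch follows; "example.wav" is a placeholder path, and the STFT settings are the same assumed values as in the stft.py sketch above:

# Hypothetical end-to-end call from a wav file to a padded mel filterbank.
from audioldm.audio import TacotronSTFT

fn_STFT = TacotronSTFT(1024, 160, 1024, 64, 16000, 0, 8000)
fbank, log_mag, waveform = wav_to_fbank(
    "example.wav",          # placeholder path to a real audio file
    target_length=1024,
    fn_STFT=fn_STFT,
)
print(fbank.shape)          # (1024, 64): 1024 frames of 64-bin mel features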
src/audioldm/clap/__init__.py ADDED
File without changes
src/audioldm/clap/encoders.py ADDED
@@ -0,0 +1,170 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from audioldm.clap.open_clip import create_model
4
+ from audioldm.clap.training.data import get_audio_features
5
+ import torchaudio
6
+ from transformers import RobertaTokenizer
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
11
+ def __init__(
12
+ self,
13
+ pretrained_path="",
14
+ key="class",
15
+ sampling_rate=16000,
16
+ embed_mode="audio",
17
+ amodel = "HTSAT-tiny",
18
+ unconditional_prob=0.1,
19
+ random_mute=False,
20
+ max_random_mute_portion=0.5,
21
+ training_mode=True,
22
+ ):
23
+ super().__init__()
24
+
25
+ self.key = key
26
+ self.device = "cpu"
27
+ self.precision = "fp32"
28
+ self.amodel = amodel # or 'PANN-14'
29
+ self.tmodel = "roberta" # the best text encoder in our training
30
+ self.enable_fusion = False # False if you do not want to use the fusion model
31
+ self.fusion_type = "aff_2d"
32
+ self.pretrained = pretrained_path
33
+ self.embed_mode = embed_mode
34
+ self.embed_mode_orig = embed_mode
35
+ self.sampling_rate = sampling_rate
36
+ self.unconditional_prob = unconditional_prob
37
+ self.random_mute = random_mute
38
+ self.tokenize = RobertaTokenizer.from_pretrained("roberta-base")
39
+ self.max_random_mute_portion = max_random_mute_portion
40
+ self.training_mode = training_mode
41
+ self.model, self.model_cfg = create_model(
42
+ self.amodel,
43
+ self.tmodel,
44
+ self.pretrained,
45
+ precision=self.precision,
46
+ device=self.device,
47
+ enable_fusion=self.enable_fusion,
48
+ fusion_type=self.fusion_type,
49
+ )
50
+ for p in self.model.parameters():
51
+ p.requires_grad = False
52
+
53
+ self.model.eval()
54
+
55
+ def get_unconditional_condition(self, batchsize):
56
+ self.unconditional_token = self.model.get_text_embedding(
57
+ self.tokenizer(["", ""])
58
+ )[0:1]
59
+ return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)
60
+
61
+ def batch_to_list(self, batch):
62
+ ret = []
63
+ for i in range(batch.size(0)):
64
+ ret.append(batch[i])
65
+ return ret
66
+
67
+ def make_decision(self, probability):
68
+ if float(torch.rand(1)) < probability:
69
+ return True
70
+ else:
71
+ return False
72
+
73
+ def random_uniform(self, start, end):
74
+ val = torch.rand(1).item()
75
+ return start + (end - start) * val
76
+
77
+ def _random_mute(self, waveform):
78
+ # waveform: [bs, t-steps]
79
+ t_steps = waveform.size(-1)
80
+ for i in range(waveform.size(0)):
81
+ mute_size = int(
82
+ self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
83
+ )
84
+ mute_start = int(self.random_uniform(0, t_steps - mute_size))
85
+ waveform[i, mute_start : mute_start + mute_size] = 0
86
+ return waveform
87
+
88
+ def cos_similarity(self, waveform, text):
89
+ # waveform: [bs, t_steps]
90
+ with torch.no_grad():
91
+ self.embed_mode = "audio"
92
+ audio_emb = self(waveform.cuda())
93
+ self.embed_mode = "text"
94
+ text_emb = self(text)
95
+ similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)  # keep only the similarity tensor so .squeeze() below works
96
+ return similarity.squeeze()
97
+
98
+ def forward(self, batch, key=None):
99
+ # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
100
+ # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
101
+ if self.model.training == True and not self.training_mode:
102
+ print(
103
+ "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
104
+ )
105
+ self.model, self.model_cfg = create_model(
106
+ self.amodel,
107
+ self.tmodel,
108
+ self.pretrained,
109
+ precision=self.precision,
110
+ device="cuda",
111
+ enable_fusion=self.enable_fusion,
112
+ fusion_type=self.fusion_type,
113
+ )
114
+ for p in self.model.parameters():
115
+ p.requires_grad = False
116
+ self.model.eval()
117
+
118
+ # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
119
+ if self.embed_mode == "audio":
120
+ with torch.no_grad():
121
+ audio_dict_list = []
122
+ assert (
123
+ self.sampling_rate == 16000
124
+ ), "We only support 16000 sampling rate"
125
+ if self.random_mute:
126
+ batch = self._random_mute(batch)
127
+ # batch: [bs, 1, t-samples]
128
+ batch = torchaudio.functional.resample(
129
+ batch, orig_freq=self.sampling_rate, new_freq=48000
130
+ )
131
+ for waveform in self.batch_to_list(batch):
132
+ audio_dict = {}
133
+ audio_dict = get_audio_features(
134
+ audio_dict,
135
+ waveform,
136
+ 480000,
137
+ data_truncating="fusion",
138
+ data_filling="repeatpad",
139
+ audio_cfg=self.model_cfg["audio_cfg"],
140
+ )
141
+ audio_dict_list.append(audio_dict)
142
+ # [bs, 512]
143
+ embed = self.model.get_audio_embedding(audio_dict_list)
144
+ elif self.embed_mode == "text":
145
+ with torch.no_grad():
146
+ # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
147
+ text_data = self.tokenizer(batch)
148
+ embed = self.model.get_text_embedding(text_data)
149
+
150
+ embed = embed.unsqueeze(1)
151
+ self.unconditional_token = self.model.get_text_embedding(
152
+ self.tokenizer(["", ""])
153
+ )[0:1]
154
+
155
+ for i in range(embed.size(0)):
156
+ if self.make_decision(self.unconditional_prob):
157
+ embed[i] = self.unconditional_token
158
+
159
+ # [bs, 1, 512]
160
+ return embed.detach()
161
+
162
+ def tokenizer(self, text):
163
+ result = self.tokenize(
164
+ text,
165
+ padding="max_length",
166
+ truncation=True,
167
+ max_length=512,
168
+ return_tensors="pt",
169
+ )
170
+ return {k: v.squeeze(0) for k, v in result.items()}
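A sketch of using this wrapper as a text conditioner. The checkpoint path is a placeholder (construction loads real CLAP weights), and the constructor arguments simply mirror the defaults defined above:

# Hypothetical text-embedding call with a placeholder checkpoint path.
clap = CLAPAudioEmbeddingClassifierFreev2(
    pretrained_path="clap.ckpt",   # placeholder; must point to real CLAP weights
    sampling_rate=16000,
    embed_mode="text",
    amodel="HTSAT-tiny",
    unconditional_prob=0.0,        # fully conditional
    training_mode=False,
)
emb = clap(["piano music in a quiet room"])
print(emb.shape)  # roughly (1, 1, 512): one text embedding used for conditioning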
src/audioldm/clap/open_clip/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from .factory import (
2
+ list_models,
3
+ create_model,
4
+ create_model_and_transforms,
5
+ add_model_config,
6
+ )
7
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
8
+ from .model import (
9
+ CLAP,
10
+ CLAPTextCfg,
11
+ CLAPVisionCfg,
12
+ CLAPAudioCfp,
13
+ convert_weights_to_fp16,
14
+ trace_model,
15
+ )
16
+ from .openai import load_openai_model, list_openai_models
17
+ from .pretrained import (
18
+ list_pretrained,
19
+ list_pretrained_tag_models,
20
+ list_pretrained_model_tags,
21
+ get_pretrained_url,
22
+ download_pretrained,
23
+ )
24
+ from .tokenizer import SimpleTokenizer, tokenize
25
+ from .transform import image_transform
src/audioldm/clap/open_clip/bert.py ADDED
@@ -0,0 +1,40 @@
1
+ from transformers import BertTokenizer, BertModel
2
+
3
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
4
+ model = BertModel.from_pretrained("bert-base-uncased")
5
+ text = "Replace me by any text you'd like."
6
+
7
+
8
+ def bert_embeddings(text):
9
+ # text = "Replace me by any text you'd like."
10
+ encoded_input = tokenizer(text, return_tensors="pt")
11
+ output = model(**encoded_input)
12
+ return output
13
+
14
+
15
+ from transformers import RobertaTokenizer, RobertaModel
16
+
17
+ tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
18
+ model = RobertaModel.from_pretrained("roberta-base")
19
+ text = "Replace me by any text you'd like."
20
+
21
+
22
+ def Roberta_embeddings(text):
23
+ # text = "Replace me by any text you'd like."
24
+ encoded_input = tokenizer(text, return_tensors="pt")
25
+ output = model(**encoded_input)
26
+ return output
27
+
28
+
29
+ from transformers import BartTokenizer, BartModel
30
+
31
+ tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
32
+ model = BartModel.from_pretrained("facebook/bart-base")
33
+ text = "Replace me by any text you'd like."
34
+
35
+
36
+ def bart_embeddings(text):
37
+ # text = "Replace me by any text you'd like."
38
+ encoded_input = tokenizer(text, return_tensors="pt")
39
+ output = model(**encoded_input)
40
+ return output
src/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
src/audioldm/clap/open_clip/factory.py ADDED
@@ -0,0 +1,279 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from copy import deepcopy
7
+ from pathlib import Path
8
+
9
+ import torch
10
+
11
+ from .model import CLAP, convert_weights_to_fp16
12
+ from .openai import load_openai_model
13
+ from .pretrained import get_pretrained_url, download_pretrained
14
+ from .transform import image_transform
15
+
16
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
17
+ _MODEL_CONFIGS = {}  # dictionary (model_name: config) of model architecture configs
18
+ CACHE_DIR = os.getenv("AUDIOLDM_CACHE_DIR", "~/.cache/audioldm")
19
+
20
+
21
+
22
+ def _natural_key(string_):
23
+ return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())]
24
+
25
+
26
+ def _rescan_model_configs():
27
+ global _MODEL_CONFIGS
28
+
29
+ config_ext = (".json",)
30
+ config_files = []
31
+ for config_path in _MODEL_CONFIG_PATHS:
32
+ if config_path.is_file() and config_path.suffix in config_ext:
33
+ config_files.append(config_path)
34
+ elif config_path.is_dir():
35
+ for ext in config_ext:
36
+ config_files.extend(config_path.glob(f"*{ext}"))
37
+
38
+ for cf in config_files:
39
+ if os.path.basename(cf)[0] == ".":
40
+ continue # Ignore hidden files
41
+
42
+ with open(cf, "r") as f:
43
+ model_cfg = json.load(f)
44
+ if all(a in model_cfg for a in ("embed_dim", "audio_cfg", "text_cfg")):
45
+ _MODEL_CONFIGS[cf.stem] = model_cfg
46
+
47
+ _MODEL_CONFIGS = {
48
+ k: v
49
+ for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))
50
+ }
51
+
52
+
53
+ _rescan_model_configs() # initial populate of model config registry
54
+
55
+
56
+ def load_state_dict(checkpoint_path: str, map_location="cpu", skip_params=True):
57
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
58
+ if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
59
+ state_dict = checkpoint["state_dict"]
60
+ else:
61
+ state_dict = checkpoint
62
+ if skip_params:
63
+ if next(iter(state_dict.items()))[0].startswith("module"):
64
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
65
+ # for k in state_dict:
66
+ # if k.startswith('transformer'):
67
+ # v = state_dict.pop(k)
68
+ # state_dict['text_branch.' + k[12:]] = v
69
+ return state_dict
70
+
71
+
72
+ def create_model(
73
+ amodel_name: str,
74
+ tmodel_name: str,
75
+ pretrained: str = "",
76
+ precision: str = "fp32",
77
+ device: torch.device = torch.device("cpu"),
78
+ jit: bool = False,
79
+ force_quick_gelu: bool = False,
80
+ openai_model_cache_dir: str = os.path.expanduser(f"{CACHE_DIR}/clip"),
81
+ skip_params=True,
82
+ pretrained_audio: str = "",
83
+ pretrained_text: str = "",
84
+ enable_fusion: bool = False,
85
+ fusion_type: str = "None"
86
+ # pretrained_image: bool = False,
87
+ ):
88
+ amodel_name = amodel_name.replace(
89
+ "/", "-"
90
+ ) # for callers using old naming with / in ViT names
91
+ pretrained_orig = pretrained
92
+ pretrained = pretrained.lower()
93
+ if pretrained == "openai":
94
+ if amodel_name in _MODEL_CONFIGS:
95
+ logging.info(f"Loading {amodel_name} model config.")
96
+ model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
97
+ else:
98
+ logging.error(
99
+ f"Model config for {amodel_name} not found; available models {list_models()}."
100
+ )
101
+ raise RuntimeError(f"Model config for {amodel_name} not found.")
102
+
103
+ logging.info(f"Loading pretrained ViT-B-16 text encoder from OpenAI.")
104
+ # Hard Code in model name
105
+ model_cfg["text_cfg"]["model_type"] = tmodel_name
106
+ model = load_openai_model(
107
+ "ViT-B-16",
108
+ model_cfg,
109
+ device=device,
110
+ jit=jit,
111
+ cache_dir=openai_model_cache_dir,
112
+ enable_fusion=enable_fusion,
113
+ fusion_type=fusion_type,
114
+ )
115
+ # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
116
+ if precision == "amp" or precision == "fp32":
117
+ model = model.float()
118
+ else:
119
+ if amodel_name in _MODEL_CONFIGS:
120
+ logging.info(f"Loading {amodel_name} model config.")
121
+ model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
122
+ else:
123
+ logging.error(
124
+ f"Model config for {amodel_name} not found; available models {list_models()}."
125
+ )
126
+ raise RuntimeError(f"Model config for {amodel_name} not found.")
127
+
128
+ if force_quick_gelu:
129
+ # override for use of QuickGELU on non-OpenAI transformer models
130
+ model_cfg["quick_gelu"] = True
131
+
132
+ # if pretrained_image:
133
+ # if 'timm_amodel_name' in model_cfg.get('vision_cfg', {}):
134
+ # # pretrained weight loading for timm models set via vision_cfg
135
+ # model_cfg['vision_cfg']['timm_model_pretrained'] = True
136
+ # else:
137
+ # assert False, 'pretrained image towers currently only supported for timm models'
138
+ model_cfg["text_cfg"]["model_type"] = tmodel_name
139
+ model_cfg["enable_fusion"] = enable_fusion
140
+ model_cfg["fusion_type"] = fusion_type
141
+ model = CLAP(**model_cfg)
142
+
143
+ if pretrained:
144
+ checkpoint_path = ""
145
+ url = get_pretrained_url(amodel_name, pretrained)
146
+ if url:
147
+ checkpoint_path = download_pretrained(url, root=openai_model_cache_dir)
148
+ elif os.path.exists(pretrained_orig):
149
+ checkpoint_path = pretrained_orig
150
+ if checkpoint_path:
151
+ logging.info(
152
+ f"Loading pretrained {amodel_name}-{tmodel_name} weights ({pretrained})."
153
+ )
154
+ ckpt = load_state_dict(checkpoint_path, skip_params=True)
155
+ model.load_state_dict(ckpt)
156
+ param_names = [n for n, p in model.named_parameters()]
157
+ # for n in param_names:
158
+ # print(n, "\t", "Loaded" if n in ckpt else "Unloaded")
159
+ else:
160
+ logging.warning(
161
+ f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
162
+ )
163
+ raise RuntimeError(
164
+ f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
165
+ )
166
+
167
+ if pretrained_audio:
168
+ if amodel_name.startswith("PANN"):
169
+ if "Cnn14_mAP" in pretrained_audio: # official checkpoint
170
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
171
+ audio_ckpt = audio_ckpt["model"]
172
+ keys = list(audio_ckpt.keys())
173
+ for key in keys:
174
+ if (
175
+ "spectrogram_extractor" not in key
176
+ and "logmel_extractor" not in key
177
+ ):
178
+ v = audio_ckpt.pop(key)
179
+ audio_ckpt["audio_branch." + key] = v
180
+ elif os.path.basename(pretrained_audio).startswith(
181
+ "PANN"
182
+ ): # checkpoint trained via HTSAT codebase
183
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
184
+ audio_ckpt = audio_ckpt["state_dict"]
185
+ keys = list(audio_ckpt.keys())
186
+ for key in keys:
187
+ if key.startswith("sed_model"):
188
+ v = audio_ckpt.pop(key)
189
+ audio_ckpt["audio_branch." + key[10:]] = v
190
+ elif os.path.basename(pretrained_audio).startswith(
191
+ "finetuned"
192
+ ): # checkpoint trained via linear probe codebase
193
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
194
+ else:
195
+ raise ValueError("Unknown audio checkpoint")
196
+ elif amodel_name.startswith("HTSAT"):
197
+ if "HTSAT_AudioSet_Saved" in pretrained_audio: # official checkpoint
198
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
199
+ audio_ckpt = audio_ckpt["state_dict"]
200
+ keys = list(audio_ckpt.keys())
201
+ for key in keys:
202
+ if key.startswith("sed_model") and (
203
+ "spectrogram_extractor" not in key
204
+ and "logmel_extractor" not in key
205
+ ):
206
+ v = audio_ckpt.pop(key)
207
+ audio_ckpt["audio_branch." + key[10:]] = v
208
+ elif os.path.basename(pretrained_audio).startswith(
209
+ "HTSAT"
210
+ ): # checkpoint trained via HTSAT codebase
211
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
212
+ audio_ckpt = audio_ckpt["state_dict"]
213
+ keys = list(audio_ckpt.keys())
214
+ for key in keys:
215
+ if key.startswith("sed_model"):
216
+ v = audio_ckpt.pop(key)
217
+ audio_ckpt["audio_branch." + key[10:]] = v
218
+ elif os.path.basename(pretrained_audio).startswith(
219
+ "finetuned"
220
+ ): # checkpoint trained via linear probe codebase
221
+ audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
222
+ else:
223
+ raise ValueError("Unknown audio checkpoint")
224
+ else:
225
+ raise f"this audio encoder pretrained checkpoint is not support"
226
+
227
+ model.load_state_dict(audio_ckpt, strict=False)
228
+ logging.info(
229
+ f"Loading pretrained {amodel_name} weights ({pretrained_audio})."
230
+ )
231
+ param_names = [n for n, p in model.named_parameters()]
232
+ for n in param_names:
233
+ print(n, "\t", "Loaded" if n in audio_ckpt else "Unloaded")
234
+
235
+ model.to(device=device)
236
+ if precision == "fp16":
237
+ assert device.type != "cpu"
238
+ convert_weights_to_fp16(model)
239
+
240
+ if jit:
241
+ model = torch.jit.script(model)
242
+
243
+ return model, model_cfg
244
+
245
+
246
+ def create_model_and_transforms(
247
+ model_name: str,
248
+ pretrained: str = "",
249
+ precision: str = "fp32",
250
+ device: torch.device = torch.device("cpu"),
251
+ jit: bool = False,
252
+ force_quick_gelu: bool = False,
253
+ # pretrained_image: bool = False,
254
+ ):
255
+ model = create_model(
256
+ model_name,
257
+ pretrained,
258
+ precision,
259
+ device,
260
+ jit,
261
+ force_quick_gelu=force_quick_gelu,
262
+ # pretrained_image=pretrained_image
263
+ )
264
+ preprocess_train = image_transform(model.visual.image_size, is_train=True)
265
+ preprocess_val = image_transform(model.visual.image_size, is_train=False)
266
+ return model, preprocess_train, preprocess_val
267
+
268
+
269
+ def list_models():
270
+ """enumerate available model architectures based on config files"""
271
+ return list(_MODEL_CONFIGS.keys())
272
+
273
+
274
+ def add_model_config(path):
275
+ """add model config path or file and update registry"""
276
+ if not isinstance(path, Path):
277
+ path = Path(path)
278
+ _MODEL_CONFIG_PATHS.append(path)
279
+ _rescan_model_configs()
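The call used elsewhere in this upload (see encoders.py above) looks roughly like the sketch below; the checkpoint path is a placeholder for real pretrained CLAP weights:

# Hypothetical create_model call mirroring CLAPAudioEmbeddingClassifierFreev2.
import torch

model, model_cfg = create_model(
    "HTSAT-tiny",              # audio tower config name from model_configs/
    "roberta",                 # text tower
    "clap.ckpt",               # placeholder checkpoint path
    precision="fp32",
    device=torch.device("cpu"),
    enable_fusion=False,
    fusion_type="aff_2d",
)
model.eval()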
src/audioldm/clap/open_clip/feature_fusion.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Feature Fusion for Variable-Length Data Processing
3
+ AFF/iAFF are adapted from https://github.com/YimianDai/open-aff/blob/master/aff_pytorch/aff_net/fusion.py
4
+ According to the paper: Yimian Dai et al, Attentional Feature Fusion, IEEE Winter Conference on Applications of Computer Vision, WACV 2021
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
+ class DAF(nn.Module):
12
+ """
13
+ Direct addition (DirectAddFuse)
14
+ """
15
+
16
+ def __init__(self):
17
+ super(DAF, self).__init__()
18
+
19
+ def forward(self, x, residual):
20
+ return x + residual
21
+
22
+
23
+ class iAFF(nn.Module):
24
+ """
25
+ Multi-feature fusion (iAFF)
26
+ """
27
+
28
+ def __init__(self, channels=64, r=4, type="2D"):
29
+ super(iAFF, self).__init__()
30
+ inter_channels = int(channels // r)
31
+
32
+ if type == "1D":
33
+ # local attention
34
+ self.local_att = nn.Sequential(
35
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
36
+ nn.BatchNorm1d(inter_channels),
37
+ nn.ReLU(inplace=True),
38
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
39
+ nn.BatchNorm1d(channels),
40
+ )
41
+
42
+ # global attention
43
+ self.global_att = nn.Sequential(
44
+ nn.AdaptiveAvgPool1d(1),
45
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
46
+ nn.BatchNorm1d(inter_channels),
47
+ nn.ReLU(inplace=True),
48
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
49
+ nn.BatchNorm1d(channels),
50
+ )
51
+
52
+ # second local attention
53
+ self.local_att2 = nn.Sequential(
54
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
55
+ nn.BatchNorm1d(inter_channels),
56
+ nn.ReLU(inplace=True),
57
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
58
+ nn.BatchNorm1d(channels),
59
+ )
60
+ # second global attention
61
+ self.global_att2 = nn.Sequential(
62
+ nn.AdaptiveAvgPool1d(1),
63
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
64
+ nn.BatchNorm1d(inter_channels),
65
+ nn.ReLU(inplace=True),
66
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
67
+ nn.BatchNorm1d(channels),
68
+ )
69
+ elif type == "2D":
70
+ # local attention
71
+ self.local_att = nn.Sequential(
72
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
73
+ nn.BatchNorm2d(inter_channels),
74
+ nn.ReLU(inplace=True),
75
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
76
+ nn.BatchNorm2d(channels),
77
+ )
78
+
79
+ # global attention
80
+ self.global_att = nn.Sequential(
81
+ nn.AdaptiveAvgPool2d(1),
82
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
83
+ nn.BatchNorm2d(inter_channels),
84
+ nn.ReLU(inplace=True),
85
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
86
+ nn.BatchNorm2d(channels),
87
+ )
88
+
89
+ # second local attention
90
+ self.local_att2 = nn.Sequential(
91
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
92
+ nn.BatchNorm2d(inter_channels),
93
+ nn.ReLU(inplace=True),
94
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
95
+ nn.BatchNorm2d(channels),
96
+ )
97
+ # second global attention
98
+ self.global_att2 = nn.Sequential(
99
+ nn.AdaptiveAvgPool2d(1),
100
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
101
+ nn.BatchNorm2d(inter_channels),
102
+ nn.ReLU(inplace=True),
103
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
104
+ nn.BatchNorm2d(channels),
105
+ )
106
+ else:
107
+ raise f"the type is not supported"
108
+
109
+ self.sigmoid = nn.Sigmoid()
110
+
111
+ def forward(self, x, residual):
112
+ flag = False
113
+ xa = x + residual
114
+ if xa.size(0) == 1:
115
+ xa = torch.cat([xa, xa], dim=0)
116
+ flag = True
117
+ xl = self.local_att(xa)
118
+ xg = self.global_att(xa)
119
+ xlg = xl + xg
120
+ wei = self.sigmoid(xlg)
121
+ xi = x * wei + residual * (1 - wei)
122
+
123
+ xl2 = self.local_att2(xi)
124
+ xg2 = self.global_att(xi)
125
+ xlg2 = xl2 + xg2
126
+ wei2 = self.sigmoid(xlg2)
127
+ xo = x * wei2 + residual * (1 - wei2)
128
+ if flag:
129
+ xo = xo[0].unsqueeze(0)
130
+ return xo
131
+
132
+
133
+ class AFF(nn.Module):
134
+ """
135
+ Multi-feature fusion (AFF)
136
+ """
137
+
138
+ def __init__(self, channels=64, r=4, type="2D"):
139
+ super(AFF, self).__init__()
140
+ inter_channels = int(channels // r)
141
+
142
+ if type == "1D":
143
+ self.local_att = nn.Sequential(
144
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
145
+ nn.BatchNorm1d(inter_channels),
146
+ nn.ReLU(inplace=True),
147
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
148
+ nn.BatchNorm1d(channels),
149
+ )
150
+ self.global_att = nn.Sequential(
151
+ nn.AdaptiveAvgPool1d(1),
152
+ nn.Conv1d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
153
+ nn.BatchNorm1d(inter_channels),
154
+ nn.ReLU(inplace=True),
155
+ nn.Conv1d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
156
+ nn.BatchNorm1d(channels),
157
+ )
158
+ elif type == "2D":
159
+ self.local_att = nn.Sequential(
160
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
161
+ nn.BatchNorm2d(inter_channels),
162
+ nn.ReLU(inplace=True),
163
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
164
+ nn.BatchNorm2d(channels),
165
+ )
166
+ self.global_att = nn.Sequential(
167
+ nn.AdaptiveAvgPool2d(1),
168
+ nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
169
+ nn.BatchNorm2d(inter_channels),
170
+ nn.ReLU(inplace=True),
171
+ nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
172
+ nn.BatchNorm2d(channels),
173
+ )
174
+ else:
175
+ raise f"the type is not supported."
176
+
177
+ self.sigmoid = nn.Sigmoid()
178
+
179
+ def forward(self, x, residual):
180
+ flag = False
181
+ xa = x + residual
182
+ if xa.size(0) == 1:
183
+ xa = torch.cat([xa, xa], dim=0)
184
+ flag = True
185
+ xl = self.local_att(xa)
186
+ xg = self.global_att(xa)
187
+ xlg = xl + xg
188
+ wei = self.sigmoid(xlg)
189
+ xo = 2 * x * wei + 2 * residual * (1 - wei)
190
+ if flag:
191
+ xo = xo[0].unsqueeze(0)
192
+ return xo
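A small shape check for the 2D attentional fusion defined above, with illustrative tensors (channel count, batch size, and spatial size are arbitrary):

# Illustrative fusion of two feature maps with AFF.
import torch

aff = AFF(channels=64, r=4, type="2D")
x = torch.randn(2, 64, 32, 32)         # main-branch features
residual = torch.randn(2, 64, 32, 32)  # residual-branch features
out = aff(x, residual)
print(out.shape)  # torch.Size([2, 64, 32, 32]), same shape as the inputs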
src/audioldm/clap/open_clip/htsat.py ADDED
@@ -0,0 +1,1308 @@
1
+ # Ke Chen
2
3
+ # HTS-AT: A HIERARCHICAL TOKEN-SEMANTIC AUDIO TRANSFORMER FOR SOUND CLASSIFICATION AND DETECTION
4
+ # Some layers designed on the model
5
+ # below codes are based and referred from https://github.com/microsoft/Swin-Transformer
6
+ # Swin Transformer for Computer Vision: https://arxiv.org/pdf/2103.14030.pdf
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from itertools import repeat
12
+ import collections.abc
13
+ import math
14
+ import warnings
15
+
16
+ from torch.nn.init import _calculate_fan_in_and_fan_out
17
+ import torch.utils.checkpoint as checkpoint
18
+
19
+ import random
20
+
21
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
22
+ from torchlibrosa.augmentation import SpecAugmentation
23
+
24
+ from itertools import repeat
25
+ from .utils import do_mixup, interpolate
26
+
27
+ from .feature_fusion import iAFF, AFF, DAF
28
+
29
+ # from PyTorch internals
30
+ def _ntuple(n):
31
+ def parse(x):
32
+ if isinstance(x, collections.abc.Iterable):
33
+ return x
34
+ return tuple(repeat(x, n))
35
+
36
+ return parse
37
+
38
+
39
+ to_1tuple = _ntuple(1)
40
+ to_2tuple = _ntuple(2)
41
+ to_3tuple = _ntuple(3)
42
+ to_4tuple = _ntuple(4)
43
+ to_ntuple = _ntuple
44
+
45
+
46
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
47
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
48
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
49
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
50
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
51
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
52
+ 'survival rate' as the argument.
53
+ """
54
+ if drop_prob == 0.0 or not training:
55
+ return x
56
+ keep_prob = 1 - drop_prob
57
+ shape = (x.shape[0],) + (1,) * (
58
+ x.ndim - 1
59
+ ) # work with diff dim tensors, not just 2D ConvNets
60
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
61
+ random_tensor.floor_() # binarize
62
+ output = x.div(keep_prob) * random_tensor
63
+ return output
64
+
65
+
66
+ class DropPath(nn.Module):
67
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
68
+
69
+ def __init__(self, drop_prob=None):
70
+ super(DropPath, self).__init__()
71
+ self.drop_prob = drop_prob
72
+
73
+ def forward(self, x):
74
+ return drop_path(x, self.drop_prob, self.training)
75
+
76
+
77
+ class PatchEmbed(nn.Module):
78
+ """2D Image to Patch Embedding"""
79
+
80
+ def __init__(
81
+ self,
82
+ img_size=224,
83
+ patch_size=16,
84
+ in_chans=3,
85
+ embed_dim=768,
86
+ norm_layer=None,
87
+ flatten=True,
88
+ patch_stride=16,
89
+ enable_fusion=False,
90
+ fusion_type="None",
91
+ ):
92
+ super().__init__()
93
+ img_size = to_2tuple(img_size)
94
+ patch_size = to_2tuple(patch_size)
95
+ patch_stride = to_2tuple(patch_stride)
96
+ self.img_size = img_size
97
+ self.patch_size = patch_size
98
+ self.patch_stride = patch_stride
99
+ self.grid_size = (
100
+ img_size[0] // patch_stride[0],
101
+ img_size[1] // patch_stride[1],
102
+ )
103
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
104
+ self.flatten = flatten
105
+ self.in_chans = in_chans
106
+ self.embed_dim = embed_dim
107
+
108
+ self.enable_fusion = enable_fusion
109
+ self.fusion_type = fusion_type
110
+
111
+ padding = (
112
+ (patch_size[0] - patch_stride[0]) // 2,
113
+ (patch_size[1] - patch_stride[1]) // 2,
114
+ )
115
+
116
+ if (self.enable_fusion) and (self.fusion_type == "channel_map"):
117
+ self.proj = nn.Conv2d(
118
+ in_chans * 4,
119
+ embed_dim,
120
+ kernel_size=patch_size,
121
+ stride=patch_stride,
122
+ padding=padding,
123
+ )
124
+ else:
125
+ self.proj = nn.Conv2d(
126
+ in_chans,
127
+ embed_dim,
128
+ kernel_size=patch_size,
129
+ stride=patch_stride,
130
+ padding=padding,
131
+ )
132
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
133
+
134
+ if (self.enable_fusion) and (
135
+ self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d"]
136
+ ):
137
+ self.mel_conv2d = nn.Conv2d(
138
+ in_chans,
139
+ embed_dim,
140
+ kernel_size=(patch_size[0], patch_size[1] * 3),
141
+ stride=(patch_stride[0], patch_stride[1] * 3),
142
+ padding=padding,
143
+ )
144
+ if self.fusion_type == "daf_2d":
145
+ self.fusion_model = DAF()
146
+ elif self.fusion_type == "aff_2d":
147
+ self.fusion_model = AFF(channels=embed_dim, type="2D")
148
+ elif self.fusion_type == "iaff_2d":
149
+ self.fusion_model = iAFF(channels=embed_dim, type="2D")
150
+
151
+ def forward(self, x, longer_idx=None):
152
+ if (self.enable_fusion) and (
153
+ self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d"]
154
+ ):
155
+ global_x = x[:, 0:1, :, :]
156
+
157
+ # global processing
158
+ B, C, H, W = global_x.shape
159
+ assert (
160
+ H == self.img_size[0] and W == self.img_size[1]
161
+ ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
162
+ global_x = self.proj(global_x)
163
+ TW = global_x.size(-1)
164
+ if len(longer_idx) > 0:
165
+ # local processing
166
+ local_x = x[longer_idx, 1:, :, :].contiguous()
167
+ B, C, H, W = local_x.shape
168
+ local_x = local_x.view(B * C, 1, H, W)
169
+ local_x = self.mel_conv2d(local_x)
170
+ local_x = local_x.view(
171
+ B, C, local_x.size(1), local_x.size(2), local_x.size(3)
172
+ )
173
+ local_x = local_x.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)
174
+ TB, TC, TH, _ = local_x.size()
175
+ if local_x.size(-1) < TW:
176
+ local_x = torch.cat(
177
+ [
178
+ local_x,
179
+ torch.zeros(
180
+ (TB, TC, TH, TW - local_x.size(-1)),
181
+ device=global_x.device,
182
+ ),
183
+ ],
184
+ dim=-1,
185
+ )
186
+ else:
187
+ local_x = local_x[:, :, :, :TW]
188
+
189
+ global_x[longer_idx] = self.fusion_model(global_x[longer_idx], local_x)
190
+ x = global_x
191
+ else:
192
+ B, C, H, W = x.shape
193
+ assert (
194
+ H == self.img_size[0] and W == self.img_size[1]
195
+ ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
196
+ x = self.proj(x)
197
+
198
+ if self.flatten:
199
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
200
+ x = self.norm(x)
201
+ return x
202
+
203
+
204
+ class Mlp(nn.Module):
205
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
206
+
207
+ def __init__(
208
+ self,
209
+ in_features,
210
+ hidden_features=None,
211
+ out_features=None,
212
+ act_layer=nn.GELU,
213
+ drop=0.0,
214
+ ):
215
+ super().__init__()
216
+ out_features = out_features or in_features
217
+ hidden_features = hidden_features or in_features
218
+ self.fc1 = nn.Linear(in_features, hidden_features)
219
+ self.act = act_layer()
220
+ self.fc2 = nn.Linear(hidden_features, out_features)
221
+ self.drop = nn.Dropout(drop)
222
+
223
+ def forward(self, x):
224
+ x = self.fc1(x)
225
+ x = self.act(x)
226
+ x = self.drop(x)
227
+ x = self.fc2(x)
228
+ x = self.drop(x)
229
+ return x
230
+
231
+
232
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
233
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
234
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
235
+ def norm_cdf(x):
236
+ # Computes standard normal cumulative distribution function
237
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
238
+
239
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
240
+ warnings.warn(
241
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
242
+ "The distribution of values may be incorrect.",
243
+ stacklevel=2,
244
+ )
245
+
246
+ with torch.no_grad():
247
+ # Values are generated by using a truncated uniform distribution and
248
+ # then using the inverse CDF for the normal distribution.
249
+ # Get upper and lower cdf values
250
+ l = norm_cdf((a - mean) / std)
251
+ u = norm_cdf((b - mean) / std)
252
+
253
+ # Uniformly fill tensor with values from [l, u], then translate to
254
+ # [2l-1, 2u-1].
255
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
256
+
257
+ # Use inverse cdf transform for normal distribution to get truncated
258
+ # standard normal
259
+ tensor.erfinv_()
260
+
261
+ # Transform to proper mean, std
262
+ tensor.mul_(std * math.sqrt(2.0))
263
+ tensor.add_(mean)
264
+
265
+ # Clamp to ensure it's in the proper range
266
+ tensor.clamp_(min=a, max=b)
267
+ return tensor
268
+
269
+
270
+ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
271
+ # type: (Tensor, float, float, float, float) -> Tensor
272
+ r"""Fills the input Tensor with values drawn from a truncated
273
+ normal distribution. The values are effectively drawn from the
274
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
275
+ with values outside :math:`[a, b]` redrawn until they are within
276
+ the bounds. The method used for generating the random values works
277
+ best when :math:`a \leq \text{mean} \leq b`.
278
+ Args:
279
+ tensor: an n-dimensional `torch.Tensor`
280
+ mean: the mean of the normal distribution
281
+ std: the standard deviation of the normal distribution
282
+ a: the minimum cutoff value
283
+ b: the maximum cutoff value
284
+ Examples:
285
+ >>> w = torch.empty(3, 5)
286
+ >>> nn.init.trunc_normal_(w)
287
+ """
288
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
289
+
290
+
291
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
292
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
293
+ if mode == "fan_in":
294
+ denom = fan_in
295
+ elif mode == "fan_out":
296
+ denom = fan_out
297
+ elif mode == "fan_avg":
298
+ denom = (fan_in + fan_out) / 2
299
+
300
+ variance = scale / denom
301
+
302
+ if distribution == "truncated_normal":
303
+ # constant is stddev of standard normal truncated to (-2, 2)
304
+ trunc_normal_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
305
+ elif distribution == "normal":
306
+ tensor.normal_(std=math.sqrt(variance))
307
+ elif distribution == "uniform":
308
+ bound = math.sqrt(3 * variance)
309
+ tensor.uniform_(-bound, bound)
310
+ else:
311
+ raise ValueError(f"invalid distribution {distribution}")
312
+
313
+
314
+ def lecun_normal_(tensor):
315
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
316
+
317
+
318
+ def window_partition(x, window_size):
319
+ """
320
+ Args:
321
+ x: (B, H, W, C)
322
+ window_size (int): window size
323
+ Returns:
324
+ windows: (num_windows*B, window_size, window_size, C)
325
+ """
326
+ B, H, W, C = x.shape
327
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
328
+ windows = (
329
+ x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
330
+ )
331
+ return windows
332
+
333
+
334
+ def window_reverse(windows, window_size, H, W):
335
+ """
336
+ Args:
337
+ windows: (num_windows*B, window_size, window_size, C)
338
+ window_size (int): Window size
339
+ H (int): Height of image
340
+ W (int): Width of image
341
+ Returns:
342
+ x: (B, H, W, C)
343
+ """
344
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
345
+ x = windows.view(
346
+ B, H // window_size, W // window_size, window_size, window_size, -1
347
+ )
348
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
349
+ return x
350
+
351
+
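A small round-trip sketch (illustrative sizes, relying on the torch import already used in this file) of what window_partition and window_reverse above do to a feature map:

x = torch.randn(2, 8, 8, 96)                          # (B, H, W, C)
windows = window_partition(x, window_size=4)          # 4 windows per item -> (8, 4, 4, 96)
y = window_reverse(windows, window_size=4, H=8, W=8)  # back to (2, 8, 8, 96)
assert torch.allclose(x, y)                           # an exact permutation round trip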
352
+ class WindowAttention(nn.Module):
353
+ r"""Window based multi-head self attention (W-MSA) module with relative position bias.
354
+ It supports both of shifted and non-shifted window.
355
+ Args:
356
+ dim (int): Number of input channels.
357
+ window_size (tuple[int]): The height and width of the window.
358
+ num_heads (int): Number of attention heads.
359
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
360
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
361
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
362
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
363
+ """
364
+
365
+ def __init__(
366
+ self,
367
+ dim,
368
+ window_size,
369
+ num_heads,
370
+ qkv_bias=True,
371
+ qk_scale=None,
372
+ attn_drop=0.0,
373
+ proj_drop=0.0,
374
+ ):
375
+
376
+ super().__init__()
377
+ self.dim = dim
378
+ self.window_size = window_size # Wh, Ww
379
+ self.num_heads = num_heads
380
+ head_dim = dim // num_heads
381
+ self.scale = qk_scale or head_dim**-0.5
382
+
383
+ # define a parameter table of relative position bias
384
+ self.relative_position_bias_table = nn.Parameter(
385
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
386
+ ) # 2*Wh-1 * 2*Ww-1, nH
387
+
388
+ # get pair-wise relative position index for each token inside the window
389
+ coords_h = torch.arange(self.window_size[0])
390
+ coords_w = torch.arange(self.window_size[1])
391
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
392
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
393
+ relative_coords = (
394
+ coords_flatten[:, :, None] - coords_flatten[:, None, :]
395
+ ) # 2, Wh*Ww, Wh*Ww
396
+ relative_coords = relative_coords.permute(
397
+ 1, 2, 0
398
+ ).contiguous() # Wh*Ww, Wh*Ww, 2
399
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
400
+ relative_coords[:, :, 1] += self.window_size[1] - 1
401
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
402
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
403
+ self.register_buffer("relative_position_index", relative_position_index)
404
+
405
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
406
+ self.attn_drop = nn.Dropout(attn_drop)
407
+ self.proj = nn.Linear(dim, dim)
408
+ self.proj_drop = nn.Dropout(proj_drop)
409
+
410
+ trunc_normal_(self.relative_position_bias_table, std=0.02)
411
+ self.softmax = nn.Softmax(dim=-1)
412
+
413
+ def forward(self, x, mask=None):
414
+ """
415
+ Args:
416
+ x: input features with shape of (num_windows*B, N, C)
417
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
418
+ """
419
+ B_, N, C = x.shape
420
+ qkv = (
421
+ self.qkv(x)
422
+ .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
423
+ .permute(2, 0, 3, 1, 4)
424
+ )
425
+ q, k, v = (
426
+ qkv[0],
427
+ qkv[1],
428
+ qkv[2],
429
+ ) # make torchscript happy (cannot use tensor as tuple)
430
+
431
+ q = q * self.scale
432
+ attn = q @ k.transpose(-2, -1)
433
+
434
+ relative_position_bias = self.relative_position_bias_table[
435
+ self.relative_position_index.view(-1)
436
+ ].view(
437
+ self.window_size[0] * self.window_size[1],
438
+ self.window_size[0] * self.window_size[1],
439
+ -1,
440
+ ) # Wh*Ww,Wh*Ww,nH
441
+ relative_position_bias = relative_position_bias.permute(
442
+ 2, 0, 1
443
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
444
+ attn = attn + relative_position_bias.unsqueeze(0)
445
+
446
+ if mask is not None:
447
+ nW = mask.shape[0]
448
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
449
+ 1
450
+ ).unsqueeze(0)
451
+ attn = attn.view(-1, self.num_heads, N, N)
452
+ attn = self.softmax(attn)
453
+ else:
454
+ attn = self.softmax(attn)
455
+
456
+ attn = self.attn_drop(attn)
457
+
458
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
459
+ x = self.proj(x)
460
+ x = self.proj_drop(x)
461
+ return x, attn
462
+
463
+ def extra_repr(self):
464
+ return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"
465
+
466
+
467
+ # The model is built from Swin Transformer blocks, so pretrained Swin Transformer weights can be reused
468
+ class SwinTransformerBlock(nn.Module):
469
+ r"""Swin Transformer Block.
470
+ Args:
471
+ dim (int): Number of input channels.
472
+ input_resolution (tuple[int]): Input resolution.
473
+ num_heads (int): Number of attention heads.
474
+ window_size (int): Window size.
475
+ shift_size (int): Shift size for SW-MSA.
476
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
477
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
478
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
479
+ drop (float, optional): Dropout rate. Default: 0.0
480
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
481
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
482
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
483
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
484
+ """
485
+
486
+ def __init__(
487
+ self,
488
+ dim,
489
+ input_resolution,
490
+ num_heads,
491
+ window_size=7,
492
+ shift_size=0,
493
+ mlp_ratio=4.0,
494
+ qkv_bias=True,
495
+ qk_scale=None,
496
+ drop=0.0,
497
+ attn_drop=0.0,
498
+ drop_path=0.0,
499
+ act_layer=nn.GELU,
500
+ norm_layer=nn.LayerNorm,
501
+ norm_before_mlp="ln",
502
+ ):
503
+ super().__init__()
504
+ self.dim = dim
505
+ self.input_resolution = input_resolution
506
+ self.num_heads = num_heads
507
+ self.window_size = window_size
508
+ self.shift_size = shift_size
509
+ self.mlp_ratio = mlp_ratio
510
+ self.norm_before_mlp = norm_before_mlp
511
+ if min(self.input_resolution) <= self.window_size:
512
+ # if window size is larger than input resolution, we don't partition windows
513
+ self.shift_size = 0
514
+ self.window_size = min(self.input_resolution)
515
+ assert (
516
+ 0 <= self.shift_size < self.window_size
517
+ ), "shift_size must be in [0, window_size)"
518
+
519
+ self.norm1 = norm_layer(dim)
520
+ self.attn = WindowAttention(
521
+ dim,
522
+ window_size=to_2tuple(self.window_size),
523
+ num_heads=num_heads,
524
+ qkv_bias=qkv_bias,
525
+ qk_scale=qk_scale,
526
+ attn_drop=attn_drop,
527
+ proj_drop=drop,
528
+ )
529
+
530
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
531
+ if self.norm_before_mlp == "ln":
532
+ self.norm2 = nn.LayerNorm(dim)
533
+ elif self.norm_before_mlp == "bn":
534
+ self.norm2 = lambda x: nn.BatchNorm1d(dim)(x.transpose(1, 2)).transpose(
535
+ 1, 2
536
+ )
537
+ else:
538
+ raise NotImplementedError
539
+ mlp_hidden_dim = int(dim * mlp_ratio)
540
+ self.mlp = Mlp(
541
+ in_features=dim,
542
+ hidden_features=mlp_hidden_dim,
543
+ act_layer=act_layer,
544
+ drop=drop,
545
+ )
546
+
547
+ if self.shift_size > 0:
548
+ # calculate attention mask for SW-MSA
549
+ H, W = self.input_resolution
550
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
551
+ h_slices = (
552
+ slice(0, -self.window_size),
553
+ slice(-self.window_size, -self.shift_size),
554
+ slice(-self.shift_size, None),
555
+ )
556
+ w_slices = (
557
+ slice(0, -self.window_size),
558
+ slice(-self.window_size, -self.shift_size),
559
+ slice(-self.shift_size, None),
560
+ )
561
+ cnt = 0
562
+ for h in h_slices:
563
+ for w in w_slices:
564
+ img_mask[:, h, w, :] = cnt
565
+ cnt += 1
566
+
567
+ mask_windows = window_partition(
568
+ img_mask, self.window_size
569
+ ) # nW, window_size, window_size, 1
570
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
571
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
572
+ attn_mask = attn_mask.masked_fill(
573
+ attn_mask != 0, float(-100.0)
574
+ ).masked_fill(attn_mask == 0, float(0.0))
575
+ else:
576
+ attn_mask = None
577
+
578
+ self.register_buffer("attn_mask", attn_mask)
579
+
580
+ def forward(self, x):
581
+ # pdb.set_trace()
582
+ H, W = self.input_resolution
583
+ # print("H: ", H)
584
+ # print("W: ", W)
585
+ # pdb.set_trace()
586
+ B, L, C = x.shape
587
+ # assert L == H * W, "input feature has wrong size"
588
+
589
+ shortcut = x
590
+ x = self.norm1(x)
591
+ x = x.view(B, H, W, C)
592
+
593
+ # cyclic shift
594
+ if self.shift_size > 0:
595
+ shifted_x = torch.roll(
596
+ x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
597
+ )
598
+ else:
599
+ shifted_x = x
600
+
601
+ # partition windows
602
+ x_windows = window_partition(
603
+ shifted_x, self.window_size
604
+ ) # nW*B, window_size, window_size, C
605
+ x_windows = x_windows.view(
606
+ -1, self.window_size * self.window_size, C
607
+ ) # nW*B, window_size*window_size, C
608
+
609
+ # W-MSA/SW-MSA
610
+ attn_windows, attn = self.attn(
611
+ x_windows, mask=self.attn_mask
612
+ ) # nW*B, window_size*window_size, C
613
+
614
+ # merge windows
615
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
616
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
617
+
618
+ # reverse cyclic shift
619
+ if self.shift_size > 0:
620
+ x = torch.roll(
621
+ shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
622
+ )
623
+ else:
624
+ x = shifted_x
625
+ x = x.view(B, H * W, C)
626
+
627
+ # FFN
628
+ x = shortcut + self.drop_path(x)
629
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
630
+
631
+ return x, attn
632
+
633
+ def extra_repr(self):
634
+ return (
635
+ f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
636
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
637
+ )
638
+
639
+
640
+ class PatchMerging(nn.Module):
641
+ r"""Patch Merging Layer.
642
+ Args:
643
+ input_resolution (tuple[int]): Resolution of input feature.
644
+ dim (int): Number of input channels.
645
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
646
+ """
647
+
648
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
649
+ super().__init__()
650
+ self.input_resolution = input_resolution
651
+ self.dim = dim
652
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
653
+ self.norm = norm_layer(4 * dim)
654
+
655
+ def forward(self, x):
656
+ """
657
+ x: B, H*W, C
658
+ """
659
+ H, W = self.input_resolution
660
+ B, L, C = x.shape
661
+ assert L == H * W, "input feature has wrong size"
662
+ assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
663
+
664
+ x = x.view(B, H, W, C)
665
+
666
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
667
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
668
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
669
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
670
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
671
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
672
+
673
+ x = self.norm(x)
674
+ x = self.reduction(x)
675
+
676
+ return x
677
+
678
+ def extra_repr(self):
679
+ return f"input_resolution={self.input_resolution}, dim={self.dim}"
680
+
681
+
682
+ class BasicLayer(nn.Module):
683
+ """A basic Swin Transformer layer for one stage.
684
+ Args:
685
+ dim (int): Number of input channels.
686
+ input_resolution (tuple[int]): Input resolution.
687
+ depth (int): Number of blocks.
688
+ num_heads (int): Number of attention heads.
689
+ window_size (int): Local window size.
690
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
691
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
692
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
693
+ drop (float, optional): Dropout rate. Default: 0.0
694
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
695
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
696
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
697
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
698
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
699
+ """
700
+
701
+ def __init__(
702
+ self,
703
+ dim,
704
+ input_resolution,
705
+ depth,
706
+ num_heads,
707
+ window_size,
708
+ mlp_ratio=4.0,
709
+ qkv_bias=True,
710
+ qk_scale=None,
711
+ drop=0.0,
712
+ attn_drop=0.0,
713
+ drop_path=0.0,
714
+ norm_layer=nn.LayerNorm,
715
+ downsample=None,
716
+ use_checkpoint=False,
717
+ norm_before_mlp="ln",
718
+ ):
719
+
720
+ super().__init__()
721
+ self.dim = dim
722
+ self.input_resolution = input_resolution
723
+ self.depth = depth
724
+ self.use_checkpoint = use_checkpoint
725
+
726
+ # build blocks
727
+ self.blocks = nn.ModuleList(
728
+ [
729
+ SwinTransformerBlock(
730
+ dim=dim,
731
+ input_resolution=input_resolution,
732
+ num_heads=num_heads,
733
+ window_size=window_size,
734
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
735
+ mlp_ratio=mlp_ratio,
736
+ qkv_bias=qkv_bias,
737
+ qk_scale=qk_scale,
738
+ drop=drop,
739
+ attn_drop=attn_drop,
740
+ drop_path=drop_path[i]
741
+ if isinstance(drop_path, list)
742
+ else drop_path,
743
+ norm_layer=norm_layer,
744
+ norm_before_mlp=norm_before_mlp,
745
+ )
746
+ for i in range(depth)
747
+ ]
748
+ )
749
+
750
+ # patch merging layer
751
+ if downsample is not None:
752
+ self.downsample = downsample(
753
+ input_resolution, dim=dim, norm_layer=norm_layer
754
+ )
755
+ else:
756
+ self.downsample = None
757
+
758
+ def forward(self, x):
759
+ attns = []
760
+ for blk in self.blocks:
761
+ if self.use_checkpoint:
762
+ x, attn = checkpoint.checkpoint(blk, x)
763
+ else:
764
+ x, attn = blk(x)
765
+ if not self.training:
766
+ attns.append(attn.unsqueeze(0))
767
+ if self.downsample is not None:
768
+ x = self.downsample(x)
769
+ if not self.training:
770
+ attn = torch.cat(attns, dim=0)
771
+ attn = torch.mean(attn, dim=0)
772
+ return x, attn
773
+
774
+ def extra_repr(self):
775
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
776
+
777
+
778
+ # The Core of HTSAT
779
+ class HTSAT_Swin_Transformer(nn.Module):
780
+ r"""HTSAT based on the Swin Transformer
781
+ Args:
782
+ spec_size (int | tuple(int)): Input Spectrogram size. Default 256
783
+ patch_size (int | tuple(int)): Patch size. Default: 4
784
+ patch_stride (int | tuple(int)): Patch stride for the frequency and time axes. Default: 4
785
+ in_chans (int): Number of input image channels. Default: 1 (mono)
786
+ num_classes (int): Number of classes for classification head. Default: 527
787
+ embed_dim (int): Patch embedding dimension. Default: 96
788
+ depths (tuple(int)): Depth of each HTSAT-Swin Transformer layer.
789
+ num_heads (tuple(int)): Number of attention heads in different layers.
790
+ window_size (int): Window size. Default: 8
791
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
792
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
793
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
794
+ drop_rate (float): Dropout rate. Default: 0
795
+ attn_drop_rate (float): Attention dropout rate. Default: 0
796
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1
797
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
798
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
799
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True
800
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
801
+ config (module): The configuration Module from config.py
802
+ """
803
+
804
+ def __init__(
805
+ self,
806
+ spec_size=256,
807
+ patch_size=4,
808
+ patch_stride=(4, 4),
809
+ in_chans=1,
810
+ num_classes=527,
811
+ embed_dim=96,
812
+ depths=[2, 2, 6, 2],
813
+ num_heads=[4, 8, 16, 32],
814
+ window_size=8,
815
+ mlp_ratio=4.0,
816
+ qkv_bias=True,
817
+ qk_scale=None,
818
+ drop_rate=0.0,
819
+ attn_drop_rate=0.0,
820
+ drop_path_rate=0.1,
821
+ norm_layer=nn.LayerNorm,
822
+ ape=False,
823
+ patch_norm=True,
824
+ use_checkpoint=False,
825
+ norm_before_mlp="ln",
826
+ config=None,
827
+ enable_fusion=False,
828
+ fusion_type="None",
829
+ **kwargs,
830
+ ):
831
+ super(HTSAT_Swin_Transformer, self).__init__()
832
+
833
+ self.config = config
834
+ self.spec_size = spec_size
835
+ self.patch_stride = patch_stride
836
+ self.patch_size = patch_size
837
+ self.window_size = window_size
838
+ self.embed_dim = embed_dim
839
+ self.depths = depths
840
+ self.ape = ape
841
+ self.in_chans = in_chans
842
+ self.num_classes = num_classes
843
+ self.num_heads = num_heads
844
+ self.num_layers = len(self.depths)
845
+ self.num_features = int(self.embed_dim * 2 ** (self.num_layers - 1))
846
+
847
+ self.drop_rate = drop_rate
848
+ self.attn_drop_rate = attn_drop_rate
849
+ self.drop_path_rate = drop_path_rate
850
+
851
+ self.qkv_bias = qkv_bias
852
+ self.qk_scale = None
853
+
854
+ self.patch_norm = patch_norm
855
+ self.norm_layer = norm_layer if self.patch_norm else None
856
+ self.norm_before_mlp = norm_before_mlp
857
+ self.mlp_ratio = mlp_ratio
858
+
859
+ self.use_checkpoint = use_checkpoint
860
+
861
+ self.enable_fusion = enable_fusion
862
+ self.fusion_type = fusion_type
863
+
864
+ # process mel-spec ; used only once
865
+ self.freq_ratio = self.spec_size // self.config.mel_bins
866
+ window = "hann"
867
+ center = True
868
+ pad_mode = "reflect"
869
+ ref = 1.0
870
+ amin = 1e-10
871
+ top_db = None
872
+ self.interpolate_ratio = 32 # Downsampled ratio
873
+ # Spectrogram extractor
874
+ self.spectrogram_extractor = Spectrogram(
875
+ n_fft=config.window_size,
876
+ hop_length=config.hop_size,
877
+ win_length=config.window_size,
878
+ window=window,
879
+ center=center,
880
+ pad_mode=pad_mode,
881
+ freeze_parameters=True,
882
+ )
883
+ # Logmel feature extractor
884
+ self.logmel_extractor = LogmelFilterBank(
885
+ sr=config.sample_rate,
886
+ n_fft=config.window_size,
887
+ n_mels=config.mel_bins,
888
+ fmin=config.fmin,
889
+ fmax=config.fmax,
890
+ ref=ref,
891
+ amin=amin,
892
+ top_db=top_db,
893
+ freeze_parameters=True,
894
+ )
895
+ # Spec augmenter
896
+ self.spec_augmenter = SpecAugmentation(
897
+ time_drop_width=64,
898
+ time_stripes_num=2,
899
+ freq_drop_width=8,
900
+ freq_stripes_num=2,
901
+ ) # 2 2
902
+ self.bn0 = nn.BatchNorm2d(self.config.mel_bins)
903
+
904
+ # split the spectrogram into non-overlapping patches
905
+ self.patch_embed = PatchEmbed(
906
+ img_size=self.spec_size,
907
+ patch_size=self.patch_size,
908
+ in_chans=self.in_chans,
909
+ embed_dim=self.embed_dim,
910
+ norm_layer=self.norm_layer,
911
+ patch_stride=patch_stride,
912
+ enable_fusion=self.enable_fusion,
913
+ fusion_type=self.fusion_type,
914
+ )
915
+
916
+ num_patches = self.patch_embed.num_patches
917
+ patches_resolution = self.patch_embed.grid_size
918
+ self.patches_resolution = patches_resolution
919
+
920
+ # absolute position embedding
921
+ if self.ape:
922
+ self.absolute_pos_embed = nn.Parameter(
923
+ torch.zeros(1, num_patches, self.embed_dim)
924
+ )
925
+ trunc_normal_(self.absolute_pos_embed, std=0.02)
926
+
927
+ self.pos_drop = nn.Dropout(p=self.drop_rate)
928
+
929
+ # stochastic depth
930
+ dpr = [
931
+ x.item() for x in torch.linspace(0, self.drop_path_rate, sum(self.depths))
932
+ ] # stochastic depth decay rule
933
+
934
+ # build layers
935
+ self.layers = nn.ModuleList()
936
+ for i_layer in range(self.num_layers):
937
+ layer = BasicLayer(
938
+ dim=int(self.embed_dim * 2**i_layer),
939
+ input_resolution=(
940
+ patches_resolution[0] // (2**i_layer),
941
+ patches_resolution[1] // (2**i_layer),
942
+ ),
943
+ depth=self.depths[i_layer],
944
+ num_heads=self.num_heads[i_layer],
945
+ window_size=self.window_size,
946
+ mlp_ratio=self.mlp_ratio,
947
+ qkv_bias=self.qkv_bias,
948
+ qk_scale=self.qk_scale,
949
+ drop=self.drop_rate,
950
+ attn_drop=self.attn_drop_rate,
951
+ drop_path=dpr[
952
+ sum(self.depths[:i_layer]) : sum(self.depths[: i_layer + 1])
953
+ ],
954
+ norm_layer=self.norm_layer,
955
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
956
+ use_checkpoint=use_checkpoint,
957
+ norm_before_mlp=self.norm_before_mlp,
958
+ )
959
+ self.layers.append(layer)
960
+
961
+ self.norm = self.norm_layer(self.num_features)
962
+ self.avgpool = nn.AdaptiveAvgPool1d(1)
963
+ self.maxpool = nn.AdaptiveMaxPool1d(1)
964
+
965
+ SF = (
966
+ self.spec_size
967
+ // (2 ** (len(self.depths) - 1))
968
+ // self.patch_stride[0]
969
+ // self.freq_ratio
970
+ )
971
+ self.tscam_conv = nn.Conv2d(
972
+ in_channels=self.num_features,
973
+ out_channels=self.num_classes,
974
+ kernel_size=(SF, 3),
975
+ padding=(0, 1),
976
+ )
977
+ self.head = nn.Linear(num_classes, num_classes)
978
+
979
+ if (self.enable_fusion) and (
980
+ self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"]
981
+ ):
982
+ self.mel_conv1d = nn.Sequential(
983
+ nn.Conv1d(64, 64, kernel_size=5, stride=3, padding=2),
984
+ nn.BatchNorm1d(64),
985
+ )
986
+ if self.fusion_type == "daf_1d":
987
+ self.fusion_model = DAF()
988
+ elif self.fusion_type == "aff_1d":
989
+ self.fusion_model = AFF(channels=64, type="1D")
990
+ elif self.fusion_type == "iaff_1d":
991
+ self.fusion_model = iAFF(channels=64, type="1D")
992
+
993
+ self.apply(self._init_weights)
994
+
995
+ def _init_weights(self, m):
996
+ if isinstance(m, nn.Linear):
997
+ trunc_normal_(m.weight, std=0.02)
998
+ if isinstance(m, nn.Linear) and m.bias is not None:
999
+ nn.init.constant_(m.bias, 0)
1000
+ elif isinstance(m, nn.LayerNorm):
1001
+ nn.init.constant_(m.bias, 0)
1002
+ nn.init.constant_(m.weight, 1.0)
1003
+
1004
+ @torch.jit.ignore
1005
+ def no_weight_decay(self):
1006
+ return {"absolute_pos_embed"}
1007
+
1008
+ @torch.jit.ignore
1009
+ def no_weight_decay_keywords(self):
1010
+ return {"relative_position_bias_table"}
1011
+
1012
+ def forward_features(self, x, longer_idx=None):
1013
+ # A deprecated optimization for using a hierarchical output from different blocks
1014
+
1015
+ frames_num = x.shape[2]
1016
+ x = self.patch_embed(x, longer_idx=longer_idx)
1017
+ if self.ape:
1018
+ x = x + self.absolute_pos_embed
1019
+ x = self.pos_drop(x)
1020
+ for i, layer in enumerate(self.layers):
1021
+ x, attn = layer(x)
1022
+ # for x
1023
+ x = self.norm(x)
1024
+ B, N, C = x.shape
1025
+ SF = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
1026
+ ST = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]
1027
+ x = x.permute(0, 2, 1).contiguous().reshape(B, C, SF, ST)
1028
+ B, C, F, T = x.shape
1029
+ # group 2D CNN
1030
+ c_freq_bin = F // self.freq_ratio
1031
+ x = x.reshape(B, C, F // c_freq_bin, c_freq_bin, T)
1032
+ x = x.permute(0, 1, 3, 2, 4).contiguous().reshape(B, C, c_freq_bin, -1)
1033
+ # get latent_output
1034
+ fine_grained_latent_output = torch.mean(x, dim=2)
1035
+ fine_grained_latent_output = interpolate(
1036
+ fine_grained_latent_output.permute(0, 2, 1).contiguous(),
1037
+ 8 * self.patch_stride[1],
1038
+ )
1039
+
1040
+ latent_output = self.avgpool(torch.flatten(x, 2))
1041
+ latent_output = torch.flatten(latent_output, 1)
1042
+
1043
+ # display the attention map, if needed
1044
+
1045
+ x = self.tscam_conv(x)
1046
+ x = torch.flatten(x, 2) # B, C, T
1047
+
1048
+ fpx = interpolate(
1049
+ torch.sigmoid(x).permute(0, 2, 1).contiguous(), 8 * self.patch_stride[1]
1050
+ )
1051
+
1052
+ x = self.avgpool(x)
1053
+ x = torch.flatten(x, 1)
1054
+
1055
+ output_dict = {
1056
+ "framewise_output": fpx, # already sigmoided
1057
+ "clipwise_output": torch.sigmoid(x),
1058
+ "fine_grained_embedding": fine_grained_latent_output,
1059
+ "embedding": latent_output,
1060
+ }
1061
+
1062
+ return output_dict
1063
+
1064
+ def crop_wav(self, x, crop_size, spe_pos=None):
1065
+ time_steps = x.shape[2]
1066
+ tx = torch.zeros(x.shape[0], x.shape[1], crop_size, x.shape[3]).to(x.device)
1067
+ for i in range(len(x)):
1068
+ if spe_pos is None:
1069
+ crop_pos = random.randint(0, time_steps - crop_size - 1)
1070
+ else:
1071
+ crop_pos = spe_pos
1072
+ tx[i][0] = x[i, 0, crop_pos : crop_pos + crop_size, :]
1073
+ return tx
1074
+
1075
+ # Reshape the waveform spectrogram to an image-like size so the pretrained Swin Transformer can be used
1076
+ def reshape_wav2img(self, x):
1077
+ B, C, T, F = x.shape
1078
+ target_T = int(self.spec_size * self.freq_ratio)
1079
+ target_F = self.spec_size // self.freq_ratio
1080
+ assert (
1081
+ T <= target_T and F <= target_F
1082
+ ), "the wav size should be less than or equal to the swin input size"
1083
+ # to avoid bicubic zero error
1084
+ if T < target_T:
1085
+ x = nn.functional.interpolate(
1086
+ x, (target_T, x.shape[3]), mode="bicubic", align_corners=True
1087
+ )
1088
+ if F < target_F:
1089
+ x = nn.functional.interpolate(
1090
+ x, (x.shape[2], target_F), mode="bicubic", align_corners=True
1091
+ )
1092
+ x = x.permute(0, 1, 3, 2).contiguous()
1093
+ x = x.reshape(
1094
+ x.shape[0],
1095
+ x.shape[1],
1096
+ x.shape[2],
1097
+ self.freq_ratio,
1098
+ x.shape[3] // self.freq_ratio,
1099
+ )
1100
+ # print(x.shape)
1101
+ x = x.permute(0, 1, 3, 2, 4).contiguous()
1102
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3], x.shape[4])
1103
+ return x
1104
+
1105
+ # Repeat the waveform spectrogram to an image-like size so the pretrained Swin Transformer can be used
1106
+ def repeat_wat2img(self, x, cur_pos):
1107
+ B, C, T, F = x.shape
1108
+ target_T = int(self.spec_size * self.freq_ratio)
1109
+ target_F = self.spec_size // self.freq_ratio
1110
+ assert (
1111
+ T <= target_T and F <= target_F
1112
+ ), "the wav size should be less than or equal to the swin input size"
1113
+ # to avoid bicubic zero error
1114
+ if T < target_T:
1115
+ x = nn.functional.interpolate(
1116
+ x, (target_T, x.shape[3]), mode="bicubic", align_corners=True
1117
+ )
1118
+ if F < target_F:
1119
+ x = nn.functional.interpolate(
1120
+ x, (x.shape[2], target_F), mode="bicubic", align_corners=True
1121
+ )
1122
+ x = x.permute(0, 1, 3, 2).contiguous() # B C F T
1123
+ x = x[:, :, :, cur_pos : cur_pos + self.spec_size]
1124
+ x = x.repeat(repeats=(1, 1, 4, 1))
1125
+ return x
1126
+
1127
+ def forward(
1128
+ self, x: torch.Tensor, mixup_lambda=None, infer_mode=False, device=None
1129
+ ): # out_feat_keys: List[str] = None):
1130
+
1131
+ if self.enable_fusion and x["longer"].sum() == 0:
1132
+ # if no audio is longer than 10s, then randomly select one audio to be longer
1133
+ x["longer"][torch.randint(0, x["longer"].shape[0], (1,))] = True
1134
+
1135
+ if not self.enable_fusion:
1136
+ x = x["waveform"].to(device=device, non_blocking=True)
1137
+ x = self.spectrogram_extractor(x) # (batch_size, 1, time_steps, freq_bins)
1138
+ x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
1139
+ x = x.transpose(1, 3)
1140
+ x = self.bn0(x)
1141
+ x = x.transpose(1, 3)
1142
+ if self.training:
1143
+ x = self.spec_augmenter(x)
1144
+
1145
+ if self.training and mixup_lambda is not None:
1146
+ x = do_mixup(x, mixup_lambda)
1147
+
1148
+ x = self.reshape_wav2img(x)
1149
+ output_dict = self.forward_features(x)
1150
+ else:
1151
+ longer_list = x["longer"].to(device=device, non_blocking=True)
1152
+ x = x["mel_fusion"].to(device=device, non_blocking=True)
1153
+ x = x.transpose(1, 3)
1154
+ x = self.bn0(x)
1155
+ x = x.transpose(1, 3)
1156
+ longer_list_idx = torch.where(longer_list)[0]
1157
+ if self.fusion_type in ["daf_1d", "aff_1d", "iaff_1d"]:
1158
+ new_x = x[:, 0:1, :, :].clone().contiguous()
1159
+ if len(longer_list_idx) > 0:
1160
+ # local processing
1161
+ fusion_x_local = x[longer_list_idx, 1:, :, :].clone().contiguous()
1162
+ FB, FC, FT, FF = fusion_x_local.size()
1163
+ fusion_x_local = fusion_x_local.view(FB * FC, FT, FF)
1164
+ fusion_x_local = torch.permute(
1165
+ fusion_x_local, (0, 2, 1)
1166
+ ).contiguous()
1167
+ fusion_x_local = self.mel_conv1d(fusion_x_local)
1168
+ fusion_x_local = fusion_x_local.view(
1169
+ FB, FC, FF, fusion_x_local.size(-1)
1170
+ )
1171
+ fusion_x_local = (
1172
+ torch.permute(fusion_x_local, (0, 2, 1, 3))
1173
+ .contiguous()
1174
+ .flatten(2)
1175
+ )
1176
+ if fusion_x_local.size(-1) < FT:
1177
+ fusion_x_local = torch.cat(
1178
+ [
1179
+ fusion_x_local,
1180
+ torch.zeros(
1181
+ (FB, FF, FT - fusion_x_local.size(-1)),
1182
+ device=device,
1183
+ ),
1184
+ ],
1185
+ dim=-1,
1186
+ )
1187
+ else:
1188
+ fusion_x_local = fusion_x_local[:, :, :FT]
1189
+ # 1D fusion
1190
+ new_x = new_x.squeeze(1).permute((0, 2, 1)).contiguous()
1191
+ new_x[longer_list_idx] = self.fusion_model(
1192
+ new_x[longer_list_idx], fusion_x_local
1193
+ )
1194
+ x = new_x.permute((0, 2, 1)).contiguous()[:, None, :, :]
1195
+ else:
1196
+ x = new_x
1197
+
1198
+ elif self.fusion_type in ["daf_2d", "aff_2d", "iaff_2d", "channel_map"]:
1199
+ x = x # no change
1200
+
1201
+ if self.training:
1202
+ x = self.spec_augmenter(x)
1203
+ if self.training and mixup_lambda is not None:
1204
+ x = do_mixup(x, mixup_lambda)
1205
+
1206
+ x = self.reshape_wav2img(x)
1207
+ output_dict = self.forward_features(x, longer_idx=longer_list_idx)
1208
+
1209
+ # if infer_mode:
1210
+ # # in infer mode. we need to handle different length audio input
1211
+ # frame_num = x.shape[2]
1212
+ # target_T = int(self.spec_size * self.freq_ratio)
1213
+ # repeat_ratio = math.floor(target_T / frame_num)
1214
+ # x = x.repeat(repeats=(1,1,repeat_ratio,1))
1215
+ # x = self.reshape_wav2img(x)
1216
+ # output_dict = self.forward_features(x)
1217
+ # else:
1218
+ # if x.shape[2] > self.freq_ratio * self.spec_size:
1219
+ # if self.training:
1220
+ # x = self.crop_wav(x, crop_size=self.freq_ratio * self.spec_size)
1221
+ # x = self.reshape_wav2img(x)
1222
+ # output_dict = self.forward_features(x)
1223
+ # else:
1224
+ # # Change: Hard code here
1225
+ # overlap_size = (x.shape[2] - 1) // 4
1226
+ # output_dicts = []
1227
+ # crop_size = (x.shape[2] - 1) // 2
1228
+ # for cur_pos in range(0, x.shape[2] - crop_size - 1, overlap_size):
1229
+ # tx = self.crop_wav(x, crop_size = crop_size, spe_pos = cur_pos)
1230
+ # tx = self.reshape_wav2img(tx)
1231
+ # output_dicts.append(self.forward_features(tx))
1232
+ # clipwise_output = torch.zeros_like(output_dicts[0]["clipwise_output"]).float().to(x.device)
1233
+ # framewise_output = torch.zeros_like(output_dicts[0]["framewise_output"]).float().to(x.device)
1234
+ # for d in output_dicts:
1235
+ # clipwise_output += d["clipwise_output"]
1236
+ # framewise_output += d["framewise_output"]
1237
+ # clipwise_output = clipwise_output / len(output_dicts)
1238
+ # framewise_output = framewise_output / len(output_dicts)
1239
+ # output_dict = {
1240
+ # 'framewise_output': framewise_output,
1241
+ # 'clipwise_output': clipwise_output
1242
+ # }
1243
+ # else: # this part is typically used, and most easy one
1244
+ # x = self.reshape_wav2img(x)
1245
+ # output_dict = self.forward_features(x)
1246
+ # x = self.head(x)
1247
+
1248
+ # The data is already processed in the dataloader, so here we only consider input_T < fixed_T
1249
+
1250
+ return output_dict
1251
+
1252
+
1253
+ def create_htsat_model(audio_cfg, enable_fusion=False, fusion_type="None"):
1254
+ try:
1255
+
1256
+ assert audio_cfg.model_name in [
1257
+ "tiny",
1258
+ "base",
1259
+ "large",
1260
+ ], "model name for HTS-AT is wrong!"
1261
+ if audio_cfg.model_name == "tiny":
1262
+ model = HTSAT_Swin_Transformer(
1263
+ spec_size=256,
1264
+ patch_size=4,
1265
+ patch_stride=(4, 4),
1266
+ num_classes=audio_cfg.class_num,
1267
+ embed_dim=96,
1268
+ depths=[2, 2, 6, 2],
1269
+ num_heads=[4, 8, 16, 32],
1270
+ window_size=8,
1271
+ config=audio_cfg,
1272
+ enable_fusion=enable_fusion,
1273
+ fusion_type=fusion_type,
1274
+ )
1275
+ elif audio_cfg.model_name == "base":
1276
+ model = HTSAT_Swin_Transformer(
1277
+ spec_size=256,
1278
+ patch_size=4,
1279
+ patch_stride=(4, 4),
1280
+ num_classes=audio_cfg.class_num,
1281
+ embed_dim=128,
1282
+ depths=[2, 2, 12, 2],
1283
+ num_heads=[4, 8, 16, 32],
1284
+ window_size=8,
1285
+ config=audio_cfg,
1286
+ enable_fusion=enable_fusion,
1287
+ fusion_type=fusion_type,
1288
+ )
1289
+ elif audio_cfg.model_name == "large":
1290
+ model = HTSAT_Swin_Transformer(
1291
+ spec_size=256,
1292
+ patch_size=4,
1293
+ patch_stride=(4, 4),
1294
+ num_classes=audio_cfg.class_num,
1295
+ embed_dim=256,
1296
+ depths=[2, 2, 12, 2],
1297
+ num_heads=[4, 8, 16, 32],
1298
+ window_size=8,
1299
+ config=audio_cfg,
1300
+ enable_fusion=enable_fusion,
1301
+ fusion_type=fusion_type,
1302
+ )
1303
+
1304
+ return model
1305
+ except:
1306
+ raise RuntimeError(
1307
+ f"Import Model for {audio_cfg.model_name} not found, or the audio cfg parameters are not enough."
1308
+ )
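A minimal usage sketch for create_htsat_model above. The config fields mirror the attributes read in HTSAT_Swin_Transformer.__init__; the concrete values are only illustrative (CLAP-style HTSAT-tiny settings), not taken from this repository's configs, and torchlibrosa must be installed for the Spectrogram/LogmelFilterBank front end:

from types import SimpleNamespace

import torch

# hypothetical config object; field names follow the attributes accessed in __init__
audio_cfg = SimpleNamespace(
    model_name="tiny", class_num=527,
    sample_rate=48000, window_size=1024, hop_size=480,
    mel_bins=64, fmin=50, fmax=14000,
)
model = create_htsat_model(audio_cfg).eval()
batch = {
    "waveform": torch.randn(2, 48000),           # one second of audio per item
    "longer": torch.zeros(2, dtype=torch.bool),  # unused when enable_fusion=False
}
with torch.no_grad():
    out = model(batch, device="cpu")
print(out["clipwise_output"].shape)  # torch.Size([2, 527])
print(out["embedding"].shape)        # torch.Size([2, 768])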
src/audioldm/clap/open_clip/linear_probe.py ADDED
@@ -0,0 +1,66 @@
1
+ import numpy as np
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+ from .model import MLPLayers
5
+
6
+
7
+ class LinearProbe(nn.Module):
8
+ def __init__(self, model, mlp, freeze, in_ch, out_ch, act=None):
9
+ """
10
+ Args:
11
+ model: nn.Module
12
+ mlp: bool, if True, then use the MLP layer as the linear probe module
13
+ freeze: bool, if True, then freeze all the CLAP model's layers when training the linear probe
14
+ in_ch: int, the output channel from CLAP model
15
+ out_ch: int, the output channel from linear probe (class_num)
16
+ act: torch.nn.functional, the activation function before the loss function
17
+ """
18
+ super().__init__()
19
+ in_ch = 512
20
+ self.clap_model = model
21
+ self.clap_model.text_branch = None # to save memory
22
+ self.freeze = freeze
23
+ if mlp:
24
+ self.lp_layer = MLPLayers(units=[in_ch, in_ch * 2, out_ch])
25
+ else:
26
+ self.lp_layer = nn.Linear(in_ch, out_ch)
27
+
28
+ if self.freeze:
29
+ for param in self.clap_model.parameters():
30
+ param.requires_grad = False
31
+
32
+ if act == "None":
33
+ self.act = None
34
+ elif act == "relu":
35
+ self.act = nn.ReLU()
36
+ elif act == "elu":
37
+ self.act = nn.ELU()
38
+ elif act == "prelu":
39
+ self.act = nn.PReLU(num_parameters=in_ch)
40
+ elif act == "softmax":
41
+ self.act = nn.Softmax(dim=-1)
42
+ elif act == "sigmoid":
43
+ self.act = nn.Sigmoid()
44
+
45
+ def forward(self, x, mix_lambda=None, device=None):
46
+ """
47
+ Args:
48
+ x: waveform, torch.tensor [batch, t_samples] / batch of mel_spec and longer list
49
+ mix_lambda: torch.tensor [batch], the mixup lambda
50
+ Returns:
51
+ class_prob: torch.tensor [batch, class_num]
52
+
53
+ """
54
+ # keep the frozen CLAP backbone in eval mode so batch-norm statistics are not updated
55
+ if self.freeze:
56
+ self.clap_model.eval()
57
+
58
+ x = self.clap_model.audio_projection(
59
+ self.clap_model.audio_branch(x, mixup_lambda=mix_lambda, device=device)[
60
+ "embedding"
61
+ ]
62
+ )
63
+ out = self.lp_layer(x)
64
+ if self.act is not None:
65
+ out = self.act(out)
66
+ return out
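A hedged sketch of the backbone interface LinearProbe assumes. DummyCLAP below is a hypothetical stand-in, not the real CLAP model: it exposes only what the probe touches (an audio_branch whose forward returns a dict with an "embedding" entry, an audio_projection, and a text_branch that the probe discards):

import torch
from torch import nn

class DummyBranch(nn.Module):
    def __init__(self, embed_dim=512):
        super().__init__()
        self.net = nn.Linear(64, embed_dim)

    def forward(self, x, mixup_lambda=None, device=None):
        # mimic the audio-branch output format expected by LinearProbe
        return {"embedding": self.net(x)}

class DummyCLAP(nn.Module):
    def __init__(self, embed_dim=512):
        super().__init__()
        self.text_branch = nn.Identity()  # discarded by LinearProbe
        self.audio_branch = DummyBranch(embed_dim)
        self.audio_projection = nn.Linear(embed_dim, embed_dim)

probe = LinearProbe(DummyCLAP(), mlp=False, freeze=True,
                    in_ch=512, out_ch=10, act="sigmoid")
probs = probe(torch.randn(4, 64))
print(probs.shape)  # torch.Size([4, 10])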
src/audioldm/clap/open_clip/loss.py ADDED
@@ -0,0 +1,398 @@
1
+ from multiprocessing.sharedctypes import Value
2
+ import torch
3
+ import torch.distributed.nn
4
+ from torch import distributed as dist, nn as nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
8
+
9
+ try:
10
+ import horovod.torch as hvd
11
+ except ImportError:
12
+ hvd = None
13
+
14
+
15
+ def gather_features(
16
+ audio_features,
17
+ text_features,
18
+ audio_features_mlp=None,
19
+ text_features_mlp=None,
20
+ local_loss=False,
21
+ gather_with_grad=False,
22
+ rank=0,
23
+ world_size=1,
24
+ use_horovod=False,
25
+ mlp_loss=False,
26
+ ):
27
+ if use_horovod:
28
+ assert hvd is not None, "Please install horovod"
29
+ if gather_with_grad:
30
+ all_audio_features = hvd.allgather(audio_features)
31
+ all_text_features = hvd.allgather(text_features)
32
+ if mlp_loss:
33
+ all_audio_features_mlp = hvd.allgather(audio_features_mlp)
34
+ all_text_features_mlp = hvd.allgather(text_features_mlp)
35
+ else:
36
+ with torch.no_grad():
37
+ all_audio_features = hvd.allgather(audio_features)
38
+ all_text_features = hvd.allgather(text_features)
39
+ if mlp_loss:
40
+ all_audio_features_mlp = hvd.allgather(audio_features_mlp)
41
+ all_text_features_mlp = hvd.allgather(text_features_mlp)
42
+ if not local_loss:
43
+ # ensure grads for local rank when all_* features don't have a gradient
44
+ gathered_audio_features = list(
45
+ all_audio_features.chunk(world_size, dim=0)
46
+ )
47
+ gathered_text_features = list(
48
+ all_text_features.chunk(world_size, dim=0)
49
+ )
50
+ gathered_audio_features[rank] = audio_features
51
+ gathered_text_features[rank] = text_features
52
+ all_audio_features = torch.cat(gathered_audio_features, dim=0)
53
+ all_text_features = torch.cat(gathered_text_features, dim=0)
54
+ if mlp_loss:
55
+ gathered_audio_features_mlp = list(
56
+ all_audio_features_mlp.chunk(world_size, dim=0)
57
+ )
58
+ gathered_text_features_mlp = list(
59
+ all_text_features_mlp.chunk(world_size, dim=0)
60
+ )
61
+ gathered_audio_features_mlp[rank] = audio_features_mlp
62
+ gathered_text_features_mlp[rank] = text_features_mlp
63
+ all_audio_features_mlp = torch.cat(
64
+ gathered_audio_features_mlp, dim=0
65
+ )
66
+ all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0)
67
+ else:
68
+ # We gather tensors from all gpus
69
+ if gather_with_grad:
70
+ all_audio_features = torch.cat(
71
+ torch.distributed.nn.all_gather(audio_features), dim=0
72
+ )
73
+ all_text_features = torch.cat(
74
+ torch.distributed.nn.all_gather(text_features), dim=0
75
+ )
76
+ if mlp_loss:
77
+ all_audio_features_mlp = torch.cat(
78
+ torch.distributed.nn.all_gather(audio_features_mlp), dim=0
79
+ )
80
+ all_text_features_mlp = torch.cat(
81
+ torch.distributed.nn.all_gather(text_features_mlp), dim=0
82
+ )
83
+ else:
84
+ gathered_audio_features = [
85
+ torch.zeros_like(audio_features) for _ in range(world_size)
86
+ ]
87
+ gathered_text_features = [
88
+ torch.zeros_like(text_features) for _ in range(world_size)
89
+ ]
90
+ dist.all_gather(gathered_audio_features, audio_features)
91
+ dist.all_gather(gathered_text_features, text_features)
92
+ if mlp_loss:
93
+ gathered_audio_features_mlp = [
94
+ torch.zeros_like(audio_features_mlp) for _ in range(world_size)
95
+ ]
96
+ gathered_text_features_mlp = [
97
+ torch.zeros_like(text_features_mlp) for _ in range(world_size)
98
+ ]
99
+ dist.all_gather(gathered_audio_features_mlp, audio_features_mlp)
100
+ dist.all_gather(gathered_text_features_mlp, text_features_mlp)
101
+ if not local_loss:
102
+ # ensure grads for local rank when all_* features don't have a gradient
103
+ gathered_audio_features[rank] = audio_features
104
+ gathered_text_features[rank] = text_features
105
+ if mlp_loss:
106
+ gathered_audio_features_mlp[rank] = audio_features_mlp
107
+ gathered_text_features_mlp[rank] = text_features_mlp
108
+
109
+ all_audio_features = torch.cat(gathered_audio_features, dim=0)
110
+ all_text_features = torch.cat(gathered_text_features, dim=0)
111
+ if mlp_loss:
112
+ all_audio_features_mlp = torch.cat(gathered_audio_features_mlp, dim=0)
113
+ all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0)
114
+ if mlp_loss:
115
+ return (
116
+ all_audio_features,
117
+ all_text_features,
118
+ all_audio_features_mlp,
119
+ all_text_features_mlp,
120
+ )
121
+ else:
122
+ return all_audio_features, all_text_features
123
+
124
+
125
+ class ClipLoss(nn.Module):
126
+ def __init__(
127
+ self,
128
+ local_loss=False,
129
+ gather_with_grad=False,
130
+ cache_labels=False,
131
+ rank=0,
132
+ world_size=1,
133
+ use_horovod=False,
134
+ mlp_loss=False,
135
+ weight_loss_kappa=0,
136
+ ):
137
+ super().__init__()
138
+ self.local_loss = local_loss
139
+ self.gather_with_grad = gather_with_grad
140
+ self.cache_labels = cache_labels
141
+ self.rank = rank
142
+ self.world_size = world_size
143
+ self.use_horovod = use_horovod
144
+ self.mlp_loss = mlp_loss
145
+ self.weighted_loss = bool(weight_loss_kappa != 0)
146
+ self.weight_loss_kappa = weight_loss_kappa
147
+ # cache state
148
+ self.prev_num_logits = 0
149
+ self.labels = {}
150
+
151
+ def forward(
152
+ self,
153
+ audio_features,
154
+ text_features,
155
+ logit_scale_a,
156
+ logit_scale_t=None,
157
+ audio_features_mlp=None,
158
+ text_features_mlp=None,
159
+ ):
160
+ device = audio_features.device
161
+ if self.mlp_loss:
162
+ if self.world_size > 1:
163
+ (
164
+ all_audio_features,
165
+ all_text_features,
166
+ all_audio_features_mlp,
167
+ all_text_features_mlp,
168
+ ) = gather_features(
169
+ audio_features=audio_features,
170
+ text_features=text_features,
171
+ audio_features_mlp=audio_features_mlp,
172
+ text_features_mlp=text_features_mlp,
173
+ local_loss=self.local_loss,
174
+ gather_with_grad=self.gather_with_grad,
175
+ rank=self.rank,
176
+ world_size=self.world_size,
177
+ use_horovod=self.use_horovod,
178
+ mlp_loss=self.mlp_loss,
179
+ )
180
+ if self.local_loss:
181
+ a_logits_per_audio = (
182
+ logit_scale_a * audio_features @ all_text_features_mlp.T
183
+ )
184
+ a_logits_per_text = (
185
+ logit_scale_a * text_features_mlp @ all_audio_features.T
186
+ )
187
+ t_logits_per_audio = (
188
+ logit_scale_t * audio_features_mlp @ all_text_features.T
189
+ )
190
+ t_logits_per_text = (
191
+ logit_scale_t * text_features @ all_audio_features_mlp.T
192
+ )
193
+ else:
194
+ a_logits_per_audio = (
195
+ logit_scale_a * all_audio_features @ all_text_features_mlp.T
196
+ )
197
+ a_logits_per_text = a_logits_per_audio.T
198
+ t_logits_per_audio = (
199
+ logit_scale_t * all_audio_features_mlp @ all_text_features.T
200
+ )
201
+ t_logits_per_text = t_logits_per_audio.T
202
+ else:
203
+ a_logits_per_audio = (
204
+ logit_scale_a * audio_features @ text_features_mlp.T
205
+ )
206
+ a_logits_per_text = logit_scale_a * text_features_mlp @ audio_features.T
207
+ t_logits_per_audio = (
208
+ logit_scale_t * audio_features_mlp @ text_features.T
209
+ )
210
+ t_logits_per_text = logit_scale_t * text_features @ audio_features_mlp.T
211
+
212
+ # calculate the ground-truth labels and cache them if enabled
213
+ num_logits = a_logits_per_audio.shape[0]
214
+ if self.prev_num_logits != num_logits or device not in self.labels:
215
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
216
+ if self.world_size > 1 and self.local_loss:
217
+ labels = labels + num_logits * self.rank
218
+ if self.cache_labels:
219
+ self.labels[device] = labels
220
+ self.prev_num_logits = num_logits
221
+ else:
222
+ labels = self.labels[device]
223
+
224
+ if not self.weighted_loss:
225
+ total_loss = (
226
+ F.cross_entropy(a_logits_per_audio, labels)
227
+ + F.cross_entropy(a_logits_per_text, labels)
228
+ + F.cross_entropy(t_logits_per_audio, labels)
229
+ + F.cross_entropy(t_logits_per_text, labels)
230
+ ) / 4
231
+ else:
232
+ audio_weight = (audio_features @ audio_features.T).detach()
233
+ audio_weight = (
234
+ torch.exp(
235
+ torch.sum(audio_weight, axis=1)
236
+ / (self.weight_loss_kappa * len(audio_weight))
237
+ )
238
+ ).detach()
239
+ text_weight = (text_features @ text_features.T).detach()
240
+ text_weight = (
241
+ torch.exp(
242
+ torch.sum(text_weight, axis=1)
243
+ / (self.weight_loss_kappa * len(text_features))
244
+ )
245
+ ).detach()
246
+ total_loss = (
247
+ F.cross_entropy(a_logits_per_audio, labels, weight=audio_weight)
248
+ + F.cross_entropy(a_logits_per_text, labels, weight=audio_weight)
249
+ + F.cross_entropy(t_logits_per_audio, labels, weight=text_weight)
250
+ + F.cross_entropy(t_logits_per_text, labels, weight=text_weight)
251
+ ) / 4
252
+ else:
253
+ if self.world_size > 1:
254
+ all_audio_features, all_text_features = gather_features(
255
+ audio_features=audio_features,
256
+ text_features=text_features,
257
+ local_loss=self.local_loss,
258
+ gather_with_grad=self.gather_with_grad,
259
+ rank=self.rank,
260
+ world_size=self.world_size,
261
+ use_horovod=self.use_horovod,
262
+ mlp_loss=self.mlp_loss,
263
+ )
264
+
265
+ if self.local_loss:
266
+ logits_per_audio = (
267
+ logit_scale_a * audio_features @ all_text_features.T
268
+ )
269
+ logits_per_text = (
270
+ logit_scale_a * text_features @ all_audio_features.T
271
+ )
272
+ else:
273
+ logits_per_audio = (
274
+ logit_scale_a * all_audio_features @ all_text_features.T
275
+ )
276
+ logits_per_text = logits_per_audio.T
277
+ else:
278
+ logits_per_audio = logit_scale_a * audio_features @ text_features.T
279
+ logits_per_text = logit_scale_a * text_features @ audio_features.T
280
+
281
+ # calculated ground-truth and cache if enabled
282
+ # calculate the ground-truth labels and cache them if enabled
283
+ if self.prev_num_logits != num_logits or device not in self.labels:
284
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
285
+ if self.world_size > 1 and self.local_loss:
286
+ labels = labels + num_logits * self.rank
287
+ if self.cache_labels:
288
+ self.labels[device] = labels
289
+ self.prev_num_logits = num_logits
290
+ else:
291
+ labels = self.labels[device]
292
+ if not self.weighted_loss:
293
+ total_loss = (
294
+ F.cross_entropy(logits_per_audio, labels)
295
+ + F.cross_entropy(logits_per_text, labels)
296
+ ) / 2
297
+ else:
298
+ audio_weight = (all_audio_features @ all_audio_features.T).detach()
299
+ audio_weight = (
300
+ torch.exp(
301
+ torch.sum(audio_weight, axis=1)
302
+ / (self.weight_loss_kappa * len(all_audio_features))
303
+ )
304
+ ).detach()
305
+ text_weight = (all_text_features @ all_text_features.T).detach()
306
+ text_weight = (
307
+ torch.exp(
308
+ torch.sum(text_weight, axis=1)
309
+ / (self.weight_loss_kappa * len(all_text_features))
310
+ )
311
+ ).detach()
312
+ total_loss = (
313
+ F.cross_entropy(logits_per_audio, labels, weight=text_weight)
314
+ + F.cross_entropy(logits_per_text, labels, weight=audio_weight)
315
+ ) / 2
316
+ return total_loss
317
+
318
+
319
+ def lp_gather_features(pred, target, world_size=1, use_horovod=False):
320
+ if use_horovod:
321
+ assert hvd is not None, "Please install horovod"
322
+ with torch.no_grad():
323
+ all_preds = hvd.allgather(pred)
324
+ all_targets = hvd.allgather(target)
325
+ else:
326
+ gathered_preds = [torch.zeros_like(pred) for _ in range(world_size)]
327
+ gathered_targets = [torch.zeros_like(target) for _ in range(world_size)]
328
+
329
+ dist.all_gather(gathered_preds, pred)
330
+ dist.all_gather(gathered_targets, target)
331
+ all_preds = torch.cat(gathered_preds, dim=0)
332
+ all_targets = torch.cat(gathered_targets, dim=0)
333
+
334
+ return all_preds, all_targets
335
+
336
+
337
+ def get_map(pred, target):
338
+ pred = torch.sigmoid(pred).numpy()
339
+ target = target.numpy()
340
+ return np.mean(average_precision_score(target, pred, average=None))
341
+
342
+
343
+ def get_acc(pred, target):
344
+ pred = torch.argmax(pred, 1).numpy()
345
+ target = torch.argmax(target, 1).numpy()
346
+ return accuracy_score(target, pred)
347
+
348
+
349
+ def get_mauc(pred, target):
350
+ pred = torch.sigmoid(pred).numpy()
351
+ target = target.numpy()
352
+ return np.mean(roc_auc_score(target, pred, average=None))
353
+
354
+
355
+ class LPMetrics(object):
356
+ def __init__(self, metric_names=["map", "acc", "mauc"]):
357
+ self.metrics = []
358
+ for name in metric_names:
359
+ self.metrics.append(self.get_metric(name))
360
+ self.metric_names = metric_names
361
+
362
+ def get_metric(self, name):
363
+ if name == "map":
364
+ return get_map
365
+ elif name == "acc":
366
+ return get_acc
367
+ elif name == "mauc":
368
+ return get_mauc
369
+ else:
370
+ raise ValueError(f"the metric should be at least one of [map, acc, mauc]")
371
+
372
+ def evaluate_mertics(self, pred, target):
373
+ metric_dict = {}
374
+ for i in range(len(self.metric_names)):
375
+ metric_dict[self.metric_names[i]] = self.metrics[i](pred, target)
376
+ return metric_dict
377
+
378
+
379
+ def calc_celoss(pred, target):
380
+ target = torch.argmax(target, 1).long()
381
+ return nn.CrossEntropyLoss()(pred, target)
382
+
383
+
384
+ class LPLoss(nn.Module):
385
+ def __init__(self, loss_name):
386
+ super().__init__()
387
+ if loss_name == "bce":
388
+ self.loss_func = nn.BCEWithLogitsLoss()
389
+ elif loss_name == "ce":
390
+ self.loss_func = calc_celoss
391
+ elif loss_name == "mse":
392
+ self.loss_func = nn.MSELoss()
393
+ else:
394
+ raise ValueError(f"the loss func should be at least one of [bce, ce, mse]")
395
+
396
+ def forward(self, pred, target):
397
+ loss = self.loss_func(pred, target)
398
+ return loss
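A minimal single-process sketch of ClipLoss in its default (non-MLP, unweighted) configuration; the random unit-normalized tensors stand in for real CLAP audio/text embeddings, and logit_scale_a plays the role of the model's learnable temperature:

import torch
import torch.nn.functional as F

audio_feat = F.normalize(torch.randn(8, 512), dim=-1)
text_feat = F.normalize(torch.randn(8, 512), dim=-1)
loss_fn = ClipLoss(local_loss=False, world_size=1)
loss = loss_fn(audio_feat, text_feat, logit_scale_a=torch.tensor(100.0))
print(loss.item())  # symmetric audio->text / text->audio cross-entropy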
src/audioldm/clap/open_clip/model.py ADDED
@@ -0,0 +1,936 @@
1
+ """ CLAP Model
2
+
3
+ Adapted from CLIP: https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ Adapted to the Audio Task.
5
+ """
6
+
7
+ from collections import OrderedDict
8
+ from dataclasses import dataclass
9
+ from email.mime import audio
10
+ from typing import Tuple, Union, Callable, Optional
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+
17
+ from .timm_model import TimmModel
18
+ import logging
19
+ from .utils import freeze_batch_norm_2d
20
+
21
+ from .pann_model import create_pann_model
22
+ from .htsat import create_htsat_model
23
+ from transformers import BertModel, RobertaModel, BartModel
24
+ from transformers.tokenization_utils_base import BatchEncoding
25
+
26
+
27
+ class MLPLayers(nn.Module):
28
+ def __init__(self, units=[512, 512, 512], nonlin=nn.ReLU(), dropout=0.1):
29
+ super(MLPLayers, self).__init__()
30
+ self.nonlin = nonlin
31
+ self.dropout = dropout
32
+
33
+ sequence = []
34
+ for u0, u1 in zip(units[:-1], units[1:]):
35
+ sequence.append(nn.Linear(u0, u1))
36
+ sequence.append(self.nonlin)
37
+ sequence.append(nn.Dropout(self.dropout))
38
+ sequence = sequence[:-2]
39
+
40
+ self.sequential = nn.Sequential(*sequence)
41
+
42
+ def forward(self, X):
43
+ X = self.sequential(X)
44
+ return X
45
+
46
+
47
+ class Bottleneck(nn.Module):
48
+ expansion = 4
49
+
50
+ def __init__(self, inplanes, planes, stride=1):
51
+ super().__init__()
52
+
53
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
54
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
55
+ self.bn1 = nn.BatchNorm2d(planes)
56
+
57
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
58
+ self.bn2 = nn.BatchNorm2d(planes)
59
+
60
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
61
+
62
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
63
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
64
+
65
+ self.relu = nn.ReLU(inplace=True)
66
+ self.downsample = None
67
+ self.stride = stride
68
+
69
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
70
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
71
+ self.downsample = nn.Sequential(
72
+ OrderedDict(
73
+ [
74
+ ("-1", nn.AvgPool2d(stride)),
75
+ (
76
+ "0",
77
+ nn.Conv2d(
78
+ inplanes,
79
+ planes * self.expansion,
80
+ 1,
81
+ stride=1,
82
+ bias=False,
83
+ ),
84
+ ),
85
+ ("1", nn.BatchNorm2d(planes * self.expansion)),
86
+ ]
87
+ )
88
+ )
89
+
90
+ def forward(self, x: torch.Tensor):
91
+ identity = x
92
+
93
+ out = self.relu(self.bn1(self.conv1(x)))
94
+ out = self.relu(self.bn2(self.conv2(out)))
95
+ out = self.avgpool(out)
96
+ out = self.bn3(self.conv3(out))
97
+
98
+ if self.downsample is not None:
99
+ identity = self.downsample(x)
100
+
101
+ out += identity
102
+ out = self.relu(out)
103
+ return out
104
+
105
+
106
+ class AttentionPool2d(nn.Module):
107
+ def __init__(
108
+ self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None
109
+ ):
110
+ super().__init__()
111
+ self.positional_embedding = nn.Parameter(
112
+ torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5
113
+ )
114
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
115
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
116
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
117
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
118
+ self.num_heads = num_heads
119
+
120
+ def forward(self, x):
121
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(
122
+ 2, 0, 1
123
+ ) # NCHW -> (HW)NC
124
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
125
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
126
+ x, _ = F.multi_head_attention_forward(
127
+ query=x,
128
+ key=x,
129
+ value=x,
130
+ embed_dim_to_check=x.shape[-1],
131
+ num_heads=self.num_heads,
132
+ q_proj_weight=self.q_proj.weight,
133
+ k_proj_weight=self.k_proj.weight,
134
+ v_proj_weight=self.v_proj.weight,
135
+ in_proj_weight=None,
136
+ in_proj_bias=torch.cat(
137
+ [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]
138
+ ),
139
+ bias_k=None,
140
+ bias_v=None,
141
+ add_zero_attn=False,
142
+ dropout_p=0,
143
+ out_proj_weight=self.c_proj.weight,
144
+ out_proj_bias=self.c_proj.bias,
145
+ use_separate_proj_weight=True,
146
+ training=self.training,
147
+ need_weights=False,
148
+ )
149
+
150
+ return x[0]
151
+
152
+
153
+ class ModifiedResNet(nn.Module):
154
+ """
155
+ A ResNet class that is similar to torchvision's but contains the following changes:
156
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
157
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
158
+ - The final pooling layer is a QKV attention instead of an average pool
159
+ """
160
+
161
+ def __init__(self, layers, output_dim, heads, image_size=224, width=64):
162
+ super().__init__()
163
+ self.output_dim = output_dim
164
+ self.image_size = image_size
165
+
166
+ # the 3-layer stem
167
+ self.conv1 = nn.Conv2d(
168
+ 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
169
+ )
170
+ self.bn1 = nn.BatchNorm2d(width // 2)
171
+ self.conv2 = nn.Conv2d(
172
+ width // 2, width // 2, kernel_size=3, padding=1, bias=False
173
+ )
174
+ self.bn2 = nn.BatchNorm2d(width // 2)
175
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
176
+ self.bn3 = nn.BatchNorm2d(width)
177
+ self.avgpool = nn.AvgPool2d(2)
178
+ self.relu = nn.ReLU(inplace=True)
179
+
180
+ # residual layers
181
+ self._inplanes = width # this is a *mutable* variable used during construction
182
+ self.layer1 = self._make_layer(width, layers[0])
183
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
184
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
185
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
186
+
187
+ embed_dim = width * 32 # the ResNet feature dimension
188
+ self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim)
189
+
190
+ self.init_parameters()
191
+
192
+ def _make_layer(self, planes, blocks, stride=1):
193
+ layers = [Bottleneck(self._inplanes, planes, stride)]
194
+
195
+ self._inplanes = planes * Bottleneck.expansion
196
+ for _ in range(1, blocks):
197
+ layers.append(Bottleneck(self._inplanes, planes))
198
+
199
+ return nn.Sequential(*layers)
200
+
201
+ def init_parameters(self):
202
+ if self.attnpool is not None:
203
+ std = self.attnpool.c_proj.in_features**-0.5
204
+ nn.init.normal_(self.attnpool.q_proj.weight, std=std)
205
+ nn.init.normal_(self.attnpool.k_proj.weight, std=std)
206
+ nn.init.normal_(self.attnpool.v_proj.weight, std=std)
207
+ nn.init.normal_(self.attnpool.c_proj.weight, std=std)
208
+
209
+ for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]:
210
+ for name, param in resnet_block.named_parameters():
211
+ if name.endswith("bn3.weight"):
212
+ nn.init.zeros_(param)
213
+
214
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
215
+ assert (
216
+ unlocked_groups == 0
217
+ ), "partial locking not currently supported for this model"
218
+ for param in self.parameters():
219
+ param.requires_grad = False
220
+ if freeze_bn_stats:
221
+ freeze_batch_norm_2d(self)
222
+
223
+ def stem(self, x):
224
+ for conv, bn in [
225
+ (self.conv1, self.bn1),
226
+ (self.conv2, self.bn2),
227
+ (self.conv3, self.bn3),
228
+ ]:
229
+ x = self.relu(bn(conv(x)))
230
+ x = self.avgpool(x)
231
+ return x
232
+
233
+ def forward(self, x):
234
+ x = self.stem(x)
235
+ x = self.layer1(x)
236
+ x = self.layer2(x)
237
+ x = self.layer3(x)
238
+ x = self.layer4(x)
239
+ x = self.attnpool(x)
240
+
241
+ return x
242
+
243
+
244
+ class LayerNorm(nn.LayerNorm):
245
+ """Subclass torch's LayerNorm to handle fp16."""
246
+
247
+ def forward(self, x: torch.Tensor):
248
+ orig_type = x.dtype
249
+ x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
250
+ return x.to(orig_type)
251
+
252
+
253
+ class QuickGELU(nn.Module):
254
+ # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
255
+ def forward(self, x: torch.Tensor):
256
+ return x * torch.sigmoid(1.702 * x)
257
+
258
+
259
+ class ResidualAttentionBlock(nn.Module):
260
+ def __init__(self, d_model: int, n_head: int, act_layer: Callable = nn.GELU):
261
+ super().__init__()
262
+
263
+ self.attn = nn.MultiheadAttention(d_model, n_head)
264
+ self.ln_1 = LayerNorm(d_model)
265
+ self.mlp = nn.Sequential(
266
+ OrderedDict(
267
+ [
268
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
269
+ ("gelu", act_layer()),
270
+ ("c_proj", nn.Linear(d_model * 4, d_model)),
271
+ ]
272
+ )
273
+ )
274
+ self.ln_2 = LayerNorm(d_model)
275
+
276
+ def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
277
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
278
+
279
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
280
+ x = x + self.attention(self.ln_1(x), attn_mask=attn_mask)
281
+ x = x + self.mlp(self.ln_2(x))
282
+ return x
283
+
284
+
285
+ class Transformer(nn.Module):
286
+ def __init__(
287
+ self, width: int, layers: int, heads: int, act_layer: Callable = nn.GELU
288
+ ):
289
+ super().__init__()
290
+ self.width = width
291
+ self.layers = layers
292
+ self.resblocks = nn.ModuleList(
293
+ [
294
+ ResidualAttentionBlock(width, heads, act_layer=act_layer)
295
+ for _ in range(layers)
296
+ ]
297
+ )
298
+
299
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
300
+ for r in self.resblocks:
301
+ x = r(x, attn_mask=attn_mask)
302
+ return x
303
+
304
+
305
+ class VisualTransformer(nn.Module):
306
+ def __init__(
307
+ self,
308
+ image_size: int,
309
+ patch_size: int,
310
+ width: int,
311
+ layers: int,
312
+ heads: int,
313
+ output_dim: int,
314
+ act_layer: Callable = nn.GELU,
315
+ ):
316
+ super().__init__()
317
+ self.image_size = image_size
318
+ self.output_dim = output_dim
319
+ self.conv1 = nn.Conv2d(
320
+ in_channels=3,
321
+ out_channels=width,
322
+ kernel_size=patch_size,
323
+ stride=patch_size,
324
+ bias=False,
325
+ )
326
+
327
+ scale = width**-0.5
328
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
329
+ self.positional_embedding = nn.Parameter(
330
+ scale * torch.randn((image_size // patch_size) ** 2 + 1, width)
331
+ )
332
+ self.ln_pre = LayerNorm(width)
333
+
334
+ self.text_branch = Transformer(width, layers, heads, act_layer=act_layer)
335
+
336
+ self.ln_post = LayerNorm(width)
337
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
338
+
339
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
340
+ assert (
341
+ unlocked_groups == 0
342
+ ), "partial locking not currently supported for this model"
343
+ for param in self.parameters():
344
+ param.requires_grad = False
345
+
346
+ def forward(self, x: torch.Tensor):
347
+ x = self.conv1(x) # shape = [*, width, grid, grid]
348
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
349
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
350
+ x = torch.cat(
351
+ [
352
+ self.class_embedding.to(x.dtype)
353
+ + torch.zeros(
354
+ x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device
355
+ ),
356
+ x,
357
+ ],
358
+ dim=1,
359
+ ) # shape = [*, grid ** 2 + 1, width]
360
+ x = x + self.positional_embedding.to(x.dtype)
361
+ x = self.ln_pre(x)
362
+
363
+ x = x.permute(1, 0, 2) # NLD -> LND
364
+ x = self.text_branch(x)
365
+ x = x.permute(1, 0, 2) # LND -> NLD
366
+
367
+ x = self.ln_post(x[:, 0, :])
368
+
369
+ if self.proj is not None:
370
+ x = x @ self.proj
371
+
372
+ return x
373
+
374
+
375
+ @dataclass
376
+ class CLAPVisionCfg:
377
+ layers: Union[Tuple[int, int, int, int], int] = 12
378
+ width: int = 768
379
+ patch_size: int = 16
380
+ image_size: Union[Tuple[int, int], int] = 224
381
+ timm_model_name: str = (
382
+ None # a valid model name overrides layers, width, patch_size
383
+ )
384
+ timm_model_pretrained: bool = (
385
+ False # use (imagenet) pretrained weights for named model
386
+ )
387
+ timm_pool: str = (
388
+ "avg" # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
389
+ )
390
+ timm_proj: str = (
391
+ "linear" # linear projection for timm model output ('linear', 'mlp', '')
392
+ )
393
+
394
+
395
+ # Audio Config Class
396
+ @dataclass
397
+ class CLAPAudioCfp:
398
+ model_type: str = "PANN"
399
+ model_name: str = "Cnn14"
400
+ sample_rate: int = 48000
401
+ # Param
402
+ audio_length: int = 1024
403
+ window_size: int = 1024
404
+ hop_size: int = 1024
405
+ fmin: int = 50
406
+ fmax: int = 14000
407
+ class_num: int = 527
408
+ mel_bins: int = 64
409
+ clip_samples: int = 480000
410
+
411
+
412
+ @dataclass
413
+ class CLAPTextCfg:
414
+ context_length: int
415
+ vocab_size: int
416
+ width: int
417
+ heads: int
418
+ layers: int
419
+ model_type: str
420
+
421
+
422
+ class CLAP(nn.Module):
423
+ def __init__(
424
+ self,
425
+ embed_dim: int,
426
+ audio_cfg: CLAPAudioCfp,
427
+ text_cfg: CLAPTextCfg,
428
+ quick_gelu: bool = False,
429
+ enable_fusion: bool = False,
430
+ fusion_type: str = "None",
431
+ joint_embed_shape: int = 512,
432
+ mlp_act: str = "relu",
433
+ ):
434
+ super().__init__()
435
+ if isinstance(audio_cfg, dict):
436
+ audio_cfg = CLAPAudioCfp(**audio_cfg)
437
+ if isinstance(text_cfg, dict):
438
+ text_cfg = CLAPTextCfg(**text_cfg)
439
+
440
+ self.audio_cfg = audio_cfg
441
+ self.text_cfg = text_cfg
442
+ self.enable_fusion = enable_fusion
443
+ self.fusion_type = fusion_type
444
+ self.joint_embed_shape = joint_embed_shape
445
+ self.mlp_act = mlp_act
446
+
447
+ self.context_length = text_cfg.context_length
448
+
449
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
450
+ # memory efficient in recent PyTorch releases (>= 1.10).
451
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
452
+ act_layer = QuickGELU if quick_gelu else nn.GELU
453
+
454
+ if mlp_act == "relu":
455
+ mlp_act_layer = nn.ReLU()
456
+ elif mlp_act == "gelu":
457
+ mlp_act_layer = nn.GELU()
458
+ else:
459
+ raise NotImplementedError
460
+
461
+ # audio branch
462
+ # audio branch parameters
463
+ if audio_cfg.model_type == "PANN":
464
+ self.audio_branch = create_pann_model(audio_cfg, enable_fusion, fusion_type)
465
+ elif audio_cfg.model_type == "HTSAT":
466
+ self.audio_branch = create_htsat_model(
467
+ audio_cfg, enable_fusion, fusion_type
468
+ )
469
+ else:
470
+ logging.error(f"Model config for {audio_cfg.model_type} not found")
471
+ raise RuntimeError(f"Model config for {audio_cfg.model_type} not found.")
472
+
473
+ # text branch
474
+ # text branch parameters
475
+ if text_cfg.model_type == "transformer":
476
+ self.text_branch = Transformer(
477
+ width=text_cfg.width,
478
+ layers=text_cfg.layers,
479
+ heads=text_cfg.heads,
480
+ act_layer=act_layer,
481
+ )
482
+ self.vocab_size = text_cfg.vocab_size
483
+ self.token_embedding = nn.Embedding(text_cfg.vocab_size, text_cfg.width)
484
+ self.positional_embedding = nn.Parameter(
485
+ torch.empty(self.context_length, text_cfg.width)
486
+ )
487
+ self.ln_final = LayerNorm(text_cfg.width)
488
+ self.text_transform = MLPLayers(
489
+ units=[
490
+ self.joint_embed_shape,
491
+ self.joint_embed_shape,
492
+ self.joint_embed_shape,
493
+ ],
494
+ dropout=0.1,
495
+ )
496
+ self.text_projection = nn.Sequential(
497
+ nn.Linear(text_cfg.width, self.joint_embed_shape),
498
+ mlp_act_layer,
499
+ nn.Linear(self.joint_embed_shape, self.joint_embed_shape),
500
+ )
501
+ elif text_cfg.model_type == "bert":
502
+ self.text_branch = BertModel.from_pretrained("bert-base-uncased")
503
+ self.text_transform = MLPLayers(
504
+ units=[
505
+ self.joint_embed_shape,
506
+ self.joint_embed_shape,
507
+ self.joint_embed_shape,
508
+ ],
509
+ dropout=0.1,
510
+ )
511
+ self.text_projection = nn.Sequential(
512
+ nn.Linear(768, self.joint_embed_shape),
513
+ mlp_act_layer,
514
+ nn.Linear(self.joint_embed_shape, self.joint_embed_shape),
515
+ )
516
+ elif text_cfg.model_type == "roberta":
517
+ self.text_branch = RobertaModel.from_pretrained("roberta-base")
518
+ self.text_transform = MLPLayers(
519
+ units=[
520
+ self.joint_embed_shape,
521
+ self.joint_embed_shape,
522
+ self.joint_embed_shape,
523
+ ],
524
+ dropout=0.1,
525
+ )
526
+ self.text_projection = nn.Sequential(
527
+ nn.Linear(768, self.joint_embed_shape),
528
+ mlp_act_layer,
529
+ nn.Linear(self.joint_embed_shape, self.joint_embed_shape),
530
+ )
531
+ elif text_cfg.model_type == "bart":
532
+ self.text_branch = BartModel.from_pretrained("facebook/bart-base")
533
+ self.text_transform = MLPLayers(
534
+ units=[
535
+ self.joint_embed_shape,
536
+ self.joint_embed_shape,
537
+ self.joint_embed_shape,
538
+ ],
539
+ dropout=0.1,
540
+ )
541
+ self.text_projection = nn.Sequential(
542
+ nn.Linear(768, self.joint_embed_shape),
543
+ mlp_act_layer,
544
+ nn.Linear(self.joint_embed_shape, self.joint_embed_shape),
545
+ )
546
+ else:
547
+ logging.error(f"Model config for {text_cfg.model_type} not found")
548
+ raise RuntimeError(f"Model config for {text_cfg.model_type} not found.")
549
+ self.text_branch_type = text_cfg.model_type
550
+ # text branch parameters
551
+
552
+ # audio branch parameters
553
+ self.audio_transform = MLPLayers(
554
+ units=[
555
+ self.joint_embed_shape,
556
+ self.joint_embed_shape,
557
+ self.joint_embed_shape,
558
+ ],
559
+ dropout=0.1,
560
+ )
561
+
562
+ # below here is text branch parameters
563
+
564
+ # ============================================================================================================
565
+ self.audio_projection = nn.Sequential(
566
+ nn.Linear(embed_dim, self.joint_embed_shape),
567
+ mlp_act_layer,
568
+ nn.Linear(self.joint_embed_shape, self.joint_embed_shape),
569
+ )
570
+
571
+ self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
572
+ self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
573
+ self.register_buffer("attn_mask", self.build_attention_mask(), persistent=False)
574
+
575
+ self.init_text_branch_parameters()
576
+
577
+ def init_text_branch_parameters(self):
578
+ if self.text_branch_type == "transformer":
579
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
580
+ nn.init.normal_(self.positional_embedding, std=0.01)
581
+ proj_std = (self.text_branch.width**-0.5) * (
582
+ (2 * self.text_branch.layers) ** -0.5
583
+ )
584
+ attn_std = self.text_branch.width**-0.5
585
+ fc_std = (2 * self.text_branch.width) ** -0.5
586
+ for block in self.text_branch.resblocks:
587
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
588
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
589
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
590
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
591
+ if self.text_branch_type == "bert" or self.text_branch_type == "roberta":
592
+ width = self.text_branch.embeddings.word_embeddings.weight.shape[-1]
593
+ elif self.text_branch_type == "bart":
594
+ width = self.text_branch.shared.weight.shape[-1]
595
+ else:
596
+ width = self.text_branch.width
597
+ nn.init.constant_(self.logit_scale_a, np.log(1 / 0.07))
598
+ nn.init.constant_(self.logit_scale_t, np.log(1 / 0.07))
599
+
600
+ # deprecated
601
+ # if hasattr(self.visual, 'init_parameters'):
602
+ # self.visual.init_parameters()
603
+
604
+ # if self.text_projection is not None:
605
+ # nn.init.normal_(self.text_projection, std=width**-0.5)
606
+
607
+ def build_attention_mask(self):
608
+ # lazily create causal attention mask, with full attention between the vision tokens
609
+ # pytorch uses additive attention mask; fill with -inf
610
+ mask = torch.empty(self.context_length, self.context_length)
611
+ mask.fill_(float("-inf"))
612
+ mask.triu_(1) # zero out the lower diagonal
613
+ return mask
614
+
615
+ def encode_audio(self, audio, device):
616
+ return self.audio_branch(
617
+ audio, mixup_lambda=None, device=device
618
+ ) # mix lambda needs to add
619
+
620
+ # def list_of_dict_of_tensor2dict_of_tensor(self, x, device):
621
+ # tmp = {}
622
+ # for k in x[0].keys():
623
+ # tmp[k] = []
624
+ # for i in range(len(x)):
625
+ # tmp[k].append(x[i][k][:77])
626
+ # for k in x[0].keys():
627
+ # tmp[k] = torch.tensor(tmp[k]).to(device=device, non_blocking=True)
628
+ # return tmp
629
+
630
+ def encode_text(self, text, device):
631
+ if self.text_branch_type == "transformer":
632
+ text = text.to(device=device, non_blocking=True)
633
+ x = self.token_embedding(text) # [batch_size, n_ctx, d_model]
634
+
635
+ x = x + self.positional_embedding
636
+ x = x.permute(1, 0, 2) # NLD -> LND
637
+ x = self.text_branch(x, attn_mask=self.attn_mask)
638
+ x = x.permute(1, 0, 2) # LND -> NLD
639
+ x = self.ln_final(x)
640
+
641
+ # x.shape = [batch_size, n_ctx, transformer.width]
642
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
643
+ x = self.text_projection(x[torch.arange(x.shape[0]), text.argmax(dim=-1)])
644
+ elif self.text_branch_type == "bert":
645
+ # text = self.list_of_dict_of_tensor2dict_of_tensor(text, device)
646
+ # text = BatchEncoding(text)
647
+ x = self.text_branch(
648
+ input_ids=text["input_ids"].to(device=device, non_blocking=True),
649
+ attention_mask=text["attention_mask"].to(
650
+ device=device, non_blocking=True
651
+ ),
652
+ token_type_ids=text["token_type_ids"].to(
653
+ device=device, non_blocking=True
654
+ ),
655
+ )["pooler_output"]
656
+ x = self.text_projection(x)
657
+ elif self.text_branch_type == "roberta":
658
+ x = self.text_branch(
659
+ input_ids=text["input_ids"].to(device=device, non_blocking=True),
660
+ attention_mask=text["attention_mask"].to(
661
+ device=device, non_blocking=True
662
+ ),
663
+ )["pooler_output"]
664
+ x = self.text_projection(x)
665
+ elif self.text_branch_type == "bart":
666
+ x = torch.mean(
667
+ self.text_branch(
668
+ input_ids=text["input_ids"].to(device=device, non_blocking=True),
669
+ attention_mask=text["attention_mask"].to(
670
+ device=device, non_blocking=True
671
+ ),
672
+ )["encoder_last_hidden_state"],
673
+ axis=1,
674
+ )
675
+ x = self.text_projection(x)
676
+ else:
677
+ logging.error(f"Model type {self.text_branch_type} not found")
678
+ raise RuntimeError(f"Model type {self.text_branch_type} not found.")
679
+ return x
680
+
681
+ def forward(self, audio, text, device=None):
682
+ """Forward audio and text into the CLAP
683
+
684
+ Parameters
685
+ ----------
686
+ audio: torch.Tensor (batch_size, audio_length)
687
+ the time-domain audio input, or the batched dict of mel spectrograms and "longer" flags when fusion is enabled.
688
+ text: torch.Tensor () // need to add
689
+ the text token input
690
+ """
691
+ if device is None:
692
+ if audio is not None:
693
+ device = audio.device
694
+ elif text is not None:
695
+ device = text.device
696
+ if audio is None and text is None:
697
+ # a hack to get the logit scale
698
+ return self.logit_scale_a.exp(), self.logit_scale_t.exp()
699
+ elif audio is None:
700
+ return self.encode_text(text, device=device)
701
+ elif text is None:
702
+ return self.audio_projection(
703
+ self.encode_audio(audio, device=device)["embedding"]
704
+ )
705
+ audio_features = self.audio_projection(
706
+ self.encode_audio(audio, device=device)["embedding"]
707
+ )
708
+ audio_features = F.normalize(audio_features, dim=-1)
709
+
710
+ text_features = self.encode_text(text, device=device)
711
+ # print("text_features", text_features)
712
+ # print("text_features.shape", text_features.shape)
713
+ # print("text_features.type", type(text_features))
714
+ text_features = F.normalize(text_features, dim=-1)
715
+
716
+ audio_features_mlp = self.audio_transform(audio_features)
717
+ text_features_mlp = self.text_transform(text_features)
718
+ # Four outputs: audio features (basic & MLP), text features (basic & MLP)
719
+ return (
720
+ audio_features,
721
+ text_features,
722
+ audio_features_mlp,
723
+ text_features_mlp,
724
+ self.logit_scale_a.exp(),
725
+ self.logit_scale_t.exp(),
726
+ )
727
+
728
+ def get_logit_scale(self):
729
+ return self.logit_scale_a.exp(), self.logit_scale_t.exp()
730
+
731
+ def get_text_embedding(self, data):
732
+ """Get the text embedding from the model
733
+
734
+ Parameters
735
+ ----------
736
+ data: dict
737
+ the tokenized text input (e.g. "input_ids", "attention_mask") for a batch of captions
738
+
739
+ Returns
740
+ ----------
741
+ text_embed: torch.Tensor
742
+ a tensor of text_embeds (N, D)
743
+
744
+ """
745
+ device = next(self.parameters()).device
746
+ for k in data:
747
+ data[k] = data[k].to(device)
748
+ if len(data[k].size()) < 2:
749
+ data[k] = data[k].unsqueeze(0)
750
+ text_embeds = self.encode_text(data, device=device)
751
+ text_embeds = F.normalize(text_embeds, dim=-1)
752
+
753
+ return text_embeds
754
+
755
+ def get_audio_embedding(self, data):
756
+ """Get the audio embedding from the model
757
+
758
+ Parameters
759
+ ----------
760
+ data: a list of dict
761
+ the audio input dict list from 'get_audio_feature' method
762
+
763
+ Returns
764
+ ----------
765
+ audio_embed: torch.Tensor
766
+ a tensor of audio_embeds (N, D)
767
+
768
+ """
769
+ device = next(self.parameters()).device
770
+ input_dict = {}
771
+ keys = data[0].keys()
772
+ for k in keys:
773
+ input_dict[k] = torch.cat([d[k].unsqueeze(0) for d in data], dim=0).to(
774
+ device
775
+ )
776
+
777
+ audio_embeds = self.audio_projection(
778
+ self.encode_audio(input_dict, device=device)["embedding"]
779
+ )
780
+ audio_embeds = F.normalize(audio_embeds, dim=-1)
781
+
782
+ return audio_embeds
783
+
784
+ def audio_infer(self, audio, hopsize=None, device=None):
785
+ """Forward one audio and produce the audio embedding
786
+
787
+ Parameters
788
+ ----------
789
+ audio: (audio_length)
790
+ the time-domain audio input; note that it must be a single (unbatched) clip
791
+ hopsize: int
792
+ the overlap hopsize as the sliding window
793
+
794
+ Returns
795
+ ----------
796
+ output_dict: {
797
+ key: [n, (embedding_shape)] if "HTS-AT"
798
+ or
799
+ key: [(embedding_shape)] if "PANN"
800
+ }
801
+ the list of key values of the audio branch
802
+
803
+ """
804
+
805
+ assert not self.training, "the inference mode must be run at eval stage"
806
+ output_dict = {}
807
+ # PANN
808
+ if self.audio_cfg.model_type == "PANN":
809
+ audio_input = audio.unsqueeze(dim=0)
810
+ output_dict[key] = self.encode_audio(audio_input, device=device)[
811
+ key
812
+ ].squeeze(dim=0)
813
+ elif self.audio_cfg.model_type == "HTSAT":
814
+ # repeat
815
+ audio_len = len(audio)
816
+ k = self.audio_cfg.clip_samples // audio_len
817
+ if k > 1:
818
+ audio = audio.repeat(k)
819
+ audio_len = len(audio)
820
+
821
+ if hopsize is None:
822
+ hopsize = min(hopsize, audio_len)
823
+
824
+ if audio_len > self.audio_cfg.clip_samples:
825
+ audio_input = [
826
+ audio[pos : pos + self.audio_cfg.clip_samples].clone()
827
+ for pos in range(
828
+ 0, audio_len - self.audio_cfg.clip_samples, hopsize
829
+ )
830
+ ]
831
+ audio_input.append(audio[-self.audio_cfg.clip_samples :].clone())
832
+ audio_input = torch.stack(audio_input)
833
+ output_dict[key] = self.encode_audio(audio_input, device=device)[key]
834
+ else:
835
+ audio_input = audio.unsqueeze(dim=0)
836
+ output_dict[key] = self.encode_audio(audio_input, device=device)[
837
+ key
838
+ ].squeeze(dim=0)
839
+
840
+ return output_dict
841
+
842
+
843
+ def convert_weights_to_fp16(model: nn.Module):
844
+ """Convert applicable model parameters to fp16"""
845
+
846
+ def _convert_weights_to_fp16(l):
847
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
848
+ l.weight.data = l.weight.data.half()
849
+ if l.bias is not None:
850
+ l.bias.data = l.bias.data.half()
851
+
852
+ if isinstance(l, nn.MultiheadAttention):
853
+ for attr in [
854
+ *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]],
855
+ "in_proj_bias",
856
+ "bias_k",
857
+ "bias_v",
858
+ ]:
859
+ tensor = getattr(l, attr)
860
+ if tensor is not None:
861
+ tensor.data = tensor.data.half()
862
+
863
+ for name in ["text_projection", "proj"]:
864
+ if hasattr(l, name):
865
+ attr = getattr(l, name)
866
+ if attr is not None:
867
+ attr.data = attr.data.half()
868
+
869
+ model.apply(_convert_weights_to_fp16)
870
+
871
+
872
+ # Ignore the state dict of the vision part
873
+ def build_model_from_openai_state_dict(
874
+ state_dict: dict, model_cfg, enable_fusion: bool = False, fusion_type: str = "None"
875
+ ):
876
+
877
+ embed_dim = model_cfg["embed_dim"]
878
+ audio_cfg = model_cfg["audio_cfg"]
879
+ text_cfg = model_cfg["text_cfg"]
880
+ context_length = state_dict["positional_embedding"].shape[0]
881
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
882
+ transformer_width = state_dict["ln_final.weight"].shape[0]
883
+ transformer_heads = transformer_width // 64
884
+ transformer_layers = len(
885
+ set(
886
+ k.split(".")[2]
887
+ for k in state_dict
888
+ if k.startswith(f"transformer.resblocks")
889
+ )
890
+ )
891
+
892
+ audio_cfg = CLAPAudioCfp(**audio_cfg)
893
+ text_cfg = CLAPTextCfg(**text_cfg)
894
+
895
+ model = CLAP(
896
+ embed_dim,
897
+ audio_cfg=audio_cfg,
898
+ text_cfg=text_cfg,
899
+ quick_gelu=True, # OpenAI models were trained with QuickGELU
900
+ enable_fusion=enable_fusion,
901
+ fusion_type=fusion_type,
902
+ )
903
+ state_dict["logit_scale_a"] = state_dict["logit_scale"]
904
+ state_dict["logit_scale_t"] = state_dict["logit_scale"]
905
+ pop_keys = list(state_dict.keys())[::]
906
+ # pop the visual branch saved weights
907
+ for key in pop_keys:
908
+ if key.startswith("visual."):
909
+ state_dict.pop(key, None)
910
+
911
+ for key in ["logit_scale", "input_resolution", "context_length", "vocab_size"]:
912
+ state_dict.pop(key, None)
913
+
914
+ # not use fp16
915
+ # convert_weights_to_fp16(model)
916
+ model.load_state_dict(state_dict, strict=False)
917
+ return model.eval()
918
+
919
+
920
+ def trace_model(model, batch_size=256, device=torch.device("cpu")):
921
+ model.eval()
922
+ audio_length = model.audio_cfg.audio_length
923
+ example_audio = torch.ones((batch_size, audio_length), device=device)
924
+ example_text = torch.zeros(
925
+ (batch_size, model.context_length), dtype=torch.int, device=device
926
+ )
927
+ model = torch.jit.trace_module(
928
+ model,
929
+ inputs=dict(
930
+ forward=(example_audio, example_text),
931
+ encode_text=(example_text,),
932
+ encode_image=(example_audio,),
933
+ ),
934
+ )
935
+ model.audio_cfg.audio_length = audio_length # Question: what does this do?
936
+ return model
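As a usage note, the six values returned by CLAP.forward above are exactly what a CLIP-style contrastive objective consumes. The sketch below substitutes random normalized tensors for the real audio and text embeddings purely to show how the similarity logits and the symmetric cross-entropy would be assembled; it is an illustration under those assumptions, not code from the uploaded files.

import torch
import torch.nn.functional as F

batch, dim = 4, 512                                              # joint_embed_shape defaults to 512 above
audio_features = F.normalize(torch.randn(batch, dim), dim=-1)    # stand-in for the CLAP audio output
text_features = F.normalize(torch.randn(batch, dim), dim=-1)     # stand-in for the CLAP text output
logit_scale_a = torch.ones([]).exp()                             # plays the role of logit_scale_a.exp()

# Temperature-scaled pairwise cosine similarities; matching pairs sit on the diagonal.
logits_per_audio = logit_scale_a * audio_features @ text_features.t()
logits_per_text = logits_per_audio.t()

labels = torch.arange(batch)
contrastive_loss = 0.5 * (
    F.cross_entropy(logits_per_audio, labels) + F.cross_entropy(logits_per_text, labels)
)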
src/audioldm/clap/open_clip/model_configs/HTSAT-base.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "audio_cfg": {
4
+ "audio_length": 1024,
5
+ "clip_samples": 480000,
6
+ "mel_bins": 64,
7
+ "sample_rate": 48000,
8
+ "window_size": 1024,
9
+ "hop_size": 480,
10
+ "fmin": 50,
11
+ "fmax": 14000,
12
+ "class_num": 527,
13
+ "model_type": "HTSAT",
14
+ "model_name": "base"
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 512,
20
+ "heads": 8,
21
+ "layers": 12
22
+ }
23
+ }
src/audioldm/clap/open_clip/model_configs/HTSAT-large.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "embed_dim": 2048,
3
+ "audio_cfg": {
4
+ "audio_length": 1024,
5
+ "clip_samples": 480000,
6
+ "mel_bins": 64,
7
+ "sample_rate": 48000,
8
+ "window_size": 1024,
9
+ "hop_size": 480,
10
+ "fmin": 50,
11
+ "fmax": 14000,
12
+ "class_num": 527,
13
+ "model_type": "HTSAT",
14
+ "model_name": "large"
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 512,
20
+ "heads": 8,
21
+ "layers": 12
22
+ }
23
+ }
src/audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "audio_cfg": {
4
+ "audio_length": 1024,
5
+ "clip_samples": 480000,
6
+ "mel_bins": 64,
7
+ "sample_rate": 48000,
8
+ "window_size": 1536,
9
+ "hop_size": 480,
10
+ "fmin": 50,
11
+ "fmax": 14000,
12
+ "class_num": 527,
13
+ "model_type": "HTSAT",
14
+ "model_name": "tiny"
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 512,
20
+ "heads": 8,
21
+ "layers": 12
22
+ }
23
+ }
src/audioldm/clap/open_clip/model_configs/HTSAT-tiny.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "audio_cfg": {
4
+ "audio_length": 1024,
5
+ "clip_samples": 480000,
6
+ "mel_bins": 64,
7
+ "sample_rate": 48000,
8
+ "window_size": 1024,
9
+ "hop_size": 480,
10
+ "fmin": 50,
11
+ "fmax": 14000,
12
+ "class_num": 527,
13
+ "model_type": "HTSAT",
14
+ "model_name": "tiny"
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 512,
20
+ "heads": 8,
21
+ "layers": 12
22
+ }
23
+ }
src/audioldm/clap/open_clip/model_configs/PANN-10.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "audio_cfg": {
4
+ "audio_length": 1024,
5
+ "clip_samples": 480000,
6
+ "mel_bins": 64,
7
+ "sample_rate": 48000,
8
+ "window_size": 1024,
9
+ "hop_size": 480,
10
+ "fmin": 50,
11
+ "fmax": 14000,
12
+ "class_num": 527,
13
+ "model_type": "PANN",
14
+ "model_name": "Cnn10"
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 512,
20
+ "heads": 8,
21
+ "layers": 12
22
+ }
23
+ }
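All five configs above share the same layout and differ only in embed_dim, window_size, and the audio model_type/model_name. The following sketch shows how such a file maps onto the dataclasses in model.py; the local path is a placeholder, and note that the text_cfg blocks omit model_type, which CLAPTextCfg also requires, so the loading code has to supply it before constructing CLAP.

import json

# Placeholder path: any file under model_configs/ shares this structure.
with open("HTSAT-base.json") as f:
    cfg = json.load(f)

# model.py accepts these nested dicts directly and wraps them itself, roughly:
#   CLAP(embed_dim=cfg["embed_dim"], audio_cfg=cfg["audio_cfg"], text_cfg=cfg["text_cfg"], ...)
# where the audio_cfg keys line up one-to-one with the CLAPAudioCfp fields.
print(cfg["embed_dim"])            # 1024 here, 2048 for HTSAT-large, 768 for the tiny variants
print(cfg["audio_cfg"]["model_type"], cfg["audio_cfg"]["model_name"])
print(sorted(cfg["text_cfg"]))     # context_length, heads, layers, vocab_size, width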