CSH-1220 committed · Commit 55f08a9 · 1 Parent(s): aef267d

Files update

Changed files:
- app.py +5 -1
- download.py +9 -0
- pipeline/morph_pipeline_successed_ver1.py +101 -175
- utils/lora_utils_successed_ver1.py +12 -24
app.py
CHANGED

@@ -47,9 +47,13 @@ def morph_audio(audio_file1, audio_file2, prompt1, prompt2, negative_prompt1="Lo
     )
 
     # Collect the output file paths
-    output_paths =
+    output_paths = sorted(
+        [os.path.join(save_lora_dir, file) for file in os.listdir(save_lora_dir) if file.endswith(".wav")],
+        key=lambda x: int(os.path.splitext(os.path.basename(x))[0])
+    )
     return output_paths
 
+
 # Gradio interface function
 def interface(audio1, audio2, prompt1, prompt2):
     output_paths = morph_audio(audio1, audio2, prompt1, prompt2)
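Note on the new output_paths expression: it assumes every generated clip in save_lora_dir is named by an integer frame index (0.wav, 1.wav, ..., as written by the pipeline's scipy.io.wavfile.write calls later in this commit), so sorting by the integer stem returns the frames in morphing order. A minimal sketch of the same pattern, with a guard against stray non-numeric file names (the guard is added here for illustration and is not part of the commit):

import os

def collect_frames(directory):
    # Keep only integer-named .wav files ("0.wav", "1.wav", ...) and sort
    # them numerically so "10.wav" follows "9.wav" instead of "1.wav".
    wavs = [f for f in os.listdir(directory)
            if f.endswith(".wav") and os.path.splitext(f)[0].isdigit()]
    return [os.path.join(directory, f)
            for f in sorted(wavs, key=lambda f: int(os.path.splitext(f)[0]))]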
download.py
ADDED

@@ -0,0 +1,9 @@
+from huggingface_hub import hf_hub_download
+import torch
+
+model_path = hf_hub_download(
+    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
+    filename="pytorch_model.bin",
+    local_dir="./",
+    local_dir_use_symlinks=False
+)
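download.py pulls the AP-adapter checkpoint into the repository root, which is where the pipeline's hard-coded ap_adapter_path = 'pytorch_model.bin' (see the next file) expects to find it. A hedged sketch of verifying the download, assuming the file is a flat PyTorch state dict keyed by attention-processor parameter names:

import torch
from huggingface_hub import hf_hub_download

# Same call as download.py; local_dir places the file next to the code.
path = hf_hub_download(
    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
    filename="pytorch_model.bin",
    local_dir="./",
)
state_dict = torch.load(path, map_location="cpu")
# Expect keys such as "<attn_processor_name>.to_k_ip.weight".
print(f"{len(state_dict)} tensors, e.g. {next(iter(state_dict))}")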
pipeline/morph_pipeline_successed_ver1.py
CHANGED

@@ -49,64 +49,12 @@ if is_librosa_available():
 import librosa
 import warnings
 import matplotlib.pyplot as plt
-from huggingface_hub import hf_hub_download
 from .pipeline_audioldm2 import AudioLDM2Pipeline
 
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-pipeline_trained = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=torch.float32)
-pipeline_trained = pipeline_trained.to(DEVICE)
-layer_num = 0
-cross = [None, None, 768, 768, 1024, 1024, None, None]
-unet = pipeline_trained.unet
-
-
-attn_procs = {}
-for name in unet.attn_processors.keys():
-    cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-    if name.startswith("mid_block"):
-        hidden_size = unet.config.block_out_channels[-1]
-    elif name.startswith("up_blocks"):
-        block_id = int(name[len("up_blocks.")])
-        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-    elif name.startswith("down_blocks"):
-        block_id = int(name[len("down_blocks.")])
-        hidden_size = unet.config.block_out_channels[block_id]
-
-    if cross_attention_dim is None:
-        attn_procs[name] = AttnProcessor2_0()
-    else:
-        cross_attention_dim = cross[layer_num % 8]
-        layer_num += 1
-        if cross_attention_dim == 768:
-            attn_procs[name] = IPAttnProcessor2_0(
-                hidden_size=hidden_size,
-                name=name,
-                cross_attention_dim=cross_attention_dim,
-                scale=0.5,
-                num_tokens=8,
-                do_copy=False
-            ).to(DEVICE, dtype=torch.float32)
-        else:
-            attn_procs[name] = AttnProcessor2_0()
-
-adapter_weight = hf_hub_download(
-    repo_id="DennisHung/Pre-trained_AudioMAE_weights",
-    filename="pytorch_model.bin",
-)
-
-state_dict = torch.load(adapter_weight, map_location=DEVICE)
-for name, processor in attn_procs.items():
-    if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
-        weight_name_v = name + ".to_v_ip.weight"
-        weight_name_k = name + ".to_k_ip.weight"
-        processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
-        processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
-
-unet.set_attn_processor(attn_procs)
-unet.to(DEVICE, dtype=torch.float32)
-
 
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
 def visualize_mel_spectrogram(mel_spect_tensor, output_path=None):

@@ -125,10 +73,6 @@ def visualize_mel_spectrogram(mel_spect_tensor, output_path=None):
     plt.show()
 
 
-warnings.filterwarnings("ignore", category=FutureWarning)
-warnings.filterwarnings("ignore", category=UserWarning)
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
 class StoreProcessor():
     def __init__(self, original_processor, value_dict, name):
         self.original_processor = original_processor

@@ -140,12 +84,9 @@ class StoreProcessor():
     def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
         # Is self attention
         if encoder_hidden_states is None:
-            # Store hidden_states in value_dict under the key self.name
-            # If there is no encoder_hidden_states, this is a self-attention layer, so keep the input hidden_states in value_dict.
             # print(f'In StoreProcessor: {self.name} {self.id}')
             self.value_dict[self.name][self.id] = hidden_states.detach()
             self.id += 1
-        # Call the original processor to run the normal attention computation
         res = self.original_processor(attn, hidden_states, *args,
                                       encoder_hidden_states=encoder_hidden_states,
                                       attention_mask=attention_mask,

@@ -167,32 +108,26 @@ class LoadProcessor():
 
     def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
         # Is self attention
-        # Check whether this is self-attention
         if encoder_hidden_states is None:
-            # If the current index is below 10 * self.lamd, use the custom blending logic
             if self.id < 10 * self.lamd:
                 map0 = self.aud1_dict[self.name][self.id]
                 map1 = self.aud2_dict[self.name][self.id]
                 cross_map = self.beta * hidden_states + \
                     (1 - self.beta) * ((1 - self.alpha) * map0 + self.alpha * map1)
-                # Call the original processor, passing cross_map as encoder_hidden_states
                 res = self.original_processor(attn, hidden_states, *args,
                                               encoder_hidden_states=cross_map,
                                               attention_mask=attention_mask,
                                               **kwargs)
             else:
-                # Otherwise use the original encoder_hidden_states (which may be None)
                 res = self.original_processor(attn, hidden_states, *args,
                                               encoder_hidden_states=encoder_hidden_states,
                                               attention_mask=attention_mask,
                                               **kwargs)
 
             self.id += 1
-            # When the index reaches the length of self.aud1_dict[self.name], reset it to 0
             if self.id == len(self.aud1_dict[self.name]):
                 self.id = 0
         else:
-            # For cross-attention (encoder_hidden_states is not None), use the original processor directly
             res = self.original_processor(attn, hidden_states, *args,
                                           encoder_hidden_states=encoder_hidden_states,
                                           attention_mask=attention_mask,

@@ -908,7 +843,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank)
         # print("ta_kaldi_fbank.shape",ta_kaldi_fbank.shape)
         mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0)
-        model = AudioMAEConditionCTPoolRand().
+        model = AudioMAEConditionCTPoolRand().cuda()
         model.eval()
         LOA_embed = model(mel_spect_tensor, time_pool=time_pooling, freq_pool=freq_pooling)
         uncond_LOA_embed = model(torch.zeros_like(mel_spect_tensor), time_pool=time_pooling, freq_pool=freq_pooling)

@@ -932,16 +867,66 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
 
         return prompt_embeds, attention_mask, generated_prompt_embeds
 
+    def init_trained_pipeline(self, model_path, device, dtype, ap_scale, text_ap_scale):
+        pipeline_trained = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large", torch_dtype=dtype).to(device)
+        layer_num = 0
+        cross = [None, None, 768, 768, 1024, 1024, None, None]
+        unet = pipeline_trained.unet
+        attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]
+
+            if cross_attention_dim is None:
+                attn_procs[name] = AttnProcessor2_0()
+            else:
+                cross_attention_dim = cross[layer_num % 8]
+                layer_num += 1
+                if cross_attention_dim == 768:
+                    attn_procs[name] = IPAttnProcessor2_0(
+                        hidden_size=hidden_size,
+                        name=name,
+                        flag='trained',
+                        cross_attention_dim=cross_attention_dim,
+                        text_scale=text_ap_scale,
+                        scale=ap_scale,
+                        num_tokens=8,
+                        do_copy=False
+                    ).to(device, dtype=dtype)
+                else:
+                    attn_procs[name] = AttnProcessor2_0()
+
+        state_dict = torch.load(model_path, map_location=device)
+        for name, processor in attn_procs.items():
+            if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
+                weight_name_v = name + ".to_v_ip.weight"
+                weight_name_k = name + ".to_k_ip.weight"
+                if dtype == torch.float32:
+                    processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].float())
+                    processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].float())
+                elif dtype == torch.float16:
+                    processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
+                    processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
+        unet.set_attn_processor(attn_procs)
+        class _Wrapper(AttnProcsLayers):
+            def forward(self, *args, **kwargs):
+                return unet(*args, **kwargs)
+
+        unet = _Wrapper(unet.attn_processors)
+
+        return pipeline_trained
+
     @torch.no_grad()
     def aud2latent(self, audio_path, audio_length_in_s):
         DEVICE = torch.device(
             "cuda") if torch.cuda.is_available() else torch.device("cpu")
-
-        # waveform, sr = torchaudio.load(audio_path)
-        # fbank = torch.zeros((height, 64))
-        # ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank, num_mels=64)
-        # mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0).unsqueeze(0)
-
         mel_spect_tensor = wav_to_mel(audio_path, duration=audio_length_in_s).unsqueeze(0)
         output_path = audio_path.replace('.wav', '_fbank.png')
         visualize_mel_spectrogram(mel_spect_tensor, output_path)

@@ -954,7 +939,8 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
     @torch.no_grad()
     def ddim_inversion(self, start_latents, prompt_embeds, attention_mask, generated_prompt_embeds, guidance_scale,num_inference_steps):
         start_step = 0
-
+        # print(f"Scheduler timesteps: {self.scheduler.timesteps}")
+        num_inference_steps = min(num_inference_steps, int(max(self.scheduler.timesteps)))
         device = start_latents.device
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         start_latents *= self.scheduler.init_noise_sigma

@@ -973,9 +959,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
     def generate_morphing_prompt(self, prompt_1, prompt_2, alpha):
         closer_prompt = prompt_1 if alpha <= 0.5 else prompt_2
         prompt = (
-            f"
-            f"The sound is closer to '{closer_prompt}' with an interpolation factor of alpha={alpha:.2f}, "
-            f"where alpha=0 represents fully the {prompt_1} and alpha=1 represents fully {prompt_2}."
+            f"Jazz style music"
         )
         return prompt
 

@@ -983,8 +967,10 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
     def cal_latent(self,audio_length_in_s,time_pooling, freq_pooling,num_inference_steps, guidance_scale, aud_noise_1, aud_noise_2, prompt_1, prompt_2,
                    prompt_embeds_1, attention_mask_1, generated_prompt_embeds_1, prompt_embeds_2, attention_mask_2, generated_prompt_embeds_2,
                    alpha, original_processor,attn_processor_dict, use_morph_prompt, morphing_with_lora):
+        num_inference_steps = min(num_inference_steps, int(max(self.pipeline_trained.scheduler.timesteps)))
         latents = slerp(aud_noise_1, aud_noise_2, alpha, self.use_adain)
         if not use_morph_prompt:
+            print("Not using morphing prompt")
             max_length = max(prompt_embeds_1.shape[1], prompt_embeds_2.shape[1])
             if prompt_embeds_1.shape[1] < max_length:
                 pad_size = max_length - prompt_embeds_1.shape[1]

@@ -1033,13 +1019,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             # attention_mask = (attention_mask > 0.5).long()
 
             if morphing_with_lora:
-                pipeline_trained.unet.set_attn_processor(attn_processor_dict)
-            waveform = pipeline_trained(
+                self.pipeline_trained.unet.set_attn_processor(attn_processor_dict)
+            waveform = self.pipeline_trained(
                 time_pooling= time_pooling,
                 freq_pooling= freq_pooling,
                 latents = latents,
                 num_inference_steps= num_inference_steps,
-                guidance_scale= guidance_scale,
+                guidance_scale = guidance_scale,
                 num_waveforms_per_prompt= 1,
                 audio_length_in_s=audio_length_in_s,
                 prompt_embeds = prompt_embeds.chunk(2)[1],

@@ -1050,13 +1036,13 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 negative_attention_mask = attention_mask.chunk(2)[0],
             ).audios[0]
             if morphing_with_lora:
-                pipeline_trained.unet.set_attn_processor(original_processor)
+                self.pipeline_trained.unet.set_attn_processor(original_processor)
         else:
             latent_model_input = latents
             morphing_prompt = self.generate_morphing_prompt(prompt_1, prompt_2, alpha)
             if morphing_with_lora:
-                pipeline_trained.unet.set_attn_processor(attn_processor_dict)
-            waveform = pipeline_trained(
+                self.pipeline_trained.unet.set_attn_processor(attn_processor_dict)
+            waveform = self.pipeline_trained(
                 time_pooling= time_pooling,
                 freq_pooling= freq_pooling,
                 latents = latent_model_input,

@@ -1068,15 +1054,18 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 negative_prompt= 'Low quality',
             ).audios[0]
             if morphing_with_lora:
-                pipeline_trained.unet.set_attn_processor(original_processor)
+                self.pipeline_trained.unet.set_attn_processor(original_processor)
 
-        return waveform
+        return waveform, latents
 
     @torch.no_grad()
     def __call__(
         self,
+        dtype,
         audio_file = None,
         audio_file2 = None,
+        ap_scale = 1.0,
+        text_ap_scale = 1.0,
         save_lora_dir = "./lora",
         load_lora_path_1 = None,
        load_lora_path_2 = None,

@@ -1100,7 +1089,6 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         attn_beta=0,
         lamd=0.6,
         fix_lora=None,
-        save_intermediates=True,
         num_frames=50,
         max_new_tokens: Optional[int] = None,
         callback_steps: Optional[int] = 1,

@@ -1108,6 +1096,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
         morphing_with_lora=False,
         use_morph_prompt=False,
     ):
+        ap_adapter_path = 'pytorch_model.bin'
         device = "cuda" if torch.cuda.is_available() else "cpu"
         # 0. Load the pre-trained AP-adapter model
         layer_num = 0

@@ -1123,48 +1112,44 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             elif name.startswith("down_blocks"):
                 block_id = int(name[len("down_blocks.")])
                 hidden_size = self.unet.config.block_out_channels[block_id]
-
             if cross_attention_dim is None:
                 attn_procs[name] = AttnProcessor2_0()
             else:
                 cross_attention_dim = cross[layer_num % 8]
                 layer_num += 1
                 if cross_attention_dim == 768:
-                    attn_procs[name] = IPAttnProcessor2_0(
+                    attn_procs[name].scale = IPAttnProcessor2_0(
                         hidden_size=hidden_size,
                         name=name,
                         cross_attention_dim=cross_attention_dim,
-
+                        text_scale=100,
+                        scale=ap_scale,
                         num_tokens=8,
                         do_copy=False
-                    ).to(
+                    ).to(device, dtype=dtype)
                 else:
                     attn_procs[name] = AttnProcessor2_0()
-
-        state_dict = torch.load(adapter_weight, map_location=device)
+        state_dict = torch.load(ap_adapter_path, map_location=device)
         for name, processor in attn_procs.items():
             if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):
                 weight_name_v = name + ".to_v_ip.weight"
                 weight_name_k = name + ".to_k_ip.weight"
-
-
+                if dtype == torch.float32:
+                    processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].float())
+                    processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].float())
+                elif dtype == torch.float16:
+                    processor.to_v_ip.weight = torch.nn.Parameter(state_dict[weight_name_v].half())
+                    processor.to_k_ip.weight = torch.nn.Parameter(state_dict[weight_name_k].half())
         self.unet.set_attn_processor(attn_procs)
-        self.
-        self.unet = self.unet.to(DEVICE, dtype=torch.float32)
-        self.language_model = self.language_model.to(DEVICE, dtype=torch.float32)
-        self.projection_model = self.projection_model.to(DEVICE, dtype=torch.float32)
-        self.vocoder = self.vocoder.to(DEVICE, dtype=torch.float32)
-        self.text_encoder = self.text_encoder.to(DEVICE, dtype=torch.float32)
-        self.text_encoder_2 = self.text_encoder_2.to(DEVICE, dtype=torch.float32)
+        self.pipeline_trained = self.init_trained_pipeline(ap_adapter_path, device, dtype, ap_scale, text_ap_scale)
 
-
-
         # 1. Pre-check
         height, original_waveform_length = self.pre_check(audio_length_in_s, prompt_1, callback_steps, negative_prompt_1)
         _, _ = self.pre_check(audio_length_in_s, prompt_2, callback_steps, negative_prompt_2)
         # print(f"height: {height}, original_waveform_length: {original_waveform_length}") # height: 1000, original_waveform_length: 160000
 
         # # 2. Define call parameters
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         do_classifier_free_guidance = guidance_scale > 1.0
         self.use_lora = use_lora
         self.use_adain = use_adain

@@ -1178,7 +1163,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 weight_name = f"{output_path.split('/')[-1]}_lora_0.ckpt"
                 load_lora_path_1 = save_lora_dir + "/" + weight_name
                 if not os.path.exists(load_lora_path_1):
-                    train_lora(audio_file ,
+                    train_lora(audio_file, dtype, time_pooling ,freq_pooling ,prompt_1, negative_prompt_1, guidance_scale, save_lora_dir, self.tokenizer, self.tokenizer_2,
                                self.text_encoder, self.text_encoder_2, self.language_model, self.projection_model, self.vocoder,
                                self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
             print(f"Load from {load_lora_path_1}.")

@@ -1193,7 +1178,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 weight_name = f"{output_path.split('/')[-1]}_lora_1.ckpt"
                 load_lora_path_2 = save_lora_dir + "/" + weight_name
                 if not os.path.exists(load_lora_path_2):
-                    train_lora(audio_file2 ,
+                    train_lora(audio_file2, dtype,time_pooling ,freq_pooling ,prompt_2, negative_prompt_2, guidance_scale, save_lora_dir, self.tokenizer, self.tokenizer_2,
                                self.text_encoder, self.text_encoder_2, self.language_model, self.projection_model, self.vocoder,
                                self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
             print(f"Load from {load_lora_path_2}.")

@@ -1212,75 +1197,29 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
 
 
         # 4. Prepare latent variables
-        # For the first audio file
+        # ------- For the first audio file -------
         original_processor = list(self.unet.attn_processors.values())[0]
-
         if noisy_latent_with_lora:
             self.unet = load_lora(self.unet, lora_1, lora_2, 0)
-        # print(self.unet.attn_processors)
         # We directly use the latent representation of the audio file for VAE's decoder as the 1st ground truth
         audio_latent = self.aud2latent(audio_file, audio_length_in_s).to(device)
-        # mel_spectrogram = self.vae.decode(audio_latent).sample
-        # first_audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
-        # first_audio = first_audio[:, :original_waveform_length]
-        # torchaudio.save(f"{self.output_path}/{0:02d}_gt.wav", first_audio, 16000)
-
         # aud_noise_1 is the noisy latent representation of the audio file 1
-        aud_noise_1 = self.ddim_inversion(audio_latent, prompt_embeds_1, attention_mask_1, generated_prompt_embeds_1, guidance_scale, num_inference_steps)
-        # We use the pre-trained model to generate the audio file from the noisy latent representation
-        # waveform = pipeline_trained(
-        #     audio_file = audio_file,
-        #     time_pooling= 2,
-        #     freq_pooling= 2,
-        #     prompt= prompt_1,
-        #     latents = aud_noise_1,
-        #     negative_prompt= negative_prompt_1,
-        #     num_inference_steps= 100,
-        #     guidance_scale= guidance_scale,
-        #     num_waveforms_per_prompt= 1,
-        #     audio_length_in_s=10,
-        # ).audios
-        # file_path = os.path.join(self.output_path, f"{0:02d}_gt2.wav")
-        # scipy.io.wavfile.write(file_path, rate=16000, data=waveform[0])
-
+        aud_noise_1 = self.ddim_inversion(audio_latent, prompt_embeds_1, attention_mask_1, generated_prompt_embeds_1, guidance_scale, num_inference_steps = num_inference_steps)
         # After reconstructed the audio file 1, we set the original processor back
         if noisy_latent_with_lora:
             self.unet.set_attn_processor(original_processor)
-        # print(self.unet.attn_processors)
 
-        # For the second audio file
+        # ------- For the second audio file -------
         if noisy_latent_with_lora:
             self.unet = load_lora(self.unet, lora_1, lora_2, 1)
-        # print(self.unet.attn_processors)
         # We directly use the latent representation of the audio file for VAE's decoder as the 1st ground truth
         audio_latent = self.aud2latent(audio_file2, audio_length_in_s)
-        # mel_spectrogram = self.vae.decode(audio_latent).sample
-        # last_audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
-        # last_audio = last_audio[:, :original_waveform_length]
-        # torchaudio.save(f"{self.output_path}/{num_frames-1:02d}_gt.wav", last_audio, 16000)
         # aud_noise_2 is the noisy latent representation of the audio file 2
-        aud_noise_2 = self.ddim_inversion(audio_latent, prompt_embeds_2, attention_mask_2, generated_prompt_embeds_2, guidance_scale, num_inference_steps)
-        # waveform = pipeline_trained(
-        #     audio_file = audio_file2,
-        #     time_pooling= 2,
-        #     freq_pooling= 2,
-        #     prompt= prompt_2,
-        #     latents = aud_noise_2,
-        #     negative_prompt= negative_prompt_2,
-        #     num_inference_steps= 100,
-        #     guidance_scale= guidance_scale,
-        #     num_waveforms_per_prompt= 1,
-        #     audio_length_in_s=10,
-        # ).audios
-        # file_path = os.path.join(self.output_path, f"{num_frames-1:02d}_gt2.wav")
-        # scipy.io.wavfile.write(file_path, rate=16000, data=waveform[0])
+        aud_noise_2 = self.ddim_inversion(audio_latent, prompt_embeds_2, attention_mask_2, generated_prompt_embeds_2, guidance_scale, num_inference_steps = num_inference_steps)
         if noisy_latent_with_lora:
             self.unet.set_attn_processor(original_processor)
-        # print(self.unet.attn_processors)
         # After reconstructed the audio file 1, we set the original processor back
         original_processor = list(self.unet.attn_processors.values())[0]
-
-
         def morph(alpha_list, desc):
             audios = []
             # if attn_beta is not None:

@@ -1288,11 +1227,9 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                 self.unet = load_lora(
                     self.unet, lora_1, lora_2, 0 if fix_lora is None else fix_lora)
             attn_processor_dict = {}
-            # print(self.unet.attn_processors)
             for k in self.unet.attn_processors.keys():
                 # print(k)
                 if do_replace_attn(k):
-                    # print(f"Since the key starts with *up*, we replace the processor with StoreProcessor.")
                     if self.use_lora:
                         attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
                                                                 self.aud1_dict, k)

@@ -1300,16 +1237,8 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                         attn_processor_dict[k] = StoreProcessor(original_processor,
                                                                 self.aud1_dict, k)
                 else:
-                    attn_processor_dict[k] = self.unet.attn_processors[k]
-
-
-            # print(attn_processor_dict)
-
-            # print(self.unet.attn_processors)
-            # self.unet.set_attn_processor(attn_processor_dict)
-            # print(self.unet.attn_processors)
-
-            first_audio = self.cal_latent(
+                    attn_processor_dict[k] = self.unet.attn_processors[k]
+            first_audio, first_latents = self.cal_latent(
                 audio_length_in_s,
                 time_pooling,
                 freq_pooling,

@@ -1335,14 +1264,12 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             self.unet.set_attn_processor(original_processor)
             file_path = os.path.join(self.output_path, f"{0:02d}.wav")
             scipy.io.wavfile.write(file_path, rate=16000, data=first_audio)
-
             if self.use_lora:
                 self.unet = load_lora(
                     self.unet, lora_1, lora_2, 1 if fix_lora is None else fix_lora)
             attn_processor_dict = {}
             for k in self.unet.attn_processors.keys():
                 if do_replace_attn(k):
-                    # print(f"Since the key starts with *up*, we replace the processor with StoreProcessor.")
                     if self.use_lora:
                         attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
                                                                 self.aud2_dict, k)

@@ -1351,8 +1278,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                                                                 self.aud2_dict, k)
                 else:
                     attn_processor_dict[k] = self.unet.attn_processors[k]
-
-            last_audio = self.cal_latent(
+            last_audio, last_latents = self.cal_latent(
                 audio_length_in_s,
                 time_pooling,
                 freq_pooling,

@@ -1376,6 +1302,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
             )
             file_path = os.path.join(self.output_path, f"{num_frames-1:02d}.wav")
             scipy.io.wavfile.write(file_path, rate=16000, data=last_audio)
+
             self.unet.set_attn_processor(original_processor)
 
             for i in tqdm(range(1, num_frames - 1), desc=desc):

@@ -1395,8 +1322,7 @@ class AudioLDM2MorphPipeline(DiffusionPipeline,TextualInversionLoaderMixin):
                                               original_processor, k, self.aud1_dict, self.aud2_dict, alpha, attn_beta, lamd)
                 else:
                     attn_processor_dict[k] = self.unet.attn_processors[k]
-
-                audio = self.cal_latent(
+                audio, latents = self.cal_latent(
                     audio_length_in_s,
                     time_pooling,
                     freq_pooling,
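Several hunks above repeat the same dtype-dependent copy of the adapter's to_k_ip/to_v_ip projections (.float() for torch.float32, .half() for torch.float16), both in init_trained_pipeline and in __call__. The sketch below expresses that recurring pattern as a single helper, using Tensor.to(dtype) instead of the explicit branches; it is illustrative only, not part of the commit, and it assumes each IP-attention processor exposes to_k_ip/to_v_ip linear layers as in the hunks above.

import torch

def load_ip_adapter_weights(attn_procs, state_dict, dtype):
    # Copy the AP-adapter key/value projections into every IP-attention
    # processor, cast to the pipeline's working dtype.
    for name, processor in attn_procs.items():
        if hasattr(processor, "to_v_ip") and hasattr(processor, "to_k_ip"):
            v = state_dict[name + ".to_v_ip.weight"].to(dtype)
            k = state_dict[name + ".to_k_ip.weight"].to(dtype)
            processor.to_v_ip.weight = torch.nn.Parameter(v)
            processor.to_k_ip.weight = torch.nn.Parameter(k)
    return attn_procs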
utils/lora_utils_successed_ver1.py
CHANGED

@@ -449,7 +449,7 @@ def plot_loss(loss_history, loss_plot_path, lora_steps):
 # lora_steps: number of lora training step
 # lora_lr: learning rate of lora training
 # lora_rank: the rank of lora
-def train_lora(audio_path ,
+def train_lora(audio_path ,dtype ,time_pooling ,freq_pooling ,prompt, negative_prompt, guidance_scale, save_lora_dir, tokenizer=None, tokenizer_2=None,
                text_encoder=None, text_encoder_2=None, GPT2=None, projection_model=None, vocoder=None,
                vae=None, unet=None, noise_scheduler=None, lora_steps=200, lora_lr=2e-4, lora_rank=16, weight_name=None, safe_serialization=False, progress=tqdm):
     time_pooling = time_pooling

@@ -534,7 +534,7 @@ def train_lora(audio_path ,height ,time_pooling ,freq_pooling ,prompt, negative_
                 scale=1.0,
                 num_tokens=8,
                 do_copy = do_copy
-            ).to(device, dtype=
+            ).to(device, dtype=dtype)
         else:
             unet_lora_attn_procs[name] = AttnProcessor2_0()
     unet.set_attn_processor(unet_lora_attn_procs)

@@ -580,7 +580,7 @@ def train_lora(audio_path ,height ,time_pooling ,freq_pooling ,prompt, negative_
     fbank = torch.zeros((1024, 128))
     ta_kaldi_fbank = extract_kaldi_fbank_feature(waveform, sr, fbank)
     mel_spect_tensor = ta_kaldi_fbank.unsqueeze(0)
-    model = AudioMAEConditionCTPoolRand().to(device).to(dtype=
+    model = AudioMAEConditionCTPoolRand().to(device).to(dtype=dtype)
     model.eval()
     mel_spect_tensor = mel_spect_tensor.to(device, dtype=next(model.parameters()).dtype)
     LOA_embed = model(mel_spect_tensor, time_pool=time_pooling, freq_pool=freq_pooling)

@@ -599,24 +599,6 @@ def train_lora(audio_path ,height ,time_pooling ,freq_pooling ,prompt, negative_
     generated_prompt_embeds = torch.cat([uncond, cond], dim=0)
     model_dtype = next(unet.parameters()).dtype
     generated_prompt_embeds = generated_prompt_embeds.to(model_dtype)
-
-    # num_channels_latents = unet.config.in_channels
-    # batch_size = 1
-    # num_waveforms_per_prompt = 1
-    # generator = None
-    # latents = None
-    # latents = prepare_latents(
-    #     vae,
-    #     vocoder,
-    #     noise_scheduler,
-    #     batch_size * num_waveforms_per_prompt,
-    #     num_channels_latents,
-    #     height,
-    #     prompt_embeds.dtype,
-    #     device,
-    #     generator,
-    #     latents,
-    # )
 
     loss_history = []
     if not os.path.exists(save_lora_dir):

@@ -683,7 +665,7 @@ def train_lora(audio_path ,height ,time_pooling ,freq_pooling ,prompt, negative_
         safe_serialization=safe_serialization
     )
 
-def load_lora(unet, lora_0, lora_1, alpha):
+def load_lora(unet, lora_0, lora_1, alpha, dtype):
     attn_procs = unet.attn_processors
     for name, processor in attn_procs.items():
         if hasattr(processor, 'to_v_ip') or hasattr(processor, 'to_k_ip'):

@@ -691,10 +673,16 @@ def load_lora(unet, lora_0, lora_1, alpha):
             weight_name_k = name + ".to_k_ip.weight"
             if weight_name_v in lora_0 and weight_name_v in lora_1:
                 v_weight = (1 - alpha) * lora_0[weight_name_v] + alpha * lora_1[weight_name_v]
-
+                if dtype == torch.float32:
+                    processor.to_v_ip.weight = torch.nn.Parameter(v_weight.float())
+                elif dtype == torch.float16:
+                    processor.to_v_ip.weight = torch.nn.Parameter(v_weight.half())
 
             if weight_name_k in lora_0 and weight_name_k in lora_1:
                 k_weight = (1 - alpha) * lora_0[weight_name_k] + alpha * lora_1[weight_name_k]
-
+                if dtype == torch.float32:
+                    processor.to_k_ip.weight = torch.nn.Parameter(k_weight.float())
+                elif dtype == torch.float16:
+                    processor.to_k_ip.weight = torch.nn.Parameter(k_weight.half())
     unet.set_attn_processor(attn_procs)
     return unet
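load_lora now also receives the working dtype and blends the two per-audio adapters by linear interpolation, (1 - alpha) * lora_0 + alpha * lora_1, before writing the result back into the attention processors. A standalone sketch of that blending step on plain state dicts (illustrative, not part of the commit; the checkpoint paths in the usage comment are hypothetical, and key names follow the to_*_ip.weight convention used above):

import torch

def blend_adapters(lora_0, lora_1, alpha, dtype=torch.float32):
    # Interpolate every tensor the two adapters share: alpha=0 keeps
    # adapter 0, alpha=1 keeps adapter 1, values in between morph.
    return {
        key: ((1 - alpha) * lora_0[key] + alpha * lora_1[key]).to(dtype)
        for key in lora_0.keys() & lora_1.keys()
    }

# Usage (hypothetical paths):
# lora_0 = torch.load("lora/clip_a_lora_0.ckpt", map_location="cpu")
# lora_1 = torch.load("lora/clip_b_lora_1.ckpt", map_location="cpu")
# halfway = blend_adapters(lora_0, lora_1, alpha=0.5)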