Francis-Rings · doehyeonlee · Sep 29, 2025 · Sep 30, 2025 · Sep 30, 2025 · Oct 2, 2025
diff --git a/.ipynb_checkpoints/command_train-checkpoint.sh b/.ipynb_checkpoints/command_train-checkpoint.sh
@@ -0,0 +1,23 @@
+# Launch train_single.py through accelerate with GPUs 0 and 1 visible.
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch /root/StableAnimator_Music/train_single.py \
+    --pretrained_model_name_or_path=/root/StableAnimator_Music/checkpoints/stable-video-diffusion-img2vid-xt \
+    --output_dir=./checkpoints/Animation \
+    --data_root_path=/root/aist_hdf5/rec \
+    --data_path=/root/aist_hdf5/full_list.txt \
+    --dataset_width=512 --dataset_height=512 \
+    --validation_image_folder=./validation/ground_truth \
+    --validation_control_folder=./validation/poses \
+    --validation_image=./validation/reference.png \
+    --num_workers=2 \
+    --lr_warmup_steps=10 \
+    --sample_n_frames=16 \
+    --learning_rate=1e-5 \
+    --per_gpu_batch_size=1 \
+    --num_train_epochs=10000 \
+    --mixed_precision=fp16 \
+    --gradient_accumulation_steps=1 \
+    --checkpointing_steps=2000 \
+    --validation_steps=500 \
+    --checkpoints_total_limit=5000 \
+    --resume_from_checkpoint=latest \
+    --max_train_steps=30000
diff --git a/DWPose/skeleton_extraction.py b/DWPose/skeleton_extraction.py
@@ -149,7 +149,8 @@ def get_video_pose(video_path, ref_image_path, poses_folder_path=None):
     detected_poses = []
     files = os.listdir(video_path)
     png_files = [f for f in files if f.endswith('.png')]
-    png_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
+    png_files.sort(key=lambda x: int(x.split('.')[0]))
+#    png_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
     for sub_name in png_files:
         sub_driven_image_path = os.path.join(video_path, sub_name)
         driven_image = cv2.imread(sub_driven_image_path)

diff --git a/README.md b/README.md
diff --git a/animation/dataset/animation_new_dataset.py b/animation/dataset/animation_new_dataset.py
diff --git a/animation/modules/music_encoder.py b/animation/modules/music_encoder.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+import einops
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+from diffusers.models.modeling_utils import ModelMixin
+
+class MusicEncoder(ModelMixin):
+    """Encode per-frame music features into per-frame spatial feature maps."""
+
+    def __init__(self, indim=4800, hw=64, noise_latent_channels=320):
+        super().__init__()
+        self.hw = hw
+        self.noise_latent_channels = noise_latent_channels
+        self.latent_dim = hw * hw
+        # Project each frame's feature vector onto a flattened hw x hw grid.
+        self.net = nn.Linear(indim, self.latent_dim)
+        # Expand the single-channel grid to noise_latent_channels channels.
+        self.expand = nn.Conv2d(1, noise_latent_channels, kernel_size=3, padding=1)
+
+    def forward(self, x):
+        """Map x of shape (B, T, indim) to (B*T, noise_latent_channels, hw, hw)."""
+        B, T, F = x.shape
+        z = self.net(x.reshape(B * T, F))         # reshape also accepts non-contiguous x
+        z = z.view(B * T, 1, self.hw, self.hw)    # (B*T, 1, hw, hw)
+        return self.expand(z)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path, latent_dim=1024):
+        """Load a saved state dict, sizing the encoder from the stored weight shapes.
+
+        Raises FileNotFoundError for a missing file and ValueError for a non-square latent.
+        If ``latent_dim`` differs from the stored size, an untrained ``projection`` is added.
+        """
+        if not Path(pretrained_model_path).exists():
+            raise FileNotFoundError(f"No model file at {pretrained_model_path}")
+        print(f"Loading MusicEncoder from {pretrained_model_path}")
+        # NOTE(review): torch.load unpickles data; only load trusted checkpoints.
+        state_dict = torch.load(pretrained_model_path, map_location="cpu")
+        # ``net`` is a single nn.Linear, so its weight is (hw*hw, indim) under 'net.weight'.
+        pretrained_latent_dim, indim = state_dict['net.weight'].shape
+        hw = int(round(pretrained_latent_dim ** 0.5))
+        if hw * hw != pretrained_latent_dim:
+            raise ValueError(f"Latent size {pretrained_latent_dim} is not a perfect square")
+        channels = state_dict['expand.weight'].shape[0]
+        model = cls(indim=indim, hw=hw, noise_latent_channels=channels)
+        model.load_state_dict(state_dict, strict=True)
+        if pretrained_latent_dim != latent_dim:
+            print(f"Adding projection layer: {pretrained_latent_dim} -> {latent_dim}")
+            model.projection = nn.Linear(pretrained_latent_dim, latent_dim)
+            model.latent_dim = latent_dim
+        return model
+
+    def forward_with_projection(self, x):
+        """Return projected latents (B, T, latent_dim); plain forward(x) if no projection."""
+        if not hasattr(self, 'projection'):
+            return self.forward(x)
+        return self.projection(self.net(x))  # nn.Linear maps the last dim only
diff --git a/animation/modules/new.py b/animation/modules/new.py
diff --git a/animation/modules/unet.py b/animation/modules/unet.py
@@ -329,7 +329,9 @@ def set_default_attn_processor(self):
 
         self.set_attn_processor(processor)
 
-    def _set_gradient_checkpointing(self, module, value=False):
+    def _set_gradient_checkpointing(self, module, value=False, enable=None):
+        if enable is not None:
+            module.gradient_checkpointing = True
         if hasattr(module, "gradient_checkpointing"):
             module.gradient_checkpointing = value