Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .ipynb_checkpoints/command_train-checkpoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Launch train_single.py through `accelerate launch` with GPUs 0 and 1 visible
# (CUDA_VISIBLE_DEVICES=0,1).
# The script, pretrained model, and dataset paths are absolute (/root/...);
# the output and validation paths start with ./ and therefore resolve against
# the directory this command is run from.
# NOTE(review): flag meanings are defined by train_single.py, which is not
# shown here -- e.g. --resume_from_checkpoint="latest" presumably resumes from
# the most recent checkpoint under --output_dir; confirm against the script.
CUDA_VISIBLE_DEVICES=0,1 accelerate launch /root/StableAnimator_Music/train_single.py \
--pretrained_model_name_or_path="/root/StableAnimator_Music/checkpoints/stable-video-diffusion-img2vid-xt" \
--output_dir="./checkpoints/Animation" \
--data_root_path="/root/aist_hdf5/rec" \
--data_path="/root/aist_hdf5/full_list.txt" \
--dataset_width=512 \
--dataset_height=512 \
--validation_image_folder="./validation/ground_truth" \
--validation_control_folder="./validation/poses" \
--validation_image="./validation/reference.png" \
--num_workers=2 \
--lr_warmup_steps=10 \
--sample_n_frames=16 \
--learning_rate=1e-5 \
--per_gpu_batch_size=1 \
--num_train_epochs=10000 \
--mixed_precision="fp16" \
--gradient_accumulation_steps=1 \
--checkpointing_steps=2000 \
--validation_steps=500 \
--checkpoints_total_limit=5000 \
--resume_from_checkpoint="latest" \
--max_train_steps=30000
3 changes: 2 additions & 1 deletion DWPose/skeleton_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ def get_video_pose(video_path, ref_image_path, poses_folder_path=None):
detected_poses = []
files = os.listdir(video_path)
png_files = [f for f in files if f.endswith('.png')]
png_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
png_files.sort(key=lambda x: int(x.split('.')[0]))
# png_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
for sub_name in png_files:
sub_driven_image_path = os.path.join(video_path, sub_name)
driven_image = cv2.imread(sub_driven_image_path)
Expand Down
296 changes: 1 addition & 295 deletions README.md

Large diffs are not rendered by default.

348 changes: 348 additions & 0 deletions animation/dataset/animation_new_dataset.py

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions animation/modules/music_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pathlib import Path

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
from diffusers.models.modeling_utils import ModelMixin

class MusicEncoder(ModelMixin):
    """Encode per-frame music features into multi-channel spatial maps.

    Each feature vector of size ``indim`` is linearly projected to ``hw * hw``
    values, reshaped into a single-channel ``hw x hw`` map, and expanded to
    ``noise_latent_channels`` channels by a 3x3 convolution.

    Args:
        indim: Size of each input feature vector.
        hw: Height and width of the square map produced for each frame.
        noise_latent_channels: Number of output channels of the expansion
            convolution.
    """

    def __init__(self, indim=4800, hw=64, noise_latent_channels=320):
        super().__init__()
        self.hw = hw
        self.noise_latent_channels = noise_latent_channels
        self.latent_dim = hw * hw

        # Projection of one feature vector onto a flattened hw x hw map.
        self.net = nn.Linear(indim, self.latent_dim)

        # Channel expansion: 1 -> noise_latent_channels.
        self.expand = nn.Conv2d(1, noise_latent_channels, kernel_size=3, padding=1)

    def forward(self, x):
        """Map ``x`` of shape (B, T, indim) to (B*T, noise_latent_channels, hw, hw)."""
        batch, frames, feat = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed
        # tensors, are accepted as well.
        z = self.net(x.reshape(batch * frames, feat))      # (B*T, hw*hw)
        z = z.view(batch * frames, 1, self.hw, self.hw)    # (B*T, 1, hw, hw)
        return self.expand(z)                              # (B*T, C, hw, hw)

    @classmethod
    def from_pretrained(cls, pretrained_model_path, latent_dim=1024):
        """Load music encoder weights saved as a plain ``state_dict`` file.

        The constructor arguments are inferred from the checkpoint itself:
        ``net.weight`` has shape (hw*hw, indim) and ``expand.weight`` has
        ``noise_latent_channels`` as its first dimension.

        NOTE(review): this shadows the base class ``from_pretrained`` with a
        different signature; confirm no caller relies on the base behaviour.

        Args:
            pretrained_model_path: Path to a file written with ``torch.save``
                containing this module's ``state_dict``.
            latent_dim: Desired size of the projected latent. When it differs
                from the checkpoint's ``hw * hw``, a freshly initialised
                ``projection`` linear layer is attached to the model.

        Returns:
            A ``MusicEncoder`` with the checkpoint weights loaded.

        Raises:
            FileNotFoundError: If ``pretrained_model_path`` does not exist.
            KeyError: If the checkpoint lacks ``net.weight`` or
                ``expand.weight``.
            ValueError: If the checkpoint's latent size is not a perfect
                square and therefore cannot be reshaped to ``hw x hw``.
        """
        if not Path(pretrained_model_path).exists():
            raise FileNotFoundError(f"No model file at {pretrained_model_path}")

        print(f"Loading MusicEncoder from {pretrained_model_path}")

        # NOTE(security): torch.load unpickles the file; only load
        # checkpoints from trusted sources.
        state_dict = torch.load(pretrained_model_path, map_location="cpu")

        missing = [key for key in ("net.weight", "expand.weight") if key not in state_dict]
        if missing:
            raise KeyError(
                f"Checkpoint {pretrained_model_path} is missing expected keys: {missing}"
            )

        # Recover the architecture of the pretrained model from its weights.
        pretrained_latent_dim, indim = state_dict["net.weight"].shape
        noise_latent_channels = state_dict["expand.weight"].shape[0]
        hw = int(round(pretrained_latent_dim ** 0.5))
        if hw * hw != pretrained_latent_dim:
            raise ValueError(
                f"Checkpoint latent size {pretrained_latent_dim} is not a perfect square"
            )

        model = cls(indim=indim, hw=hw, noise_latent_channels=noise_latent_channels)
        model.load_state_dict(state_dict, strict=True)

        # Attach a projection layer when the requested latent size differs.
        if pretrained_latent_dim != latent_dim:
            print(f"Adding projection layer: {pretrained_latent_dim} -> {latent_dim}")
            model.projection = nn.Linear(pretrained_latent_dim, latent_dim)
            model.latent_dim = latent_dim

        return model

    def forward_with_projection(self, x):
        """Run ``forward`` and, if a projection layer is attached, apply it.

        Without a ``projection`` attribute the output of ``forward`` is
        returned unchanged. With one, every ``hw x hw`` map is flattened to
        ``hw * hw`` values (the projection's input size) before projecting.

        NOTE(review): the projected result is returned as
        (B, T, noise_latent_channels * latent_dim); confirm this layout is
        what downstream consumers expect.
        """
        z = self.forward(x)  # (B*T, C, hw, hw)
        projection = getattr(self, "projection", None)
        if projection is None:
            return z
        z = projection(z.flatten(start_dim=2))          # (B*T, C, latent_dim)
        return z.reshape(x.shape[0], x.shape[1], -1)    # (B, T, C*latent_dim)
1 change: 1 addition & 0 deletions animation/modules/new.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion animation/modules/unet.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,9 @@ def set_default_attn_processor(self):

self.set_attn_processor(processor)

def _set_gradient_checkpointing(self, module, value=False):
def _set_gradient_checkpointing(self, module, value=False, enable=None):
    """Set the ``gradient_checkpointing`` flag on ``module``.

    Args:
        module: Object whose ``gradient_checkpointing`` attribute is updated.
        value: Flag to assign when ``enable`` is not given. It is only
            applied to modules that already have the attribute.
        enable: Optional override. When not ``None`` its truthiness is
            assigned to ``module.gradient_checkpointing`` unconditionally
            and ``value`` is ignored.
    """
    if enable is not None:
        # Honour the explicit request and stop here: falling through would
        # let the default ``value=False`` overwrite what was just set.
        module.gradient_checkpointing = bool(enable)
        return
    if hasattr(module, "gradient_checkpointing"):
        module.gradient_checkpointing = value

Expand Down
Loading