Implement Wan2.2

This commit is contained in:
N
2025-07-31 02:30:20 -07:00
parent 4b2a0df237
commit 3b25af07d3
30 changed files with 6217 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from .fm_solvers import (
FlowDPMSolverMultistepScheduler,
get_sampling_sigmas,
retrieve_timesteps,
)
from .fm_solvers_unipc import FlowUniPCMultistepScheduler
__all__ = [
'get_sampling_sigmas', 'retrieve_timesteps',
'FlowDPMSolverMultistepScheduler', 'FlowUniPCMultistepScheduler'
]
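# Editorial note (not part of the original file): a minimal, hypothetical
# sketch of how a caller might combine these exports; `model(x, t)` and the
# initial latent `x` are placeholders.
#
#   scheduler = FlowDPMSolverMultistepScheduler(shift=1.0)
#   sigmas = get_sampling_sigmas(sampling_steps=30, shift=5.0)
#   timesteps, num_steps = retrieve_timesteps(scheduler, sigmas=sigmas)
#   for t in timesteps:
#       x = scheduler.step(model(x, t), t, x).prev_sample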

View File

@@ -0,0 +1,562 @@
import math
from typing import List, Optional, Tuple, Union
import mlx.core as mx
import numpy as np
def get_sampling_sigmas(sampling_steps, shift):
sigma = np.linspace(1, 0, sampling_steps + 1)[:sampling_steps]
sigma = (shift * sigma / (1 + (shift - 1) * sigma))
return sigma
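# Editorial worked example (not part of the original file): with
# sampling_steps=4 the base schedule is [1.0, 0.75, 0.5, 0.25]; applying
# shift=5 via sigma' = shift*sigma / (1 + (shift - 1)*sigma) gives roughly
# [1.0, 0.9375, 0.8333, 0.625], i.e. a larger shift keeps sigmas near 1 for
# longer, so more steps are spent in the high-noise regime.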
def retrieve_timesteps(
scheduler,
num_inference_steps=None,
device=None,
timesteps=None,
sigmas=None,
**kwargs,
):
if timesteps is not None and sigmas is not None:
raise ValueError(
"Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
)
if timesteps is not None:
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
elif sigmas is not None:
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
class SchedulerOutput:
"""Output class for scheduler step results."""
def __init__(self, prev_sample: mx.array):
self.prev_sample = prev_sample
class FlowDPMSolverMultistepScheduler:
"""
MLX implementation of FlowDPMSolverMultistepScheduler.
A fast dedicated high-order solver for diffusion ODEs.
"""
order = 1
def __init__(
self,
num_train_timesteps: int = 1000,
solver_order: int = 2,
prediction_type: str = "flow_prediction",
shift: Optional[float] = 1.0,
use_dynamic_shifting: bool = False,
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
sample_max_value: float = 1.0,
algorithm_type: str = "dpmsolver++",
solver_type: str = "midpoint",
lower_order_final: bool = True,
euler_at_final: bool = False,
final_sigmas_type: Optional[str] = "zero",
lambda_min_clipped: float = -float("inf"),
variance_type: Optional[str] = None,
invert_sigmas: bool = False,
):
# Store configuration
self.config = {
'num_train_timesteps': num_train_timesteps,
'solver_order': solver_order,
'prediction_type': prediction_type,
'shift': shift,
'use_dynamic_shifting': use_dynamic_shifting,
'thresholding': thresholding,
'dynamic_thresholding_ratio': dynamic_thresholding_ratio,
'sample_max_value': sample_max_value,
'algorithm_type': algorithm_type,
'solver_type': solver_type,
'lower_order_final': lower_order_final,
'euler_at_final': euler_at_final,
'final_sigmas_type': final_sigmas_type,
'lambda_min_clipped': lambda_min_clipped,
'variance_type': variance_type,
'invert_sigmas': invert_sigmas,
}
# Validate algorithm type
if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
if algorithm_type == "deis":
self.config['algorithm_type'] = "dpmsolver++"
else:
raise NotImplementedError(f"{algorithm_type} is not implemented")
# Validate solver type
if solver_type not in ["midpoint", "heun"]:
if solver_type in ["logrho", "bh1", "bh2"]:
self.config['solver_type'] = "midpoint"
else:
raise NotImplementedError(f"{solver_type} is not implemented")
# Initialize scheduling
self.num_inference_steps = None
alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
sigmas = 1.0 - alphas
sigmas = mx.array(sigmas, dtype=mx.float32)
if not use_dynamic_shifting:
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
self.sigmas = sigmas
self.timesteps = sigmas * num_train_timesteps
self.model_outputs = [None] * solver_order
self.lower_order_nums = 0
self._step_index = None
self._begin_index = None
self.sigma_min = float(self.sigmas[-1])
self.sigma_max = float(self.sigmas[0])
@property
def step_index(self):
return self._step_index
@property
def begin_index(self):
return self._begin_index
def set_begin_index(self, begin_index: int = 0):
self._begin_index = begin_index
def set_timesteps(
self,
num_inference_steps: Union[int, None] = None,
device: Union[str, None] = None,
sigmas: Optional[List[float]] = None,
mu: Optional[Union[float, None]] = None,
shift: Optional[Union[float, None]] = None,
):
"""Sets the discrete timesteps used for the diffusion chain."""
if self.config['use_dynamic_shifting'] and mu is None:
raise ValueError(
"you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`"
)
if sigmas is None:
sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1]
if self.config['use_dynamic_shifting']:
sigmas = self.time_shift(mu, 1.0, sigmas)
else:
if shift is None:
shift = self.config['shift']
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
if self.config['final_sigmas_type'] == "sigma_min":
sigma_last = self.sigma_min
elif self.config['final_sigmas_type'] == "zero":
sigma_last = 0
else:
raise ValueError(
f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config['final_sigmas_type']}"
)
timesteps = sigmas * self.config['num_train_timesteps']
sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
self.sigmas = mx.array(sigmas)
self.timesteps = mx.array(timesteps, dtype=mx.int64)
self.num_inference_steps = len(timesteps)
self.model_outputs = [None] * self.config['solver_order']
self.lower_order_nums = 0
self._step_index = None
self._begin_index = None
def _threshold_sample(self, sample: mx.array) -> mx.array:
"""Dynamic thresholding method."""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
# Flatten sample for quantile calculation
sample_flat = sample.reshape(batch_size, channels * np.prod(remaining_dims))
abs_sample = mx.abs(sample_flat)
# Compute quantile
s = mx.quantile(
abs_sample,
self.config['dynamic_thresholding_ratio'],
axis=1,
keepdims=True
)
s = mx.clip(s, 1, self.config['sample_max_value'])
# Threshold and normalize
sample_flat = mx.clip(sample_flat, -s, s) / s
sample = sample_flat.reshape(batch_size, channels, *remaining_dims)
return sample.astype(dtype)
def _sigma_to_t(self, sigma):
return sigma * self.config['num_train_timesteps']
def _sigma_to_alpha_sigma_t(self, sigma):
return 1 - sigma, sigma
def time_shift(self, mu: float, sigma: float, t: mx.array):
return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)
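# Editorial note (not part of the original file): with sigma=1 and
# mu=log(shift), time_shift(mu, 1.0, t) = shift*t / (1 + (shift - 1)*t),
# i.e. it reproduces the static shift applied in __init__/set_timesteps, but
# parameterized so that `mu` can be chosen per resolution when
# use_dynamic_shifting is enabled.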
def convert_model_output(
self,
model_output: mx.array,
sample: mx.array,
**kwargs,
) -> mx.array:
"""Convert model output to the corresponding type the algorithm needs."""
# DPM-Solver++ needs to solve an integral of the data prediction model
if self.config['algorithm_type'] in ["dpmsolver++", "sde-dpmsolver++"]:
if self.config['prediction_type'] == "flow_prediction":
sigma_t = self.sigmas[self.step_index]
x0_pred = sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config['prediction_type']} must be "
f"'flow_prediction' for the FlowDPMSolverMultistepScheduler."
)
if self.config['thresholding']:
x0_pred = self._threshold_sample(x0_pred)
return x0_pred
# DPM-Solver needs to solve an integral of the noise prediction model
elif self.config['algorithm_type'] in ["dpmsolver", "sde-dpmsolver"]:
if self.config['prediction_type'] == "flow_prediction":
sigma_t = self.sigmas[self.step_index]
epsilon = sample - (1 - sigma_t) * model_output
else:
raise ValueError(
f"prediction_type given as {self.config['prediction_type']} must be "
f"'flow_prediction' for the FlowDPMSolverMultistepScheduler."
)
if self.config['thresholding']:
sigma_t = self.sigmas[self.step_index]
x0_pred = sample - sigma_t * model_output
x0_pred = self._threshold_sample(x0_pred)
epsilon = model_output + x0_pred
return epsilon
def dpm_solver_first_order_update(
self,
model_output: mx.array,
sample: mx.array,
noise: Optional[mx.array] = None,
**kwargs,
) -> mx.array:
"""One step for the first-order DPMSolver (equivalent to DDIM)."""
sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
lambda_t = mx.log(alpha_t) - mx.log(sigma_t)
lambda_s = mx.log(alpha_s) - mx.log(sigma_s)
h = lambda_t - lambda_s
if self.config['algorithm_type'] == "dpmsolver++":
x_t = (sigma_t / sigma_s) * sample - (alpha_t * (mx.exp(-h) - 1.0)) * model_output
elif self.config['algorithm_type'] == "dpmsolver":
x_t = (alpha_t / alpha_s) * sample - (sigma_t * (mx.exp(h) - 1.0)) * model_output
elif self.config['algorithm_type'] == "sde-dpmsolver++":
assert noise is not None
x_t = (
(sigma_t / sigma_s * mx.exp(-h)) * sample +
(alpha_t * (1 - mx.exp(-2.0 * h))) * model_output +
sigma_t * mx.sqrt(1.0 - mx.exp(-2 * h)) * noise
)
elif self.config['algorithm_type'] == "sde-dpmsolver":
assert noise is not None
x_t = (
(alpha_t / alpha_s) * sample -
2.0 * (sigma_t * (mx.exp(h) - 1.0)) * model_output +
sigma_t * mx.sqrt(mx.exp(2 * h) - 1.0) * noise
)
return x_t
def multistep_dpm_solver_second_order_update(
self,
model_output_list: List[mx.array],
sample: mx.array,
noise: Optional[mx.array] = None,
**kwargs,
) -> mx.array:
"""One step for the second-order multistep DPMSolver."""
sigma_t, sigma_s0, sigma_s1 = (
self.sigmas[self.step_index + 1],
self.sigmas[self.step_index],
self.sigmas[self.step_index - 1],
)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
lambda_t = mx.log(alpha_t) - mx.log(sigma_t)
lambda_s0 = mx.log(alpha_s0) - mx.log(sigma_s0)
lambda_s1 = mx.log(alpha_s1) - mx.log(sigma_s1)
m0, m1 = model_output_list[-1], model_output_list[-2]
h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
r0 = h_0 / h
D0, D1 = m0, (1.0 / r0) * (m0 - m1)
if self.config['algorithm_type'] == "dpmsolver++":
if self.config['solver_type'] == "midpoint":
x_t = (
(sigma_t / sigma_s0) * sample -
(alpha_t * (mx.exp(-h) - 1.0)) * D0 -
0.5 * (alpha_t * (mx.exp(-h) - 1.0)) * D1
)
elif self.config['solver_type'] == "heun":
x_t = (
(sigma_t / sigma_s0) * sample -
(alpha_t * (mx.exp(-h) - 1.0)) * D0 +
(alpha_t * ((mx.exp(-h) - 1.0) / h + 1.0)) * D1
)
elif self.config['algorithm_type'] == "dpmsolver":
if self.config['solver_type'] == "midpoint":
x_t = (
(alpha_t / alpha_s0) * sample -
(sigma_t * (mx.exp(h) - 1.0)) * D0 -
0.5 * (sigma_t * (mx.exp(h) - 1.0)) * D1
)
elif self.config['solver_type'] == "heun":
x_t = (
(alpha_t / alpha_s0) * sample -
(sigma_t * (mx.exp(h) - 1.0)) * D0 -
(sigma_t * ((mx.exp(h) - 1.0) / h - 1.0)) * D1
)
elif self.config['algorithm_type'] == "sde-dpmsolver++":
assert noise is not None
if self.config['solver_type'] == "midpoint":
x_t = (
(sigma_t / sigma_s0 * mx.exp(-h)) * sample +
(alpha_t * (1 - mx.exp(-2.0 * h))) * D0 +
0.5 * (alpha_t * (1 - mx.exp(-2.0 * h))) * D1 +
sigma_t * mx.sqrt(1.0 - mx.exp(-2 * h)) * noise
)
elif self.config['solver_type'] == "heun":
x_t = (
(sigma_t / sigma_s0 * mx.exp(-h)) * sample +
(alpha_t * (1 - mx.exp(-2.0 * h))) * D0 +
(alpha_t * ((1.0 - mx.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 +
sigma_t * mx.sqrt(1.0 - mx.exp(-2 * h)) * noise
)
elif self.config['algorithm_type'] == "sde-dpmsolver":
assert noise is not None
if self.config['solver_type'] == "midpoint":
x_t = (
(alpha_t / alpha_s0) * sample -
2.0 * (sigma_t * (mx.exp(h) - 1.0)) * D0 -
(sigma_t * (mx.exp(h) - 1.0)) * D1 +
sigma_t * mx.sqrt(mx.exp(2 * h) - 1.0) * noise
)
elif self.config['solver_type'] == "heun":
x_t = (
(alpha_t / alpha_s0) * sample -
2.0 * (sigma_t * (mx.exp(h) - 1.0)) * D0 -
2.0 * (sigma_t * ((mx.exp(h) - 1.0) / h - 1.0)) * D1 +
sigma_t * mx.sqrt(mx.exp(2 * h) - 1.0) * noise
)
return x_t
def multistep_dpm_solver_third_order_update(
self,
model_output_list: List[mx.array],
sample: mx.array,
**kwargs,
) -> mx.array:
"""One step for the third-order multistep DPMSolver."""
sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
self.sigmas[self.step_index + 1],
self.sigmas[self.step_index],
self.sigmas[self.step_index - 1],
self.sigmas[self.step_index - 2],
)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
lambda_t = mx.log(alpha_t) - mx.log(sigma_t)
lambda_s0 = mx.log(alpha_s0) - mx.log(sigma_s0)
lambda_s1 = mx.log(alpha_s1) - mx.log(sigma_s1)
lambda_s2 = mx.log(alpha_s2) - mx.log(sigma_s2)
m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
r0, r1 = h_0 / h, h_1 / h
D0 = m0
D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
if self.config['algorithm_type'] == "dpmsolver++":
x_t = (
(sigma_t / sigma_s0) * sample -
(alpha_t * (mx.exp(-h) - 1.0)) * D0 +
(alpha_t * ((mx.exp(-h) - 1.0) / h + 1.0)) * D1 -
(alpha_t * ((mx.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
)
elif self.config['algorithm_type'] == "dpmsolver":
x_t = (
(alpha_t / alpha_s0) * sample -
(sigma_t * (mx.exp(h) - 1.0)) * D0 -
(sigma_t * ((mx.exp(h) - 1.0) / h - 1.0)) * D1 -
(sigma_t * ((mx.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
)
return x_t
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
# mx.where requires (condition, x, y) and has no index-returning form, so
# locate the first matching timestep via argmax over the boolean mask
# (same approach as FlowUniPCMultistepScheduler.index_for_timestep).
condition = schedule_timesteps == timestep
return int(mx.argmax(condition.astype(mx.int32)))
def _init_step_index(self, timestep):
"""Initialize the step_index counter for the scheduler."""
if self.begin_index is None:
self._step_index = self.index_for_timestep(timestep)
else:
self._step_index = self._begin_index
def step(
self,
model_output: mx.array,
timestep: Union[int, mx.array],
sample: mx.array,
generator=None,
variance_noise: Optional[mx.array] = None,
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
"""Predict the sample from the previous timestep."""
if self.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
if self.step_index is None:
self._init_step_index(timestep)
# Improve numerical stability for small number of steps
lower_order_final = (
(self.step_index == len(self.timesteps) - 1) and
(self.config['euler_at_final'] or
(self.config['lower_order_final'] and len(self.timesteps) < 15) or
self.config['final_sigmas_type'] == "zero")
)
lower_order_second = (
(self.step_index == len(self.timesteps) - 2) and
self.config['lower_order_final'] and
len(self.timesteps) < 15
)
model_output = self.convert_model_output(model_output, sample=sample)
for i in range(self.config['solver_order'] - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
self.model_outputs[-1] = model_output
# Upcast to avoid precision issues
sample = sample.astype(mx.float32)
# Generate noise if needed for SDE variants
if self.config['algorithm_type'] in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
noise = mx.random.normal(model_output.shape, dtype=mx.float32)
elif self.config['algorithm_type'] in ["sde-dpmsolver", "sde-dpmsolver++"]:
noise = variance_noise.astype(mx.float32)
else:
noise = None
if self.config['solver_order'] == 1 or self.lower_order_nums < 1 or lower_order_final:
prev_sample = self.dpm_solver_first_order_update(
model_output, sample=sample, noise=noise
)
elif self.config['solver_order'] == 2 or self.lower_order_nums < 2 or lower_order_second:
prev_sample = self.multistep_dpm_solver_second_order_update(
self.model_outputs, sample=sample, noise=noise
)
else:
prev_sample = self.multistep_dpm_solver_third_order_update(
self.model_outputs, sample=sample
)
if self.lower_order_nums < self.config['solver_order']:
self.lower_order_nums += 1
# Cast sample back to expected dtype
prev_sample = prev_sample.astype(model_output.dtype)
# Increase step index
self._step_index += 1
if not return_dict:
return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def scale_model_input(self, sample: mx.array, *args, **kwargs) -> mx.array:
"""Scale model input - no scaling needed for this scheduler."""
return sample
def add_noise(
self,
original_samples: mx.array,
noise: mx.array,
timesteps: mx.array,
) -> mx.array:
"""Add noise to original samples."""
sigmas = self.sigmas.astype(original_samples.dtype)
schedule_timesteps = self.timesteps
# Get step indices
if self.begin_index is None:
step_indices = [
self.index_for_timestep(t, schedule_timesteps)
for t in timesteps
]
elif self.step_index is not None:
step_indices = [self.step_index] * timesteps.shape[0]
else:
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices]
while len(sigma.shape) < len(original_samples.shape):
sigma = mx.expand_dims(sigma, -1)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
noisy_samples = alpha_t * original_samples + sigma_t * noise
return noisy_samples
def __len__(self):
return self.config['num_train_timesteps']
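# Editorial usage sketch (not part of the original file): a bare denoising
# loop with this scheduler; `model` and `latents` are hypothetical
# placeholders for a flow-prediction denoiser and the initial noise.
#
#   scheduler = FlowDPMSolverMultistepScheduler(solver_order=2, shift=5.0)
#   scheduler.set_timesteps(num_inference_steps=40)
#   for t in scheduler.timesteps:
#       model_out = model(latents, t)
#       latents = scheduler.step(model_out, t, latents).prev_sample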

View File

@@ -0,0 +1,546 @@
import math
from typing import List, Optional, Tuple, Union
import mlx.core as mx
import numpy as np
class SchedulerOutput:
"""Output class for scheduler step results."""
def __init__(self, prev_sample: mx.array):
self.prev_sample = prev_sample
class FlowUniPCMultistepScheduler:
"""
MLX implementation of UniPCMultistepScheduler.
A training-free framework designed for the fast sampling of diffusion models.
"""
order = 1
def __init__(
self,
num_train_timesteps: int = 1000,
solver_order: int = 2,
prediction_type: str = "flow_prediction",
shift: Optional[float] = 1.0,
use_dynamic_shifting: bool = False,
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
sample_max_value: float = 1.0,
predict_x0: bool = True,
solver_type: str = "bh2",
lower_order_final: bool = True,
disable_corrector: List[int] = [],
solver_p = None,
timestep_spacing: str = "linspace",
steps_offset: int = 0,
final_sigmas_type: Optional[str] = "zero",
):
# Store configuration
self.config = {
'num_train_timesteps': num_train_timesteps,
'solver_order': solver_order,
'prediction_type': prediction_type,
'shift': shift,
'use_dynamic_shifting': use_dynamic_shifting,
'thresholding': thresholding,
'dynamic_thresholding_ratio': dynamic_thresholding_ratio,
'sample_max_value': sample_max_value,
'predict_x0': predict_x0,
'solver_type': solver_type,
'lower_order_final': lower_order_final,
'disable_corrector': disable_corrector,
'solver_p': solver_p,
'timestep_spacing': timestep_spacing,
'steps_offset': steps_offset,
'final_sigmas_type': final_sigmas_type,
}
# Validate solver type
if solver_type not in ["bh1", "bh2"]:
if solver_type in ["midpoint", "heun", "logrho"]:
self.config['solver_type'] = "bh2"
else:
raise NotImplementedError(
f"{solver_type} is not implemented for {self.__class__}"
)
self.predict_x0 = predict_x0
# setable values
self.num_inference_steps = None
alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
sigmas = 1.0 - alphas
sigmas = mx.array(sigmas, dtype=mx.float32)
if not use_dynamic_shifting:
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
self.sigmas = sigmas
self.timesteps = sigmas * num_train_timesteps
self.model_outputs = [None] * solver_order
self.timestep_list = [None] * solver_order
self.lower_order_nums = 0
self.disable_corrector = disable_corrector
self.solver_p = solver_p
self.last_sample = None
self._step_index = None
self._begin_index = None
self.sigma_min = float(self.sigmas[-1])
self.sigma_max = float(self.sigmas[0])
@property
def step_index(self):
"""The index counter for current timestep."""
return self._step_index
@property
def begin_index(self):
"""The index for the first timestep."""
return self._begin_index
def set_begin_index(self, begin_index: int = 0):
"""Sets the begin index for the scheduler."""
self._begin_index = begin_index
def set_timesteps(
self,
num_inference_steps: Union[int, None] = None,
device: Union[str, None] = None,
sigmas: Optional[List[float]] = None,
mu: Optional[Union[float, None]] = None,
shift: Optional[Union[float, None]] = None,
):
"""Sets the discrete timesteps used for the diffusion chain."""
if self.config['use_dynamic_shifting'] and mu is None:
raise ValueError(
"you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`"
)
if sigmas is None:
sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1]
if self.config['use_dynamic_shifting']:
sigmas = self.time_shift(mu, 1.0, sigmas)
else:
if shift is None:
shift = self.config['shift']
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
if self.config['final_sigmas_type'] == "sigma_min":
sigma_last = self.sigma_min
elif self.config['final_sigmas_type'] == "zero":
sigma_last = 0
else:
raise ValueError(
f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config['final_sigmas_type']}"
)
timesteps = sigmas * self.config['num_train_timesteps']
sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)
self.sigmas = mx.array(sigmas)
self.timesteps = mx.array(timesteps, dtype=mx.int64)
self.num_inference_steps = len(timesteps)
self.model_outputs = [None] * self.config['solver_order']
self.lower_order_nums = 0
self.last_sample = None
if self.solver_p:
self.solver_p.set_timesteps(self.num_inference_steps, device=device)
# add an index counter for schedulers
self._step_index = None
self._begin_index = None
def _threshold_sample(self, sample: mx.array) -> mx.array:
"""Dynamic thresholding method."""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
# Flatten sample for quantile calculation
sample_flat = sample.reshape(batch_size, channels * np.prod(remaining_dims))
abs_sample = mx.abs(sample_flat)
# Compute quantile
s = mx.quantile(
abs_sample,
self.config['dynamic_thresholding_ratio'],
axis=1,
keepdims=True
)
s = mx.clip(s, 1, self.config['sample_max_value'])
# Threshold and normalize
sample_flat = mx.clip(sample_flat, -s, s) / s
sample = sample_flat.reshape(batch_size, channels, *remaining_dims)
return sample.astype(dtype)
def _sigma_to_t(self, sigma):
return sigma * self.config['num_train_timesteps']
def _sigma_to_alpha_sigma_t(self, sigma):
return 1 - sigma, sigma
def time_shift(self, mu: float, sigma: float, t: mx.array):
return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)
def convert_model_output(
self,
model_output: mx.array,
sample: mx.array = None,
**kwargs,
) -> mx.array:
"""Convert the model output to the corresponding type the UniPC algorithm needs."""
sigma = self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
if self.predict_x0:
if self.config['prediction_type'] == "flow_prediction":
sigma_t = self.sigmas[self.step_index]
x0_pred = sample - sigma_t * model_output
else:
raise ValueError(
f"prediction_type given as {self.config['prediction_type']} must be 'flow_prediction' "
f"for the UniPCMultistepScheduler."
)
if self.config['thresholding']:
x0_pred = self._threshold_sample(x0_pred)
return x0_pred
else:
if self.config['prediction_type'] == "flow_prediction":
sigma_t = self.sigmas[self.step_index]
epsilon = sample - (1 - sigma_t) * model_output
else:
raise ValueError(
f"prediction_type given as {self.config['prediction_type']} must be 'flow_prediction' "
f"for the UniPCMultistepScheduler."
)
if self.config['thresholding']:
sigma_t = self.sigmas[self.step_index]
x0_pred = sample - sigma_t * model_output
x0_pred = self._threshold_sample(x0_pred)
epsilon = model_output + x0_pred
return epsilon
def multistep_uni_p_bh_update(
self,
model_output: mx.array,
sample: mx.array = None,
order: int = None,
**kwargs,
) -> mx.array:
"""One step for the UniP (B(h) version)."""
model_output_list = self.model_outputs
s0 = self.timestep_list[-1]
m0 = model_output_list[-1]
x = sample
if self.solver_p:
x_t = self.solver_p.step(model_output, s0, x).prev_sample
return x_t
sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
lambda_t = mx.log(alpha_t) - mx.log(sigma_t)
lambda_s0 = mx.log(alpha_s0) - mx.log(sigma_s0)
h = lambda_t - lambda_s0
rks = []
D1s = []
for i in range(1, order):
si = self.step_index - i
mi = model_output_list[-(i + 1)]
alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
lambda_si = mx.log(alpha_si) - mx.log(sigma_si)
rk = (lambda_si - lambda_s0) / h
rks.append(rk)
D1s.append((mi - m0) / rk)
rks.append(1.0)
rks = mx.array(rks)
R = []
b = []
hh = -h if self.predict_x0 else h
h_phi_1 = mx.exp(hh) - 1 # h\phi_1(h) = e^h - 1
h_phi_k = h_phi_1 / hh - 1
factorial_i = 1
if self.config['solver_type'] == "bh1":
B_h = hh
elif self.config['solver_type'] == "bh2":
B_h = mx.exp(hh) - 1
else:
raise NotImplementedError()
for i in range(1, order + 1):
R.append(mx.power(rks, i - 1))
b.append(h_phi_k * factorial_i / B_h)
factorial_i *= i + 1
h_phi_k = h_phi_k / hh - 1 / factorial_i
R = mx.stack(R)
b = mx.array(b)
if len(D1s) > 0:
D1s = mx.stack(D1s, axis=1) # (B, K)
# for order 2, we use a simplified version
if order == 2:
rhos_p = mx.array([0.5], dtype=x.dtype)
else:
rhos_p = mx.linalg.solve(R[:-1, :-1], b[:-1], stream=mx.cpu).astype(x.dtype)
else:
D1s = None
if self.predict_x0:
x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
if D1s is not None:
pred_res = mx.sum(rhos_p[:, None, None, None] * D1s, axis=0)
else:
pred_res = 0
x_t = x_t_ - alpha_t * B_h * pred_res
else:
x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
if D1s is not None:
pred_res = mx.sum(rhos_p[:, None, None, None] * D1s, axis=0)
else:
pred_res = 0
x_t = x_t_ - sigma_t * B_h * pred_res
x_t = x_t.astype(x.dtype)
return x_t
def multistep_uni_c_bh_update(
self,
this_model_output: mx.array,
last_sample: mx.array = None,
this_sample: mx.array = None,
order: int = None,
**kwargs,
) -> mx.array:
"""One step for the UniC (B(h) version)."""
model_output_list = self.model_outputs
m0 = model_output_list[-1]
x = last_sample
x_t = this_sample
model_t = this_model_output
sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
lambda_t = mx.log(alpha_t) - mx.log(sigma_t)
lambda_s0 = mx.log(alpha_s0) - mx.log(sigma_s0)
h = lambda_t - lambda_s0
rks = []
D1s = []
for i in range(1, order):
si = self.step_index - (i + 1)
mi = model_output_list[-(i + 1)]
alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
lambda_si = mx.log(alpha_si) - mx.log(sigma_si)
rk = (lambda_si - lambda_s0) / h
rks.append(rk)
D1s.append((mi - m0) / rk)
rks.append(1.0)
rks = mx.array(rks)
R = []
b = []
hh = -h if self.predict_x0 else h
h_phi_1 = mx.exp(hh) - 1
h_phi_k = h_phi_1 / hh - 1
factorial_i = 1
if self.config['solver_type'] == "bh1":
B_h = hh
elif self.config['solver_type'] == "bh2":
B_h = mx.exp(hh) - 1
else:
raise NotImplementedError()
for i in range(1, order + 1):
R.append(mx.power(rks, i - 1))
b.append(h_phi_k * factorial_i / B_h)
factorial_i *= i + 1
h_phi_k = h_phi_k / hh - 1 / factorial_i
R = mx.stack(R)
b = mx.array(b)
if len(D1s) > 0:
D1s = mx.stack(D1s, axis=1)
else:
D1s = None
# for order 1, we use a simplified version
if order == 1:
rhos_c = mx.array([0.5], dtype=x.dtype)
else:
rhos_c = mx.linalg.solve(R, b, stream=mx.cpu).astype(x.dtype)
if self.predict_x0:
x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
if D1s is not None:
corr_res = mx.sum(rhos_c[:-1, None, None, None] * D1s, axis=0)
else:
corr_res = 0
D1_t = model_t - m0
x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
else:
x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
if D1s is not None:
corr_res = mx.sum(rhos_c[:-1, None, None, None] * D1s, axis=0)
else:
corr_res = 0
D1_t = model_t - m0
x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
x_t = x_t.astype(x.dtype)
return x_t
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
condition = schedule_timesteps == timestep
indices = mx.argmax(condition.astype(mx.int32))
# Convert scalar to int and return
return int(indices)
def _init_step_index(self, timestep):
"""Initialize the step_index counter for the scheduler."""
if self.begin_index is None:
self._step_index = self.index_for_timestep(timestep)
else:
self._step_index = self._begin_index
def step(
self,
model_output: mx.array,
timestep: Union[int, mx.array],
sample: mx.array,
return_dict: bool = True,
generator=None
) -> Union[SchedulerOutput, Tuple]:
"""Predict the sample from the previous timestep."""
if self.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
if self.step_index is None:
self._init_step_index(timestep)
use_corrector = (
self.step_index > 0 and
self.step_index - 1 not in self.disable_corrector and
self.last_sample is not None
)
model_output_convert = self.convert_model_output(
model_output, sample=sample
)
if use_corrector:
sample = self.multistep_uni_c_bh_update(
this_model_output=model_output_convert,
last_sample=self.last_sample,
this_sample=sample,
order=self.this_order,
)
for i in range(self.config['solver_order'] - 1):
self.model_outputs[i] = self.model_outputs[i + 1]
self.timestep_list[i] = self.timestep_list[i + 1]
self.model_outputs[-1] = model_output_convert
self.timestep_list[-1] = timestep
if self.config['lower_order_final']:
this_order = min(
self.config['solver_order'],
len(self.timesteps) - self.step_index
)
else:
this_order = self.config['solver_order']
self.this_order = min(this_order, self.lower_order_nums + 1)
assert self.this_order > 0
self.last_sample = sample
prev_sample = self.multistep_uni_p_bh_update(
model_output=model_output,
sample=sample,
order=self.this_order,
)
if self.lower_order_nums < self.config['solver_order']:
self.lower_order_nums += 1
# Increase step index
self._step_index += 1
if not return_dict:
return (prev_sample,)
return SchedulerOutput(prev_sample=prev_sample)
def scale_model_input(self, sample: mx.array, *args, **kwargs) -> mx.array:
"""Scale model input - no scaling needed for this scheduler."""
return sample
def add_noise(
self,
original_samples: mx.array,
noise: mx.array,
timesteps: mx.array,
) -> mx.array:
"""Add noise to original samples."""
sigmas = self.sigmas.astype(original_samples.dtype)
schedule_timesteps = self.timesteps
# Get step indices
if self.begin_index is None:
step_indices = [
self.index_for_timestep(t, schedule_timesteps)
for t in timesteps
]
elif self.step_index is not None:
step_indices = [self.step_index] * timesteps.shape[0]
else:
step_indices = [self.begin_index] * timesteps.shape[0]
sigma = sigmas[step_indices]
while len(sigma.shape) < len(original_samples.shape):
sigma = mx.expand_dims(sigma, -1)
alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
noisy_samples = alpha_t * original_samples + sigma_t * noise
return noisy_samples
def __len__(self):
return self.config['num_train_timesteps']
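# Editorial usage sketch (not part of the original file): the UniPC scheduler
# is driven the same way as the DPM-Solver variant above; `model` and
# `latents` are hypothetical placeholders.
#
#   scheduler = FlowUniPCMultistepScheduler(solver_order=2)
#   scheduler.set_timesteps(num_inference_steps=50, shift=5.0)
#   for t in scheduler.timesteps:
#       latents = scheduler.step(model(latents, t), t, latents).prev_sample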

View File

@@ -0,0 +1,363 @@
# Copied from https://github.com/kq-chen/qwen-vl-utils
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from __future__ import annotations
import base64
import logging
import math
import os
import sys
import time
import warnings
from functools import lru_cache
from io import BytesIO
import requests
import torch
import torchvision
from packaging import version
from PIL import Image
from torchvision import io, transforms
from torchvision.transforms import InterpolationMode
logger = logging.getLogger(__name__)
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
def round_by_factor(number: int, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor
def ceil_by_factor(number: int, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: int, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor
def smart_resize(height: int,
width: int,
factor: int = IMAGE_FACTOR,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS) -> tuple[int, int]:
"""
Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if max(height, width) / min(height, width) > MAX_RATIO:
raise ValueError(
f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
)
h_bar = max(factor, round_by_factor(height, factor))
w_bar = max(factor, round_by_factor(width, factor))
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
return h_bar, w_bar
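# Editorial worked example (not part of the original file): for a frame with
# height=1080, width=1920 and the default factor of 28, rounding gives
# h_bar = 39*28 = 1092 and w_bar = 69*28 = 1932; their product (~2.1M pixels)
# already lies inside [MIN_PIXELS, MAX_PIXELS], so smart_resize(1080, 1920)
# returns (1092, 1932) without entering either rescaling branch.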
def fetch_image(ele: dict[str, str | Image.Image],
size_factor: int = IMAGE_FACTOR) -> Image.Image:
if "image" in ele:
image = ele["image"]
else:
image = ele["image_url"]
image_obj = None
if isinstance(image, Image.Image):
image_obj = image
elif image.startswith("http://") or image.startswith("https://"):
image_obj = Image.open(requests.get(image, stream=True).raw)
elif image.startswith("file://"):
image_obj = Image.open(image[7:])
elif image.startswith("data:image"):
if "base64," in image:
_, base64_data = image.split("base64,", 1)
data = base64.b64decode(base64_data)
image_obj = Image.open(BytesIO(data))
else:
image_obj = Image.open(image)
if image_obj is None:
raise ValueError(
f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
)
image = image_obj.convert("RGB")
## resize
if "resized_height" in ele and "resized_width" in ele:
resized_height, resized_width = smart_resize(
ele["resized_height"],
ele["resized_width"],
factor=size_factor,
)
else:
width, height = image.size
min_pixels = ele.get("min_pixels", MIN_PIXELS)
max_pixels = ele.get("max_pixels", MAX_PIXELS)
resized_height, resized_width = smart_resize(
height,
width,
factor=size_factor,
min_pixels=min_pixels,
max_pixels=max_pixels,
)
image = image.resize((resized_width, resized_height))
return image
def smart_nframes(
ele: dict,
total_frames: int,
video_fps: int | float,
) -> int:
"""calculate the number of frames for video used for model inputs.
Args:
ele (dict): a dict contains the configuration of video.
support either `fps` or `nframes`:
- nframes: the number of frames to extract for model inputs.
- fps: the fps to extract frames for model inputs.
- min_frames: the minimum number of frames of the video, only used when fps is provided.
- max_frames: the maximum number of frames of the video, only used when fps is provided.
total_frames (int): the original total number of frames of the video.
video_fps (int | float): the original fps of the video.
Raises:
ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
Returns:
int: the number of frames for video used for model inputs.
"""
assert not ("fps" in ele and
"nframes" in ele), "Only accept either `fps` or `nframes`"
if "nframes" in ele:
nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
else:
fps = ele.get("fps", FPS)
min_frames = ceil_by_factor(
ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
max_frames = floor_by_factor(
ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)),
FRAME_FACTOR)
nframes = total_frames / video_fps * fps
nframes = min(max(nframes, min_frames), max_frames)
nframes = round_by_factor(nframes, FRAME_FACTOR)
if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
raise ValueError(
f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
)
return nframes
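# Editorial worked example (not part of the original file): for a 10 s clip
# with total_frames=300, video_fps=30 and an `ele` that specifies neither
# `fps` nor `nframes`, the default FPS of 2.0 gives 300 / 30 * 2 = 20 frames,
# which is already inside [FPS_MIN_FRAMES, total_frames] and divisible by
# FRAME_FACTOR, so smart_nframes returns 20.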
def _read_video_torchvision(ele: dict,) -> torch.Tensor:
"""read video using torchvision.io.read_video
Args:
ele (dict): a dict contains the configuration of video.
support keys:
- video: the path of video. support "file://", "http://", "https://" and local path.
- video_start: the start time of video.
- video_end: the end time of video.
Returns:
torch.Tensor: the video tensor with shape (T, C, H, W).
"""
video_path = ele["video"]
if version.parse(torchvision.__version__) < version.parse("0.19.0"):
if "http://" in video_path or "https://" in video_path:
warnings.warn(
"torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0."
)
if "file://" in video_path:
video_path = video_path[7:]
st = time.time()
video, audio, info = io.read_video(
video_path,
start_pts=ele.get("video_start", 0.0),
end_pts=ele.get("video_end", None),
pts_unit="sec",
output_format="TCHW",
)
total_frames, video_fps = video.size(0), info["video_fps"]
logger.info(
f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s"
)
nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
idx = torch.linspace(0, total_frames - 1, nframes).round().long()
video = video[idx]
return video
def is_decord_available() -> bool:
import importlib.util
return importlib.util.find_spec("decord") is not None
def _read_video_decord(ele: dict,) -> torch.Tensor:
"""read video using decord.VideoReader
Args:
ele (dict): a dict contains the configuration of video.
support keys:
- video: the path of video. support "file://", "http://", "https://" and local path.
- video_start: the start time of video.
- video_end: the end time of video.
Returns:
torch.Tensor: the video tensor with shape (T, C, H, W).
"""
import decord
video_path = ele["video"]
st = time.time()
vr = decord.VideoReader(video_path)
# TODO: support start_pts and end_pts
if 'video_start' in ele or 'video_end' in ele:
raise NotImplementedError(
"not support start_pts and end_pts in decord for now.")
total_frames, video_fps = len(vr), vr.get_avg_fps()
logger.info(
f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s"
)
nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
video = vr.get_batch(idx).asnumpy()
video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
return video
VIDEO_READER_BACKENDS = {
"decord": _read_video_decord,
"torchvision": _read_video_torchvision,
}
FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
@lru_cache(maxsize=1)
def get_video_reader_backend() -> str:
if FORCE_QWENVL_VIDEO_READER is not None:
video_reader_backend = FORCE_QWENVL_VIDEO_READER
elif is_decord_available():
video_reader_backend = "decord"
else:
video_reader_backend = "torchvision"
# logging.Logger.info does not accept a print-style `file` argument; log the
# chosen backend directly.
logger.info(
f"qwen-vl-utils using {video_reader_backend} to read video.")
return video_reader_backend
def fetch_video(
ele: dict,
image_factor: int = IMAGE_FACTOR) -> torch.Tensor | list[Image.Image]:
if isinstance(ele["video"], str):
video_reader_backend = get_video_reader_backend()
video = VIDEO_READER_BACKENDS[video_reader_backend](ele)
nframes, _, height, width = video.shape
min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
max_pixels = max(
min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
int(min_pixels * 1.05))
max_pixels = ele.get("max_pixels", max_pixels)
if "resized_height" in ele and "resized_width" in ele:
resized_height, resized_width = smart_resize(
ele["resized_height"],
ele["resized_width"],
factor=image_factor,
)
else:
resized_height, resized_width = smart_resize(
height,
width,
factor=image_factor,
min_pixels=min_pixels,
max_pixels=max_pixels,
)
video = transforms.functional.resize(
video,
[resized_height, resized_width],
interpolation=InterpolationMode.BICUBIC,
antialias=True,
).float()
return video
else:
assert isinstance(ele["video"], (list, tuple))
process_info = ele.copy()
process_info.pop("type", None)
process_info.pop("video", None)
images = [
fetch_image({
"image": video_element,
**process_info
},
size_factor=image_factor)
for video_element in ele["video"]
]
nframes = ceil_by_factor(len(images), FRAME_FACTOR)
if len(images) < nframes:
images.extend([images[-1]] * (nframes - len(images)))
return images
def extract_vision_info(
conversations: list[dict] | list[list[dict]]) -> list[dict]:
vision_infos = []
if isinstance(conversations[0], dict):
conversations = [conversations]
for conversation in conversations:
for message in conversation:
if isinstance(message["content"], list):
for ele in message["content"]:
if ("image" in ele or "image_url" in ele or
"video" in ele or
ele["type"] in ("image", "image_url", "video")):
vision_infos.append(ele)
return vision_infos
def process_vision_info(
conversations: list[dict] | list[list[dict]],
) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] |
None]:
vision_infos = extract_vision_info(conversations)
## Read images or videos
image_inputs = []
video_inputs = []
for vision_info in vision_infos:
if "image" in vision_info or "image_url" in vision_info:
image_inputs.append(fetch_image(vision_info))
elif "video" in vision_info:
video_inputs.append(fetch_video(vision_info))
else:
raise ValueError("image, image_url or video should in content.")
if len(image_inputs) == 0:
image_inputs = None
if len(video_inputs) == 0:
video_inputs = None
return image_inputs, video_inputs
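# Editorial usage sketch (not part of the original file), assuming a local
# image path; the path and prompt text are placeholders.
#
#   messages = [{
#       "role": "user",
#       "content": [
#           {"type": "image", "image": "file:///path/to/frame.png"},
#           {"type": "text", "text": "Describe this frame."},
#       ],
#   }]
#   image_inputs, video_inputs = process_vision_info(messages)
#   # image_inputs -> [PIL image resized via smart_resize]; video_inputs -> None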

View File

@@ -0,0 +1,147 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
T2V_A14B_ZH_SYS_PROMPT = \
''' 你是一位电影导演旨在为用户输入的原始prompt添加电影元素改写为优质Prompt使其完整、具有表现力。
任务要求:
1. 对于用户输入的prompt,在不改变prompt的原意如主体、动作前提下从下列电影美学设定中选择部分合适的时间、光源、光线强度、光线角度、对比度、饱和度、色调、拍摄角度、镜头大小、构图的电影设定细节,将这些内容添加到prompt中让画面变得更美注意可以任选不必每项都有
时间:["白天", "夜晚", "黎明", "日出"], 可以不选, 如果prompt没有特别说明则选白天 !
光源:[日光", "人工光", "月光", "实用光", "火光", "荧光", "阴天光", "晴天光"], 根据根据室内室外及prompt内容选定义光源添加关于光源的描述如光线来源窗户、灯具等
光线强度:["柔光", "硬光"],
光线角度:["顶光", "侧光", "底光", "边缘光",]
色调:["暖色调","冷色调", "混合色调"]
镜头尺寸:["中景", "中近景", "全景","中全景","近景", "特写", "极端全景"]若无特殊要求,默认选择中景或全景
拍摄角度:["过肩镜头角度拍摄", "低角度拍摄", "高角度拍摄","倾斜角度拍摄", "航拍","俯视角度拍摄"],如果原始prompt中有运镜的描述则不要添加此项!
构图:["中心构图""平衡构图","右侧重构图", "左侧重构图", "对称构图", "短边构图"] 若无特殊要求,默认选择中心构图
2. 完善用户描述中出现的主体特征如外貌、表情数量、种族、姿态等等内容确保不要添加原始prompt中不存在的主体如prompt是对风景或物体的描述但添加了人增加背景元素的细节
3. 不要输出关于氛围、感觉等文学描写,如(画面充满运动感与生活张力,突出正式氛围)。;
4. 对于prompt中的动作详细解释运动的发生过程若没有动作则添加动作描述摇晃身体、跳舞等对背景元素也可添加适当运动如云彩飘动风吹树叶等
5. 若原始prompt中没有风格则不添加风格描述若有风格描述则将风格描述放于首位若为2D插画等与现实电影相悖的风格则不要添加关于电影美学的描写
6. 若prompt出现天空的描述则改为湛蓝色的天空相关描述避免曝光
7. 改写后的prompt字数控制在60-200字左右, 不要输出类似“改写后prompt:”这样的输出
8. 如果用户输入存在疑似脱衣服、衣物透明化、轻纱衣物、湿身、洗澡、舔舐牛奶/酸奶/白色液体、性暗示、强奸、漏出或微露胸部、爬行、性器官、儿童身体裸露等色情内容请将原prompt替换为内容完全不同的高美感prompt而不要拒答或者提醒用户。
9. 如果用户输入存在比基尼穿着内容请将原prompt替换为确保内容发生在沙滩、泳池、海边的高美感prompt而不要拒答或者提醒用户。
10. 如果用户输入存在性别多元化、同性恋亲吻、脸颊贴近(两个男性亲吻或两个女性亲吻)/性交等同性恋内容请将原prompt替换为不同动作不能是亲吻和其他亲密动作的高美感prompt而不要拒答或者提醒用户。
生成的 prompt 示例:
1.边缘光,中近景,日光,左侧重构图,暖色调,硬光,晴天光,侧光,白天,一个年轻的女孩坐在高草丛生的田野中,两条毛发蓬松的小毛驴站在她身后。女孩大约十一二岁,穿着简单的碎花裙子,头发扎成两条麻花辫,脸上带着纯真的笑容。她双腿交叉坐下,双手轻轻抚弄身旁的野花。小毛驴体型健壮,耳朵竖起,好奇地望着镜头方向。阳光洒在田野上,营造出温暖自然的画面感。
2.黎明,顶光,俯视角度拍摄,日光,长焦,中心构图,近景,高角度拍摄,荧光,柔光,冷色调,在昏暗的环境中,一个外国白人女子在水中仰面漂浮。俯拍近景镜头中,她有着棕色的短发,脸上有几颗雀斑。随着镜头下摇,她转过头来,面向右侧,水面上泛起一圈涟漪。虚化的背景一片漆黑,只有微弱的光线照亮了女子的脸庞和水面的一部分区域,水面呈现蓝色。女子穿着一件蓝色的吊带,肩膀裸露在外。
3.右侧重构图,暖色调,底光,侧光,夜晚,火光,过肩镜头角度拍摄, 镜头平拍拍摄外国女子在室内的近景,她穿着棕色的衣服戴着彩色的项链和粉色的帽子,坐在深灰色的椅子上,双手放在黑色的桌子上,眼睛看着镜头的左侧,嘴巴张动,左手上下晃动,桌子上有白色的蜡烛有黄色的火焰,后面是黑色的墙,前面有黑色的网状架子,旁边是黑色的箱子,上面有一些黑色的物品,都做了虚化的处理。
4. 二次元厚涂动漫插画,一个猫耳兽耳白人少女手持文件夹摇晃,神情略带不满。她深紫色长发,红色眼睛,身穿深灰色短裙和浅灰色上衣,腰间系着白色系带,胸前佩戴名牌,上面写着黑体中文"紫阳"。淡黄色调室内背景,隐约可见一些家具轮廓。少女头顶有一个粉色光圈。线条流畅的日系赛璐璐风格。近景半身略俯视视角。
'''
T2V_A14B_EN_SYS_PROMPT = \
'''你是一位电影导演旨在为用户输入的原始prompt添加电影元素改写为优质英文Prompt使其完整、具有表现力注意输出必须是英文
任务要求:
1. 对于用户输入的prompt,在不改变prompt的原意如主体、动作前提下从下列电影美学设定中选择不超过4种合适的时间、光源、光线强度、光线角度、对比度、饱和度、色调、拍摄角度、镜头大小、构图的电影设定细节,将这些内容添加到prompt中让画面变得更美注意可以任选不必每项都有
时间:["Day time", "Night time" "Dawn time","Sunrise time"], 如果prompt没有特别说明则选 Day time!!!
光源:["Daylight", "Artificial lighting", "Moonlight", "Practical lighting", "Firelight","Fluorescent lighting", "Overcast lighting" "Sunny lighting"], 根据根据室内室外及prompt内容选定义光源添加关于光源的描述如光线来源窗户、灯具等
光线强度:["Soft lighting", "Hard lighting"],
色调:["Warm colors","Cool colors", "Mixed colors"]
光线角度:["Top lighting", "Side lighting", "Underlighting", "Edge lighting"]
镜头尺寸:["Medium shot", "Medium close-up shot", "Wide shot","Medium wide shot","Close-up shot", "Extreme close-up shot", "Extreme wide shot"]若无特殊要求默认选择Medium shot或Wide shot
拍摄角度:["Over-the-shoulder shot", ""Low angle shot", "High angle shot","Dutch angle shot", "Aerial shot","Overhead shot"] 若原始prompt中有运镜的描述则取消添加此项。
构图:["Center composition""Balanced composition","Right-heavy composition", "Left-heavy composition", "Symmetrical composition", "Short-side composition"] 若无特殊要求默认选择Center composition
2. 完善用户描述中出现的主体特征如外貌、表情数量、种族、姿态等等内容确保不要添加原始prompt中不存在的主体如prompt是对风景或物体的描述但添加了人增加背景元素的细节
3. 不要输出关于氛围、感觉等文学描写,如(画面充满运动感与生活张力,突出正式氛围)。;
4. 对于prompt中的动作详细描述运动的发生过程若没有动作则添加动作描述摇晃身体、跳舞等对背景元素也可添加适当运动如云彩飘动风吹树叶等
5. 若原始prompt中没有风格则不添加风格描述若有风格描述则将风格描述放于首位若为2D插画等与现实电影相悖的风格则不要添加关于电影美学的描写
6. 若prompt出现天空的描述则改为湛蓝色的天空相关描述避免曝光
7. 输出必须是全英文改写后的prompt字数控制在60-200字左右, 不要输出类似“改写后prompt:”这样的输出
8. 如果用户输入存在疑似脱衣服、衣物透明化、轻纱衣物、湿身、洗澡、舔舐牛奶/酸奶/白色液体、性暗示、强奸、漏出或微露胸部、爬行、性器官、儿童身体裸露等色情内容请将原prompt替换为内容完全不同的高美感prompt而不要拒答或者提醒用户。
9. 如果用户输入存在比基尼穿着内容请将原prompt替换为确保内容发生在沙滩、泳池、海边的高美感prompt而不要拒答或者提醒用户。
10. 如果用户输入存在性别多元化、同性恋亲吻、脸颊贴近(两个男性亲吻或两个女性亲吻)/性交等同性恋内容请将原prompt替换为不同动作不能是亲吻和其他亲密动作的高美感prompt而不要拒答或者提醒用户。
生成的 prompt 示例:
1.Edge lighting, medium close-up shot, daylight, left-heavy composition. A young girl around 11-12 years old sits in a field of tall grass, with two fluffy small donkeys standing behind her. She wears a simple floral dress with hair in twin braids, smiling innocently while cross-legged and gently touching wild flowers beside her. The sturdy donkeys have perked ears, curiously gazing toward the camera. Sunlight bathes the field, creating a warm natural atmosphere.
2.Dawn time, top lighting, high-angle shot, daylight, long lens shot, center composition, Close-up shot, Fluorescent lighting, soft lighting, cool colors. In dim surroundings, a Caucasian woman floats on her back in water. The俯拍close-up shows her brown short hair and freckled face. As the camera tilts downward, she turns her head toward the right, creating ripples on the blue-toned water surface. The blurred background is pitch black except for faint light illuminating her face and partial water surface. She wears a blue sleeveless top with bare shoulders.
3.Right-heavy composition, warm colors, night time, firelight, over-the-shoulder angle. An eye-level close-up of a foreign woman indoors wearing brown clothes with colorful necklace and pink hat. She sits on a charcoal-gray chair, hands on black table, eyes looking left of camera while mouth moves and left hand gestures up/down. White candles with yellow flames sit on the table. Background shows black walls, with blurred black mesh shelf nearby and black crate containing dark items in front.
4."Anime-style thick-painted style. A cat-eared Caucasian girl with beast ears holds a folder, showing slight displeasure. Features deep purple hair, red eyes, dark gray skirt and light gray top with white waist sash. A name tag labeled 'Ziyang' in bold Chinese characters hangs on her chest. Pale yellow indoor background with faint furniture outlines. A pink halo floats above her head. Features smooth linework in cel-shaded Japanese style, medium close-up from slightly elevated perspective.
'''
I2V_A14B_ZH_SYS_PROMPT = \
'''你是一个视频描述提示词的改写专家,你的任务是根据用户给你输入的图像,对提供的视频描述提示词进行改写,你要强调潜在的动态内容。具体要求如下
用户输入的语言可能含有多样化的描述如markdown文档格式、指令格式长度过长或者过短你需要根据图片的内容和用户的输入的提示词尽可能提取用户输入的提示词和图片关联信息。
你改写的视频描述结果要尽可能保留提供给你的视频描述提示词中动态部分,保留主体的动作。
你要根据图像,强调并简化视频描述提示词中的图像主体,如果用户只提供了动作,你要根据图像内容合理补充,如“跳舞”补充称“一个女孩在跳舞”
如果用户输入的提示词过长,你需要提炼潜在的动作过程
如果用户输入的提示词过短,综合用户输入的提示词以及画面内容,合理的增加潜在的运动信息
你要根据图像,保留并强调视频描述提示词中关于运镜手段的描述,如“镜头上摇”,“镜头从左到右”,“镜头从右到左”等等,你要保留,如“镜头拍摄两个男人打斗,他们先是躺在地上,随后镜头向上移动,拍摄他们站起来,接着镜头向左移动,左边男人拿着一个蓝色的东西,右边男人上前抢夺,两人激烈地来回争抢。”。
你需要给出对视频描述的动态内容,不要添加对于静态场景的描述,如果用户输入的描述已经在画面中出现,则移除这些描述
改写后的prompt字数控制在100字以下
无论用户输入那种语言,你都需要输出中文
改写后 prompt 示例:
1. 镜头后拉,拍摄两个外国男人,走在楼梯上,镜头左侧的男人右手搀扶着镜头右侧的男人。
2. 一只黑色的小松鼠专注地吃着东西,偶尔抬头看看四周。
3. 男子说着话,表情从微笑逐渐转变为闭眼,然后睁开眼睛,最后是闭眼微笑,他的手势活跃,在说话时做出一系列的手势。
4. 一个人正在用尺子和笔进行测量的特写,右手用一支黑色水性笔在纸上画出一条直线。
5. 一辆车模型在木板上形式,车辆从画面的右侧向左侧移动,经过一片草地和一些木制结构。
6. 镜头左移后前推,拍摄一个人坐在防波堤上。
7. 男子说着话,他的表情和手势随着对话内容的变化而变化,但整体场景保持不变。
8. 镜头左移后前推,拍摄一个人坐在防波堤上。
9. 带着珍珠项链的女子看向画面右侧并说着话。
请直接输出改写后的文本,不要进行多余的回复。'''
I2V_A14B_EN_SYS_PROMPT = \
'''You are an expert in rewriting video description prompts. Your task is to rewrite the provided video description prompts based on the images given by users, emphasizing potential dynamic content. Specific requirements are as follows:
The user's input language may include diverse descriptions, such as markdown format, instruction format, or be too long or too short. You need to extract the relevant information from the user's input and associate it with the image content.
Your rewritten video description should retain the dynamic parts of the provided prompts, focusing on the main subject's actions. Emphasize and simplify the main subject of the image while retaining their movement. If the user only provides an action (e.g., "dancing"), supplement it reasonably based on the image content (e.g., "a girl is dancing").
If the user's input prompt is too long, refine it to capture the essential action process. If the input is too short, add reasonable motion-related details based on the image content.
Retain and emphasize descriptions of camera movements, such as "the camera pans up," "the camera moves from left to right," or "the camera moves from right to left." For example: "The camera captures two men fighting. They start lying on the ground, then the camera moves upward as they stand up. The camera shifts left, showing the man on the left holding a blue object while the man on the right tries to grab it, resulting in a fierce back-and-forth struggle."
Focus on dynamic content in the video description and avoid adding static scene descriptions. If the user's input already describes elements visible in the image, remove those static descriptions.
Limit the rewritten prompt to 100 words or less. Regardless of the input language, your output must be in English.
Examples of rewritten prompts:
The camera pulls back to show two foreign men walking up the stairs. The man on the left supports the man on the right with his right hand.
A black squirrel focuses on eating, occasionally looking around.
A man talks, his expression shifting from smiling to closing his eyes, reopening them, and finally smiling with closed eyes. His gestures are lively, making various hand motions while speaking.
A close-up of someone measuring with a ruler and pen, drawing a straight line on paper with a black marker in their right hand.
A model car moves on a wooden board, traveling from right to left across grass and wooden structures.
The camera moves left, then pushes forward to capture a person sitting on a breakwater.
A man speaks, his expressions and gestures changing with the conversation, while the overall scene remains constant.
The camera moves left, then pushes forward to capture a person sitting on a breakwater.
A woman wearing a pearl necklace looks to the right and speaks.
Output only the rewritten text without additional responses.'''
I2V_A14B_EMPTY_ZH_SYS_PROMPT = \
'''你是一个视频描述提示词的撰写专家,你的任务是根据用户给你输入的图像,发挥合理的想象,让这张图动起来,你要强调潜在的动态内容。具体要求如下
你需要根据图片的内容想象出运动的主体
你输出的结果应强调图片中的动态部分,保留主体的动作。
你需要给出对视频描述的动态内容,不要有过多的对于静态场景的描述
输出的prompt字数控制在100字以下
你需要输出中文
prompt 示例:
1. 镜头后拉,拍摄两个外国男人,走在楼梯上,镜头左侧的男人右手搀扶着镜头右侧的男人。
2. 一只黑色的小松鼠专注地吃着东西,偶尔抬头看看四周。
3. 男子说着话,表情从微笑逐渐转变为闭眼,然后睁开眼睛,最后是闭眼微笑,他的手势活跃,在说话时做出一系列的手势。
4. 一个人正在用尺子和笔进行测量的特写,右手用一支黑色水性笔在纸上画出一条直线。
5. 一辆车模型在木板上形式,车辆从画面的右侧向左侧移动,经过一片草地和一些木制结构。
6. 镜头左移后前推,拍摄一个人坐在防波堤上。
7. 男子说着话,他的表情和手势随着对话内容的变化而变化,但整体场景保持不变。
8. 镜头左移后前推,拍摄一个人坐在防波堤上。
9. 带着珍珠项链的女子看向画面右侧并说着话。
请直接输出文本,不要进行多余的回复。'''
I2V_A14B_EMPTY_EN_SYS_PROMPT = \
'''You are an expert in writing video description prompts. Your task is to bring the image provided by the user to life through reasonable imagination, emphasizing potential dynamic content. Specific requirements are as follows:
You need to imagine the moving subject based on the content of the image.
Your output should emphasize the dynamic parts of the image and retain the main subject's actions.
Focus only on describing dynamic content; avoid excessive descriptions of static scenes.
Limit the output prompt to 100 words or less.
The output must be in English.
Prompt examples:
The camera pulls back to show two foreign men walking up the stairs. The man on the left supports the man on the right with his right hand.
A black squirrel focuses on eating, occasionally looking around.
A man talks, his expression shifting from smiling to closing his eyes, reopening them, and finally smiling with closed eyes. His gestures are lively, making various hand motions while speaking.
A close-up of someone measuring with a ruler and pen, drawing a straight line on paper with a black marker in their right hand.
A model car moves on a wooden board, traveling from right to left across grass and wooden structures.
The camera moves left, then pushes forward to capture a person sitting on a breakwater.
A man speaks, his expressions and gestures changing with the conversation, while the overall scene remains constant.
The camera moves left, then pushes forward to capture a person sitting on a breakwater.
A woman wearing a pearl necklace looks to the right and speaks.
Output only the text without additional responses.'''

View File

@@ -0,0 +1,233 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
# utils MLX version
import argparse
import binascii
import logging
import os
import os.path as osp
import imageio
import mlx.core as mx
import numpy as np
__all__ = ['save_video', 'save_image', 'str2bool', 'masks_like', 'best_output_size']
def rand_name(length=8, suffix=''):
name = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
if suffix:
if not suffix.startswith('.'):
suffix = '.' + suffix
name += suffix
return name
def make_grid(tensor, nrow=8, normalize=True, value_range=(-1, 1)):
"""MLX equivalent of torchvision.utils.make_grid"""
# tensor shape: (batch, channels, height, width)
batch_size, channels, height, width = tensor.shape
# Calculate grid dimensions
ncol = nrow
nrow_actual = (batch_size + ncol - 1) // ncol
# Create grid
grid_height = height * nrow_actual + (nrow_actual - 1) * 2 # 2 pixel padding
grid_width = width * ncol + (ncol - 1) * 2
# Initialize grid with zeros
grid = mx.zeros((channels, grid_height, grid_width))
# Fill grid
for idx in range(batch_size):
row = idx // ncol
col = idx % ncol
y_start = row * (height + 2)
y_end = y_start + height
x_start = col * (width + 2)
x_end = x_start + width
img = tensor[idx]
if normalize:
# Normalize to [0, 1]
img = (img - value_range[0]) / (value_range[1] - value_range[0])
grid[:, y_start:y_end, x_start:x_end] = img
return grid
def save_video(tensor,
save_file=None,
fps=30,
suffix='.mp4',
nrow=8,
normalize=True,
value_range=(-1, 1)):
# cache file
cache_file = osp.join('/tmp', rand_name(
suffix=suffix)) if save_file is None else save_file
# save to cache
try:
# preprocess
tensor = mx.clip(tensor, value_range[0], value_range[1])
# tensor shape: (batch, channels, frames, height, width)
# Process each frame
frames = []
for frame_idx in range(tensor.shape[2]):
frame = tensor[:, :, frame_idx, :, :] # (batch, channels, height, width)
grid = make_grid(frame, nrow=nrow, normalize=normalize, value_range=value_range)
frames.append(grid)
# Stack frames and convert to (frames, height, width, channels)
tensor = mx.stack(frames, axis=0) # (frames, channels, height, width)
tensor = mx.transpose(tensor, [0, 2, 3, 1]) # (frames, height, width, channels)
# Convert to uint8
tensor = (tensor * 255).astype(mx.uint8)
tensor_np = np.array(tensor)
# write video
writer = imageio.get_writer(
cache_file, fps=fps, codec='libx264', quality=8)
for frame in tensor_np:
writer.append_data(frame)
writer.close()
except Exception as e:
logging.info(f'save_video failed, error: {e}')
def save_image(tensor, save_file, nrow=8, normalize=True, value_range=(-1, 1)):
# cache file
suffix = osp.splitext(save_file)[1]
if suffix.lower() not in [
'.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp'
]:
suffix = '.png'
# save to cache
try:
# Clip values
tensor = mx.clip(tensor, value_range[0], value_range[1])
# Make grid
grid = make_grid(tensor, nrow=nrow, normalize=normalize, value_range=value_range)
# Convert to (height, width, channels) and uint8
grid = mx.transpose(grid, [1, 2, 0]) # (height, width, channels)
grid = (grid * 255).astype(mx.uint8)
# Save using imageio
imageio.imwrite(save_file, np.array(grid))
return save_file
except Exception as e:
logging.info(f'save_image failed, error: {e}')
def str2bool(v):
"""
Convert a string to a boolean.
Supported true values: 'yes', 'true', 't', 'y', '1'
Supported false values: 'no', 'false', 'f', 'n', '0'
Args:
v (str): String to convert.
Returns:
bool: Converted boolean value.
Raises:
argparse.ArgumentTypeError: If the value cannot be converted to boolean.
"""
if isinstance(v, bool):
return v
v_lower = v.lower()
if v_lower in ('yes', 'true', 't', 'y', '1'):
return True
elif v_lower in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected (True/False)')
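# Editorial usage sketch (not part of the original file); the flag name is
# illustrative only.
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--offload_model", type=str2bool, default=False)
#   args = parser.parse_args(["--offload_model", "true"])
#   assert args.offload_model is True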
def masks_like(tensor, zero=False, generator=None, p=0.2):
"""
Generate masks similar to input tensors.
Args:
tensor: List of MLX arrays
zero: Whether to apply zero masking
generator: Random generator (for MLX, we use mx.random.seed instead)
p: Probability for random masking
Returns:
Tuple of two lists of masks
"""
assert isinstance(tensor, list)
out1 = [mx.ones(u.shape, dtype=u.dtype) for u in tensor]
out2 = [mx.ones(u.shape, dtype=u.dtype) for u in tensor]
if zero:
if generator is not None:
# MLX doesn't have the same generator API as PyTorch
# We'll use random state instead
for u, v in zip(out1, out2):
random_num = mx.random.uniform(0, 1, shape=(1,)).item()
if random_num < p:
# Generate random values with normal distribution
normal_vals = mx.random.normal(shape=u[:, 0].shape, loc=-3.5, scale=0.5)
u[:, 0] = mx.exp(normal_vals)
v[:, 0] = mx.zeros_like(v[:, 0])
else:
# Keep original values
u[:, 0] = u[:, 0]
v[:, 0] = v[:, 0]
else:
for u, v in zip(out1, out2):
u[:, 0] = mx.zeros_like(u[:, 0])
v[:, 0] = mx.zeros_like(v[:, 0])
return out1, out2
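# Editorial usage sketch (not part of the original file): `masks_like` takes a
# list of latent arrays; with zero=True and no generator the first temporal
# slice of every mask is zeroed. The latent shape below is hypothetical.
#
#   latents = [mx.zeros((16, 21, 30, 52))]
#   mask1, mask2 = masks_like(latents, zero=True)
#   # mask1[0][:, 0] and mask2[0][:, 0] are all zeros; everything else stays 1.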
def best_output_size(w, h, dw, dh, expected_area):
"""
Calculate the best output size given constraints.
Args:
w: Width
h: Height
dw: Width divisor
dh: Height divisor
expected_area: Target area
Returns:
Tuple of (output_width, output_height)
"""
# float output size
ratio = w / h
ow = (expected_area * ratio)**0.5
oh = expected_area / ow
# process width first
ow1 = int(ow // dw * dw)
oh1 = int(expected_area / ow1 // dh * dh)
assert ow1 % dw == 0 and oh1 % dh == 0 and ow1 * oh1 <= expected_area
ratio1 = ow1 / oh1
# process height first
oh2 = int(oh // dh * dh)
ow2 = int(expected_area / oh2 // dw * dw)
assert oh2 % dh == 0 and ow2 % dw == 0 and ow2 * oh2 <= expected_area
ratio2 = ow2 / oh2
# compare ratios
if max(ratio / ratio1, ratio1 / ratio) < max(ratio / ratio2,
ratio2 / ratio):
return ow1, oh1
else:
return ow2, oh2
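# Editorial worked example (not part of the original file): for w=1080,
# h=1920, dw=dh=16 and expected_area = 480*832, the width-first candidate is
# 464x848 (width x height, ratio ~0.547) and the height-first candidate is
# 480x832 (ratio ~0.577); the input ratio w/h is 0.5625, so the height-first
# candidate deviates less and best_output_size returns (480, 832).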