# DepthCrafter — depthcrafter/inference.py
import gc
import logging
import os
from typing import List, Optional
import numpy as np
import torch
from diffusers.training_utils import set_seed
from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import read_video_frames, save_video, vis_sequence_depth
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
class DepthCrafterInference:
    """Video depth estimation with the DepthCrafter diffusion pipeline.

    Wraps model loading (UNet + base pipeline), sliding-window inference
    over a video, and saving of the depth / visualization outputs.
    """

    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: Optional[str] = "model",
        device: str = "cuda",
    ):
        """
        Initialize the DepthCrafter inference pipeline.

        Args:
            unet_path (str): Path to the UNet model weights.
            pre_train_path (str): Path to the pre-trained base pipeline.
            cpu_offload (Optional[str]): CPU offload strategy ("model",
                "sequential", or None; None moves the pipeline to `device`).
            device (str): Device to run the model on ("cuda" or "cpu").

        Raises:
            ValueError: If `cpu_offload` is not one of the supported options.
        """
        # CPU lacks fp16 kernels for many ops, so fall back to fp32 there;
        # on GPU request the fp16 checkpoint variant to halve memory.
        if device == "cpu":
            dtype = torch.float32
            variant = None
        else:
            dtype = torch.float16
            variant = "fp16"

        logger.info("Loading UNet from %s", unet_path)
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            low_cpu_mem_usage=True,
            torch_dtype=dtype,
        )

        logger.info("Loading pipeline from %s", pre_train_path)
        pipeline_kwargs = {
            "unet": unet,
            "torch_dtype": dtype,
        }
        # Only request the fp16 variant when one exists (GPU path).
        if variant is not None:
            pipeline_kwargs["variant"] = variant
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_train_path,
            **pipeline_kwargs,
        )

        if cpu_offload is not None:
            if cpu_offload == "sequential":
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to(device)

        # xformers is optional; attention slicing still bounds memory use
        # whether or not the memory-efficient attention kernel is available.
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except (ImportError, ModuleNotFoundError, AttributeError) as e:
            logger.warning("Xformers is not enabled: %s", e)
        self.pipe.enable_attention_slicing()

    def infer(
        self,
        video_path: str,
        num_denoising_steps: int,
        guidance_scale: float,
        save_folder: str = "./demo_output",
        window_size: int = 110,
        process_length: int = 195,
        overlap: int = 25,
        max_res: int = 1024,
        dataset: str = "open",
        target_fps: int = 15,
        seed: int = 42,
        track_time: bool = True,
        save_npz: bool = False,
        save_exr: bool = False,
    ) -> List[str]:
        """
        Run depth inference on a video and save the results.

        Args:
            video_path (str): Path to the input video.
            num_denoising_steps (int): Number of denoising steps.
            guidance_scale (float): Guidance scale.
            save_folder (str): Folder to save output.
            window_size (int): Window size for sliding window inference.
            process_length (int): Maximum number of frames to process.
            overlap (int): Overlap between windows.
            max_res (int): Maximum resolution.
            dataset (str): Dataset name for resolution settings.
            target_fps (int): Target FPS for output video.
            seed (int): Random seed.
            track_time (bool): Whether to track execution time.
            save_npz (bool): Whether to save depth map as .npz.
            save_exr (bool): Whether to save depth map as .exr.

        Returns:
            List[str]: Paths to the saved input/visualization/depth videos.
        """
        set_seed(seed)

        # read_video_frames may adjust fps, so it returns the effective value.
        frames, target_fps = read_video_frames(
            video_path,
            process_length,
            target_fps,
            max_res,
            dataset,
        )

        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]

        # Collapse the (replicated) color channels into a single depth map.
        res = res.mean(axis=-1)

        # Normalize to [0, 1]. Guard against a constant map, which would
        # otherwise divide by zero and fill the result with NaNs.
        res_min = res.min()
        span = res.max() - res_min
        if span > 0:
            res = (res - res_min) / span
        else:
            res = np.zeros_like(res)

        vis = vis_sequence_depth(res)

        # Common stem for all outputs: <save_folder>/<video basename>.
        save_path = os.path.join(
            save_folder, os.path.splitext(os.path.basename(video_path))[0]
        )
        os.makedirs(save_folder, exist_ok=True)

        save_video(res, save_path + "_depth.mp4", fps=target_fps)
        save_video(vis, save_path + "_vis.mp4", fps=target_fps)
        save_video(frames, save_path + "_input.mp4", fps=target_fps)

        if save_npz:
            np.savez_compressed(save_path + ".npz", depth=res)
        if save_exr:
            self._save_exr(res, save_path)

        return [
            save_path + "_input.mp4",
            save_path + "_vis.mp4",
            save_path + "_depth.mp4",
        ]

    def _save_exr(self, res: np.ndarray, save_path: str):
        """
        Save each depth frame as a single-channel ("Z") EXR file under
        `save_path`/frame_NNNN.exr.

        No-op (with an error log) when the optional OpenEXR/Imath
        dependencies are not installed.
        """
        try:
            import OpenEXR
            import Imath
        except ImportError:
            logger.error("OpenEXR or Imath not installed. Skipping EXR saving.")
            return

        os.makedirs(save_path, exist_ok=True)
        logger.info("Saving EXR results to %s", save_path)

        # One channel descriptor shared by every frame header.
        float_chan = Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT))
        for i, frame in enumerate(res):
            output_exr = f"{save_path}/frame_{i:04d}.exr"
            header = OpenEXR.Header(frame.shape[1], frame.shape[0])
            header["channels"] = {"Z": float_chan}
            exr_file = OpenEXR.OutputFile(output_exr, header)
            try:
                # The header declares 32-bit FLOAT pixels, so guarantee the
                # raw bytes match that layout regardless of `res`'s dtype
                # or memory layout.
                data = np.ascontiguousarray(frame, dtype=np.float32)
                exr_file.writePixels({"Z": data.tobytes()})
            finally:
                exr_file.close()

    def clear_cache(self):
        """Run garbage collection and release cached CUDA memory (if any)."""
        gc.collect()
        # empty_cache is meaningless (and may warn) without a CUDA device.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()