musictimer committed on
Commit
41e58ab
·
1 Parent(s): 02c6351

Fix initial bugs

Browse files
Files changed (1) hide show
  1. app.py +69 -17
app.py CHANGED
@@ -167,14 +167,24 @@ class WebGameEngine:
167
  self.loading_status = "Loading model weights into agent..."
168
  logger.info("State dict loaded, applying to agent...")
169
 
170
- # Load state dict into agent, but skip actor_critic if not present
171
  has_actor_critic = any(k.startswith('actor_critic.') for k in state_dict.keys())
172
- logger.info(f"Model has actor_critic weights: {has_actor_critic}")
 
 
 
 
 
173
  agent.load_state_dict(state_dict, load_actor_critic=has_actor_critic)
174
 
175
  # Track if actor_critic was actually loaded with trained weights
176
  self.actor_critic_loaded = has_actor_critic
177
 
 
 
 
 
 
178
  self.download_progress = 100
179
  self.loading_status = "Model loaded successfully!"
180
  logger.info("All model weights loaded successfully!")
@@ -297,35 +307,61 @@ class WebGameEngine:
297
  logger.info(f"Actor-critic device: {agent.actor_critic.device}")
298
  # Force AI control for web demo
299
  self.play_env.is_human_player = False
300
- logger.info("WebPlayEnv set to AI control mode")
301
  elif agent.actor_critic is not None and not self.actor_critic_loaded:
302
- logger.warning("Actor-critic model exists but has no trained weights - using dummy mode!")
 
 
303
  self.play_env.is_human_player = True
 
304
  logger.info("WebPlayEnv set to human control mode (no trained weights)")
305
  else:
306
- logger.warning("No actor-critic model found - AI inference will not work!")
307
  self.play_env.is_human_player = True
308
  logger.info("WebPlayEnv set to human control mode (fallback)")
309
 
310
- # Enable torch.compile by default like play.py does (can disable with DISABLE_TORCH_COMPILE=1)
311
- import os, pwd
312
  try:
313
  pwd.getpwuid(os.getuid())
314
  except KeyError:
315
  os.environ["USER"] = "huggingface"
316
 
317
- os.environ["DISABLE_TORCH_COMPILE"] = "0"
318
- if device.type == "cuda" and os.getenv("DISABLE_TORCH_COMPILE", "0") != "1":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  logger.info("Compiling models for faster inference (like play.py --compile)...")
320
  try:
321
  wm_env.predict_next_obs = torch.compile(wm_env.predict_next_obs, mode="reduce-overhead")
322
  if wm_env.upsample_next_obs is not None:
323
  wm_env.upsample_next_obs = torch.compile(wm_env.upsample_next_obs, mode="reduce-overhead")
324
- logger.info("Model compilation enabled successfully!")
325
  except Exception as e:
326
- logger.warning(f"Model compilation failed: {e}")
 
327
  else:
328
- logger.info("Model compilation disabled. Set DISABLE_TORCH_COMPILE=0 to enable.")
 
329
 
330
  # Reset environment
331
  self.obs, _ = self.play_env.reset()
@@ -714,17 +750,33 @@ class WebGameEngine:
714
  start = self.time_module.time()
715
 
716
  # Use FP16 autocast for faster inference (like play.py can do with modern GPUs)
717
- from torch.cuda.amp import autocast
718
- with autocast(dtype=torch.float16, enabled=torch.cuda.is_available()):
 
719
  res = self.play_env.step_from_web_input(**web_state)
720
 
721
  infer_t = self.time_module.time() - start
722
  await self._out_queue.put((*res, infer_t))
723
  except Exception as e:
724
  logger.error(f"Inference worker error: {e}")
725
- # Put a dummy result to avoid hanging
726
- dummy_obs = self.obs if self.obs is not None else torch.zeros(3, 150, 600)
727
- await self._out_queue.put((dummy_obs, 0.0, False, False, {"error": str(e)}, 0.0))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
 
729
  # Global game engine instance
730
  game_engine = WebGameEngine()
 
167
  self.loading_status = "Loading model weights into agent..."
168
  logger.info("State dict loaded, applying to agent...")
169
 
170
+ # Check what components are in the state dict
171
  has_actor_critic = any(k.startswith('actor_critic.') for k in state_dict.keys())
172
+ has_denoiser = any(k.startswith('denoiser.') for k in state_dict.keys())
173
+ has_upsampler = any(k.startswith('upsampler.') for k in state_dict.keys())
174
+
175
+ logger.info(f"Model components found - actor_critic: {has_actor_critic}, denoiser: {has_denoiser}, upsampler: {has_upsampler}")
176
+
177
+ # Load state dict into agent
178
  agent.load_state_dict(state_dict, load_actor_critic=has_actor_critic)
179
 
180
  # Track if actor_critic was actually loaded with trained weights
181
  self.actor_critic_loaded = has_actor_critic
182
 
183
+ # For HF Spaces demo, if no actor_critic, we can still show the world model
184
+ if not has_actor_critic:
185
+ logger.warning("No actor_critic weights found - world model will work but AI won't play")
186
+ logger.info("Users can still interact and see the world model predictions")
187
+
188
  self.download_progress = 100
189
  self.loading_status = "Model loaded successfully!"
190
  logger.info("All model weights loaded successfully!")
 
307
  logger.info(f"Actor-critic device: {agent.actor_critic.device}")
308
  # Force AI control for web demo
309
  self.play_env.is_human_player = False
310
+ logger.info("WebPlayEnv set to AI control mode - ready for inference!")
311
  elif agent.actor_critic is not None and not self.actor_critic_loaded:
312
+ logger.warning("⚠️ Actor-critic model exists but has no trained weights!")
313
+ logger.info("🎮 Demo will work in world-model mode (human input + world simulation)")
314
+ # Still allow human input to drive the world model
315
  self.play_env.is_human_player = True
316
+ self.play_env.human_input_override = True # Always use human input
317
  logger.info("WebPlayEnv set to human control mode (no trained weights)")
318
  else:
319
+ logger.warning("No actor-critic model found - AI inference will not work!")
320
  self.play_env.is_human_player = True
321
  logger.info("WebPlayEnv set to human control mode (fallback)")
322
 
323
+ # Set up cache directories for HF Spaces compatibility
324
+ import os, pwd, tempfile
325
  try:
326
  pwd.getpwuid(os.getuid())
327
  except KeyError:
328
  os.environ["USER"] = "huggingface"
329
 
330
+ # Set writable cache directories for HF Spaces
331
+ cache_dir = tempfile.gettempdir()
332
+ os.environ.setdefault("TRITON_CACHE_DIR", os.path.join(cache_dir, "triton"))
333
+ os.environ.setdefault("TORCH_COMPILE_DEBUG_DIR", os.path.join(cache_dir, "torch_compile"))
334
+
335
+ # Create cache directories
336
+ for cache_var in ["TRITON_CACHE_DIR", "TORCH_COMPILE_DEBUG_DIR"]:
337
+ cache_path = os.environ[cache_var]
338
+ os.makedirs(cache_path, exist_ok=True)
339
+
340
+ # Enable torch.compile with proper error handling for HF Spaces
341
+ # Check if we're on HF Spaces (common indicators)
342
+ is_hf_spaces = any([
343
+ 'space_id' in os.environ,
344
+ 'huggingface' in os.environ.get('USER', '').lower(),
345
+ '/app' in os.getcwd()
346
+ ])
347
+
348
+ compile_enabled = (device.type == "cuda" and
349
+ os.getenv("DISABLE_TORCH_COMPILE", "0") != "1" and
350
+ not is_hf_spaces) # Disable by default on HF Spaces due to permission issues
351
+
352
+ if compile_enabled:
353
  logger.info("Compiling models for faster inference (like play.py --compile)...")
354
  try:
355
  wm_env.predict_next_obs = torch.compile(wm_env.predict_next_obs, mode="reduce-overhead")
356
  if wm_env.upsample_next_obs is not None:
357
  wm_env.upsample_next_obs = torch.compile(wm_env.upsample_next_obs, mode="reduce-overhead")
358
+ logger.info("Model compilation enabled successfully!")
359
  except Exception as e:
360
+ logger.warning(f"⚠️ Model compilation failed: {e}")
361
+ logger.info("Continuing without model compilation...")
362
  else:
363
+ reason = "HF Spaces detected" if is_hf_spaces else "disabled by env var"
364
+ logger.info(f"Model compilation disabled ({reason}). Models will run uncompiled.")
365
 
366
  # Reset environment
367
  self.obs, _ = self.play_env.reset()
 
750
  start = self.time_module.time()
751
 
752
  # Use FP16 autocast for faster inference (like play.py can do with modern GPUs)
753
+ # Use newer autocast API to avoid deprecation warning
754
+ import torch
755
+ with torch.amp.autocast('cuda', dtype=torch.float16, enabled=torch.cuda.is_available()):
756
  res = self.play_env.step_from_web_input(**web_state)
757
 
758
  infer_t = self.time_module.time() - start
759
  await self._out_queue.put((*res, infer_t))
760
  except Exception as e:
761
  logger.error(f"Inference worker error: {e}")
762
+ import traceback
763
+ logger.error(f"Full traceback: {traceback.format_exc()}")
764
+
765
+ # Create a proper dummy result with correct tensor properties
766
+ try:
767
+ if self.obs is not None and hasattr(self.obs, 'shape') and hasattr(self.obs, 'device'):
768
+ dummy_obs = self.obs.clone()
769
+ else:
770
+ # Fallback to a standard tensor on the right device
771
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
772
+ dummy_obs = torch.zeros(1, 3, 150, 600, device=device)
773
+
774
+ await self._out_queue.put((dummy_obs, 0.0, False, False, {"error": str(e)}, 0.0))
775
+ except Exception as e2:
776
+ logger.error(f"Error creating dummy result: {e2}")
777
+ # Last resort - create CPU tensor
778
+ dummy_obs = torch.zeros(1, 3, 150, 600)
779
+ await self._out_queue.put((dummy_obs, 0.0, False, False, {"error": str(e)}, 0.0))
780
 
781
  # Global game engine instance
782
  game_engine = WebGameEngine()