2.1寒假每日总结23

最最简单的超级马里奥训练过程
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3 import PPO
# Create the Super Mario Bros environment and wrap it so that the agent
# chooses among the SIMPLE_MOVEMENT action set.
env = JoypadSpace(
    gym_super_mario_bros.make('SuperMarioBros-v0'),
    SIMPLE_MOVEMENT,
)

# Directory handed to PPO for its TensorBoard logs.
tensorboard_log = r'./tensorboard_log/'

# PPO agent with the "CnnPolicy" policy; verbose=1 prints the per-iteration
# training statistics tables.
model = PPO(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=tensorboard_log,
)

# Train for the requested number of environment steps, then save the model
# under the name "mario_model".
model.learn(total_timesteps=25000)
model.save("mario_model")
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./tensorboard_log/PPO_1


D:\software\e_anaconda\envs\pytorch\lib\site-packages\gym_super_mario_bros\smb_env.py:148: RuntimeWarning: overflow encountered in ubyte_scalars
  return (self.ram[0x86] - self.ram[0x071c]) % 256


-----------------------------
| time/              |      |
|    fps             | 116  |
|    iterations      | 1    |
|    time_elapsed    | 17   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 2           |
|    time_elapsed         | 50          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.025405666 |
|    clip_fraction        | 0.274       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.92       |
|    explained_variance   | 0.00504     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.621       |
|    n_updates            | 10          |
|    policy_gradient_loss | 0.0109      |
|    value_loss           | 17.4        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 3           |
|    time_elapsed         | 83          |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.010906073 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.92       |
|    explained_variance   | 0.0211      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.101       |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00392    |
|    value_loss           | 0.187       |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 69          |
|    iterations           | 4           |
|    time_elapsed         | 117         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.009882288 |
|    clip_fraction        | 0.0681      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | 0.101       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0738      |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.00502    |
|    value_loss           | 0.13        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+04    |
|    ep_rew_mean          | 891         |
| time/                   |             |
|    fps                  | 65          |
|    iterations           | 5           |
|    time_elapsed         | 156         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008186281 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.87       |
|    explained_variance   | 0.0161      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.28        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.00649    |
|    value_loss           | 0.811       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+04    |
|    ep_rew_mean          | 891         |
| time/                   |             |
|    fps                  | 64          |
|    iterations           | 6           |
|    time_elapsed         | 190         |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.024062362 |
|    clip_fraction        | 0.246       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | 0.269       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.54        |
|    n_updates            | 50          |
|    policy_gradient_loss | 0.0362      |
|    value_loss           | 10.8        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+04    |
|    ep_rew_mean          | 891         |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 7           |
|    time_elapsed         | 225         |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.024466533 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.89       |
|    explained_variance   | 0.839       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.435       |
|    n_updates            | 60          |
|    policy_gradient_loss | 0.023       |
|    value_loss           | 3.06        |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.01e+04   |
|    ep_rew_mean          | 891        |
| time/                   |            |
|    fps                  | 63         |
|    iterations           | 8          |
|    time_elapsed         | 259        |
|    total_timesteps      | 16384      |
| train/                  |            |
|    approx_kl            | 0.01970315 |
|    clip_fraction        | 0.242      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.9       |
|    explained_variance   | 0.486      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.526      |
|    n_updates            | 70         |
|    policy_gradient_loss | 0.00486    |
|    value_loss           | 1.57       |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+04    |
|    ep_rew_mean          | 891         |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 9           |
|    time_elapsed         | 293         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012460884 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.87       |
|    explained_variance   | 0.74        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.139       |
|    n_updates            | 80          |
|    policy_gradient_loss | -0.000311   |
|    value_loss           | 0.734       |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.01e+04   |
|    ep_rew_mean          | 891        |
| time/                   |            |
|    fps                  | 62         |
|    iterations           | 10         |
|    time_elapsed         | 327        |
|    total_timesteps      | 20480      |
| train/                  |            |
|    approx_kl            | 0.02535792 |
|    clip_fraction        | 0.298      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.88      |
|    explained_variance   | 0.405      |
|    learning_rate        | 0.0003     |
|    loss                 | 1.17       |
|    n_updates            | 90         |
|    policy_gradient_loss | 0.0205     |
|    value_loss           | 6.6        |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.01e+04    |
|    ep_rew_mean          | 891         |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 11          |
|    time_elapsed         | 361         |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.019694094 |
|    clip_fraction        | 0.243       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.91       |
|    explained_variance   | 0.952       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.39        |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00434    |
|    value_loss           | 1.31        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.19e+04    |
|    ep_rew_mean          | 884         |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 12          |
|    time_elapsed         | 398         |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.013096321 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.91       |
|    explained_variance   | 0.0132      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.669       |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.000837   |
|    value_loss           | 1.42        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.19e+04    |
|    ep_rew_mean          | 884         |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 13          |
|    time_elapsed         | 432         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.014833134 |
|    clip_fraction        | 0.239       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | 0.452       |
|    learning_rate        | 0.0003      |
|    loss                 | 18.1        |
|    n_updates            | 120         |
|    policy_gradient_loss | -7.3e-05    |
|    value_loss           | 26.3        |
-----------------------------------------

测试代码

from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3 import PPO
# Rebuild the same environment that was used for training and load the
# model saved under the name "mario_model".
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
model = PPO.load("mario_model")

try:
    # The frame is copied before it is given to the model, as in the original
    # script -- presumably so the model receives its own contiguous array
    # rather than a view into the emulator's screen buffer (TODO confirm).
    obs = env.reset().copy()
    while True:
        action, _states = model.predict(obs)
        # predict() returns a NumPy value; hand the discrete-action wrapper a
        # plain Python int.
        obs, rewards, done, info = env.step(int(action))
        obs = obs.copy()
        env.render()
        if done:
            # Start a new episode and feed its first frame to the policy.
            # The result must be stored in `obs`; otherwise the next
            # prediction is made from the finished episode's last frame.
            obs = env.reset().copy()
finally:
    # Release the emulator/window even when the loop is interrupted
    # (e.g. with Ctrl-C).
    env.close()
posted @ 2024-02-01 21:32  风·华正茂  阅读(20)  评论(0)  编辑  收藏  举报