Analysis of the common/retro_wrappers.py module in the baselines library
Code of the retro_wrappers.py module:

from collections import deque
import cv2
cv2.ocl.setUseOpenCL(False)
from .atari_wrappers import WarpFrame, ClipRewardEnv, FrameStack, ScaledFloatFrame
from .wrappers import TimeLimit
import numpy as np
import gym


class StochasticFrameSkip(gym.Wrapper):
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        done = False
        totrew = 0
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i==0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i==1:
                self.curac = ac
            if self.supports_want_render and i<self.n-1:
                ob, rew, done, info = self.env.step(self.curac, want_render=False)
            else:
                ob, rew, done, info = self.env.step(self.curac)
            totrew += rew
            if done: break
        return ob, totrew, done, info

    def seed(self, s):
        self.rng.seed(s)


class PartialFrameStack(gym.Wrapper):
    def __init__(self, env, k, channel=1):
        """
        Stack one channel (channel keyword) from previous frames
        """
        gym.Wrapper.__init__(self, env)
        shp = env.observation_space.shape
        self.channel = channel
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(shp[0], shp[1], shp[2] + k - 1),
            dtype=env.observation_space.dtype)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape

    def reset(self):
        ob = self.env.reset()
        assert ob.shape[2] > self.channel
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, ac):
        ob, reward, done, info = self.env.step(ac)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
                               for (i, frame) in enumerate(self.frames)], axis=2)


class Downsample(gym.ObservationWrapper):
    def __init__(self, env, ratio):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (oldh//ratio, oldw//ratio, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame


class Rgb2gray(gym.ObservationWrapper):
    def __init__(self, env):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, _oldc) = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(oldh, oldw, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return frame[:,:,None]


class MovieRecord(gym.Wrapper):
    def __init__(self, env, savedir, k):
        gym.Wrapper.__init__(self, env)
        self.savedir = savedir
        self.k = k
        self.epcount = 0

    def reset(self):
        if self.epcount % self.k == 0:
            self.env.unwrapped.movie_path = self.savedir
        else:
            self.env.unwrapped.movie_path = None
            self.env.unwrapped.movie = None
        self.epcount += 1
        return self.env.reset()


class AppendTimeout(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.action_space = env.action_space
        self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
        self.original_os = env.observation_space
        if isinstance(self.original_os, gym.spaces.Dict):
            import copy
            ordered_dict = copy.deepcopy(self.original_os.spaces)
            ordered_dict['value_estimation_timeout'] = self.timeout_space
            self.observation_space = gym.spaces.Dict(ordered_dict)
            self.dict_mode = True
        else:
            self.observation_space = gym.spaces.Dict({
                'original': self.original_os,
                'value_estimation_timeout': self.timeout_space
            })
            self.dict_mode = False
        self.ac_count = None
        while 1:
            if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
                env = env.env
                continue
            break
        self.timeout = env._max_episode_steps

    def step(self, ac):
        self.ac_count += 1
        ob, rew, done, info = self.env.step(ac)
        return self._process(ob), rew, done, info

    def reset(self):
        self.ac_count = 0
        return self._process(self.env.reset())

    def _process(self, ob):
        fracmissing = 1 - self.ac_count / self.timeout
        if self.dict_mode:
            ob['value_estimation_timeout'] = fracmissing
        else:
            return { 'original': ob, 'value_estimation_timeout': fracmissing }


class StartDoingRandomActionsWrapper(gym.Wrapper):
    """
    Warning: can eat info dicts, not good if you depend on them
    """
    def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
        gym.Wrapper.__init__(self, env)
        self.on_startup = on_startup
        self.every_episode = every_episode
        self.random_steps = max_random_steps
        self.last_obs = None
        if on_startup:
            self.some_random_steps()

    def some_random_steps(self):
        self.last_obs = self.env.reset()
        n = np.random.randint(self.random_steps)
        #print("running for random %i frames" % n)
        for _ in range(n):
            self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
            if done: self.last_obs = self.env.reset()

    def reset(self):
        return self.last_obs

    def step(self, a):
        self.last_obs, rew, done, info = self.env.step(a)
        if done:
            self.last_obs = self.env.reset()
            if self.every_episode:
                self.some_random_steps()
        return self.last_obs, rew, done, info


def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs):
    import retro
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env


def wrap_deepmind_retro(env, scale=True, frame_stack=4):
    """
    Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
    """
    env = WarpFrame(env)
    env = ClipRewardEnv(env)
    if frame_stack > 1:
        env = FrameStack(env, frame_stack)
    if scale:
        env = ScaledFloatFrame(env)
    return env


class SonicDiscretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()


class RewardScaler(gym.RewardWrapper):
    """
    Bring rewards to a reasonable scale for PPO.
    This is incredibly important and effects performance
    drastically.
    """
    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale


class AllowBacktracking(gym.Wrapper):
    """
    Use deltas in max(X) as the reward, rather than deltas
    in X. This way, agents are not discouraged too heavily
    from exploring backwards if there is no way to advance
    head-on in the level.
    """
    def __init__(self, env):
        super(AllowBacktracking, self).__init__(env)
        self._cur_x = 0
        self._max_x = 0

    def reset(self, **kwargs): # pylint: disable=E0202
        self._cur_x = 0
        self._max_x = 0
        return self.env.reset(**kwargs)

    def step(self, action): # pylint: disable=E0202
        obs, rew, done, info = self.env.step(action)
        self._cur_x += rew
        rew = max(0, self._cur_x - self._max_x)
        self._max_x = max(self._max_x, self._cur_x)
        return obs, rew, done, info
As its name suggests, this module provides wrappers for the retro (gym-retro) environment library.
Its wrappers are similar to, but not identical with, the Atari wrappers; the best-known retro environments are probably Super Mario, Tetris, and Sonic the Hedgehog.
Because the module processes frames with OpenCV, OpenCL is disabled right at the top of the file to avoid conflicts (e.g. with CUDA):
cv2.ocl.setUseOpenCL(False)
class StochasticFrameSkip(gym.Wrapper):
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        done = False
        totrew = 0
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i==0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i==1:
                self.curac = ac
            if self.supports_want_render and i<self.n-1:
                ob, rew, done, info = self.env.step(self.curac, want_render=False)
            else:
                ob, rew, done, info = self.env.step(self.curac)
            totrew += rew
            if done: break
        return ob, totrew, done, info

    def seed(self, s):
        self.rng.seed(s)
The key part of the StochasticFrameSkip wrapper is its step method:
The class implements frame skipping: each received action triggers n interactions with the environment. Unlike an ordinary frame skip, this one is stochastic ("sticky actions"): on the first of the n sub-steps, the previous action is kept with probability stickprob instead of switching to the newly received action.
From the second sub-step onward (i == 1 and later), the action applied to the environment is always the action passed to this call of step.
One small detail: if the environment supports the want_render flag, rendering is only performed on the last of the n sub-steps (want_render=False is passed for the earlier ones).
Since one received action leads to n environment interactions, the returned reward is the sum of the rewards collected over those n sub-steps.
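To make the sticky-action behaviour concrete, here is a small self-contained sketch. The toy _EchoEnv is my own illustration (not part of baselines): its raw reward simply echoes whichever action was actually applied, so the summed reward reveals whether the old action stuck on the first sub-step.

import gym
from baselines.common.retro_wrappers import StochasticFrameSkip

class _EchoEnv(gym.Env):
    """Toy env: the reward equals the action that was actually applied."""
    action_space = gym.spaces.Discrete(10)
    observation_space = gym.spaces.Discrete(1)
    def reset(self):
        return 0
    def step(self, ac):
        return 0, float(ac), False, {}

env = StochasticFrameSkip(_EchoEnv(), n=4, stickprob=0.25)
env.seed(0)                     # seeds the wrapper's own RandomState
env.reset()
_, totrew, _, _ = env.step(3)   # first step after reset: all 4 sub-steps apply action 3
print(totrew)                   # 12.0
_, totrew, _, _ = env.step(7)   # on sub-step 0 the old action 3 sticks with prob 0.25
print(totrew)                   # usually 28.0, sometimes 24.0 (3 + 7*3)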
class PartialFrameStack(gym.Wrapper):
    def __init__(self, env, k, channel=1):
        """
        Stack one channel (channel keyword) from previous frames
        """
        gym.Wrapper.__init__(self, env)
        shp = env.observation_space.shape
        self.channel = channel
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(shp[0], shp[1], shp[2] + k - 1),
            dtype=env.observation_space.dtype)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape

    def reset(self):
        ob = self.env.reset()
        assert ob.shape[2] > self.channel
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, ac):
        ob, reward, done, info = self.env.step(ac)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
                               for (i, frame) in enumerate(self.frames)], axis=2)
Stacks k game frames along the channel dimension.
Note that this is not the usual full frame stack but a partial one: of the first k-1 frames only the single specified channel is kept, and only the last (k-th) frame contributes all of its channels, i.e.:
frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
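A quick numpy sketch of the resulting shape (the 84x84x3 frames are my own example numbers): the first k-1 frames each contribute one channel and the last frame all of its channels, giving c + k - 1 channels in total, which matches the observation_space defined above.

import numpy as np

k, channel = 4, 1
frames = [np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8) for _ in range(k)]
stacked = np.concatenate(
    [f if i == k - 1 else f[:, :, channel:channel + 1] for i, f in enumerate(frames)],
    axis=2)
print(stacked.shape)   # (84, 84, 6), i.e. 3 + k - 1 channels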
class Downsample(gym.ObservationWrapper):
    def __init__(self, env, ratio):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (oldh//ratio, oldw//ratio, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame
An observation wrapper that downsamples the frame by the given ratio. Note that the observation it returns always keeps a channel dimension, i.e. it is always a 3-D np.array.
If the resized frame comes back 2-D, the channel dimension is added back:
if frame.ndim == 2:
    frame = frame[:,:,None]
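A short sketch (with my own example sizes) of why the ndim check is needed: cv2.resize drops the channel axis of a single-channel image, and Downsample puts it back.

import cv2
import numpy as np

frame = np.zeros((224, 320, 1), dtype=np.uint8)   # single-channel input, ratio = 2
small = cv2.resize(frame, (320 // 2, 224 // 2), interpolation=cv2.INTER_AREA)
print(small.shape)               # (112, 160)    -- channel axis was dropped by OpenCV
print(small[:, :, None].shape)   # (112, 160, 1) -- restored, as in Downsample.observation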
class Rgb2gray(gym.ObservationWrapper):
    def __init__(self, env):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, _oldc) = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(oldh, oldw, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return frame[:,:,None]
Converts the RGB frame to a grayscale image with OpenCV (the docstring is a copy-paste leftover from Downsample). Again the channel dimension is restored, so the returned observation is always 3-D.
The dimension expansion:
return frame[:,:,None]
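The same point in a short sketch (sizes are my own example): cv2.cvtColor returns a 2-D array for a grayscale result, and the [:,:,None] indexing restores the channel axis.

import cv2
import numpy as np

rgb = np.random.randint(0, 256, (224, 320, 3), dtype=np.uint8)
gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
print(gray.shape)                # (224, 320)
print(gray[:, :, None].shape)    # (224, 320, 1) -- what Rgb2gray returns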
class MovieRecord(gym.Wrapper):
    def __init__(self, env, savedir, k):
        gym.Wrapper.__init__(self, env)
        self.savedir = savedir
        self.k = k
        self.epcount = 0

    def reset(self):
        if self.epcount % self.k == 0:
            self.env.unwrapped.movie_path = self.savedir
        else:
            self.env.unwrapped.movie_path = None
            self.env.unwrapped.movie = None
        self.epcount += 1
        return self.env.reset()
On every reset, sets env.unwrapped.movie_path so that recording is only enabled on every k-th episode; on the other episodes the path and movie are cleared.
What exactly setting this movie path does depends on the underlying retro environment; I could not find this wrapper being used anywhere else in the baselines project.
class AppendTimeout(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.action_space = env.action_space
        self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
        self.original_os = env.observation_space
        if isinstance(self.original_os, gym.spaces.Dict):
            import copy
            ordered_dict = copy.deepcopy(self.original_os.spaces)
            ordered_dict['value_estimation_timeout'] = self.timeout_space
            self.observation_space = gym.spaces.Dict(ordered_dict)
            self.dict_mode = True
        else:
            self.observation_space = gym.spaces.Dict({
                'original': self.original_os,
                'value_estimation_timeout': self.timeout_space
            })
            self.dict_mode = False
        self.ac_count = None
        while 1:
            if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
                env = env.env
                continue
            break
        self.timeout = env._max_episode_steps

    def step(self, ac):
        self.ac_count += 1
        ob, rew, done, info = self.env.step(ac)
        return self._process(ob), rew, done, info

    def reset(self):
        self.ac_count = 0
        return self._process(self.env.reset())

    def _process(self, ob):
        fracmissing = 1 - self.ac_count / self.timeout
        if self.dict_mode:
            ob['value_estimation_timeout'] = fracmissing
        else:
            return { 'original': ob, 'value_estimation_timeout': fracmissing }
If the observation space is a gym.spaces.Dict, the wrapper adds a key 'value_estimation_timeout' whose value is the fraction of the episode's step budget that is still remaining (1 - ac_count / timeout).
If the observation is a plain np.array, it is wrapped into a dict under the key 'original', and the same 'value_estimation_timeout' entry is added alongside it.
So the class mainly wraps the observation into a dict and appends the 'value_estimation_timeout' entry.
To find the step limit, the constructor walks down the wrapper chain (env = env.env) until it finds an env that has the _max_episode_steps attribute, i.e. the TimeLimit wrapper.
The purpose of the class is therefore to record in every observation how far the current step count is from the maximum episode length.
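A minimal sketch of the extra observation entry. I use CartPole-v0 only because gym's own TimeLimit wrapper already carries _max_episode_steps (200 for this env); this assumes the older gym API that baselines targets.

import gym
from baselines.common.retro_wrappers import AppendTimeout

env = AppendTimeout(gym.make('CartPole-v0'))     # TimeLimit provides _max_episode_steps == 200
ob = env.reset()
print(sorted(ob.keys()))                         # ['original', 'value_estimation_timeout']
print(ob['value_estimation_timeout'])            # 1.0   (no steps used yet)
ob, _, _, _ = env.step(env.action_space.sample())
print(ob['value_estimation_timeout'])            # 0.995 (1 - 1/200)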
class StartDoingRandomActionsWrapper(gym.Wrapper):
    """
    Warning: can eat info dicts, not good if you depend on them
    """
    def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
        gym.Wrapper.__init__(self, env)
        self.on_startup = on_startup
        self.every_episode = every_episode
        self.random_steps = max_random_steps
        self.last_obs = None
        if on_startup:
            self.some_random_steps()

    def some_random_steps(self):
        self.last_obs = self.env.reset()
        n = np.random.randint(self.random_steps)
        #print("running for random %i frames" % n)
        for _ in range(n):
            self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
            if done: self.last_obs = self.env.reset()

    def reset(self):
        return self.last_obs

    def step(self, a):
        self.last_obs, rew, done, info = self.env.step(a)
        if done:
            self.last_obs = self.env.reset()
            if self.every_episode:
                self.some_random_steps()
        return self.last_obs, rew, done, info
Optionally performs a number of random actions at the start of an episode.
Key code:
for _ in range(n):
    self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
    if done: self.last_obs = self.env.reset()
The wrapper can be configured to do this once when it is constructed (on_startup=True) and/or at the start of every episode (every_episode=True); the number of random steps is drawn uniformly from [0, max_random_steps).
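A hedged usage sketch (CartPole-v0 is just a stand-in for any environment; note the docstring warning that info dicts from the random warm-up phase are thrown away):

import gym
from baselines.common.retro_wrappers import StartDoingRandomActionsWrapper

env = StartDoingRandomActionsWrapper(gym.make('CartPole-v0'),
                                     max_random_steps=30,
                                     on_startup=True,     # warm up once when the wrapper is built
                                     every_episode=True)  # ...and again after every done
ob = env.reset()   # returns the observation reached after the random warm-up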
def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs):
    import retro
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
Combines the wrappers defined above.
The environment created by retro.make is wrapped with StochasticFrameSkip (n=4, stickprob=0.25) and, unless max_episode_steps is None, with TimeLimit.
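A usage sketch. 'Airstriker-Genesis' is, as far as I know, the free ROM bundled with gym-retro and serves only as an example; any other installed game/state pair works the same way.

from baselines.common.retro_wrappers import make_retro

env = make_retro(game='Airstriker-Genesis', max_episode_steps=4500)
ob = env.reset()
ob, rew, done, info = env.step(env.action_space.sample())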
def wrap_deepmind_retro(env, scale=True, frame_stack=4):
    """
    Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
    """
    env = WarpFrame(env)
    env = ClipRewardEnv(env)
    if frame_stack > 1:
        env = FrameStack(env, frame_stack)
    if scale:
        env = ScaledFloatFrame(env)
    return env
Wraps a retro game with the environment wrappers used for Atari games:
WarpFrame converts the frame to grayscale and resizes it to 84x84.
ClipRewardEnv clips the reward to its sign, i.e. -1, 0 or +1.
FrameStack stacks the last k frames along the channel dimension.
ScaledFloatFrame converts the pixel values from uint8 in [0, 255] to float32 in [0, 1].
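Putting the two helpers together gives the typical retro training pipeline (the game name is again only an example; the resulting observation shape is what I expect from the wrappers above):

from baselines.common.retro_wrappers import make_retro, wrap_deepmind_retro

env = make_retro(game='Airstriker-Genesis')           # retro.make + StochasticFrameSkip + TimeLimit
env = wrap_deepmind_retro(env, scale=True, frame_stack=4)
print(env.observation_space.shape)                    # (84, 84, 4), float32 in [0, 1]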
class SonicDiscretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()
Wraps the action space.
The actions accepted from the outside correspond to the following button combinations:
actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
['DOWN', 'B'], ['B']]
The incoming action is an integer: 0 selects ['LEFT'], 1 selects ['RIGHT'], 2 selects ['LEFT', 'DOWN'], and so on.
So the external actions are the integers 0 to 6, while the underlying retro environment expects the 12 buttons:
buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
The action forwarded to the inner retro environment is a boolean vector over these 12 buttons. Unlike a strict one-hot encoding, a combination may press two buttons at once, which is exactly what is needed to represent the external actions 0 to 6:
for action in actions:
    arr = np.array([False] * 12)
    for button in action:
        arr[buttons.index(button)] = True
    self._actions.append(arr)
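The mapping itself can be checked with plain numpy (no emulator needed); for example, discrete action 3, i.e. ['RIGHT', 'DOWN'], becomes the following 12-button vector:

import numpy as np

buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
combo = ['RIGHT', 'DOWN']            # discrete action 3
arr = np.array([False] * 12)
for button in combo:
    arr[buttons.index(button)] = True
print(arr.astype(int))               # [0 0 0 0 0 1 0 1 0 0 0 0] -> DOWN (5) and RIGHT (7) pressed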
class RewardScaler(gym.RewardWrapper):
    """
    Bring rewards to a reasonable scale for PPO.
    This is incredibly important and effects performance
    drastically.
    """
    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale
Wraps the environment's reward and rescales it. According to the docstring this is mainly intended for PPO and affects performance drastically.
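A short usage sketch (scale=0.01 is the default; the game name is only an example):

from baselines.common.retro_wrappers import RewardScaler, make_retro

env = RewardScaler(make_retro(game='Airstriker-Genesis'), scale=0.01)
ob = env.reset()
_, rew, _, _ = env.step(env.action_space.sample())   # rew is the raw game reward * 0.01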
class AllowBacktracking(gym.Wrapper):
    """
    Use deltas in max(X) as the reward, rather than deltas
    in X. This way, agents are not discouraged too heavily
    from exploring backwards if there is no way to advance
    head-on in the level.
    """
    def __init__(self, env):
        super(AllowBacktracking, self).__init__(env)
        self._cur_x = 0
        self._max_x = 0

    def reset(self, **kwargs): # pylint: disable=E0202
        self._cur_x = 0
        self._max_x = 0
        return self.env.reset(**kwargs)

    def step(self, action): # pylint: disable=E0202
        obs, rew, done, info = self.env.step(action)
        self._cur_x += rew
        rew = max(0, self._cur_x - self._max_x)
        self._max_x = max(self._max_x, self._cur_x)
        return obs, rew, done, info
My understanding is that this wrapper is aimed at horizontally scrolling platformers such as Sonic (or Super Mario); by wrapping reset and step it customizes the reward.
Since this is a reward wrapper and I am not familiar with how these games define their raw reward, the following is reverse-engineered from the code:
Assume a Mario-style game in which the agent gets a positive reward for moving right (towards the goal) and a negative reward for moving left (away from it).
self._max_x records the right-most position reached so far, and self._cur_x is the running sum of the raw rewards, i.e. roughly the current x position. Whenever the agent is to the left of self._max_x, the wrapped reward is 0 no matter what action it takes, because:
rew = max(0, self._cur_x - self._max_x)
In effect, this reward shaping does not reward re-exploring ground that has already been covered (anything left of self._max_x), but it does not punish it either: as long as the agent stays behind self._max_x the reward is 0, and new reward is only handed out when self._max_x is pushed further to the right.
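A small numeric trace of the shaping, using a toy environment of my own in which the raw reward is the signed change in x per step (standing in for the real game):

import gym
from baselines.common.retro_wrappers import AllowBacktracking

class _XDeltaEnv(gym.Env):
    """Toy env: raw reward is +1 for a step right, -1 for a step left."""
    action_space = gym.spaces.Discrete(2)        # 0 = left, 1 = right
    observation_space = gym.spaces.Discrete(1)
    def reset(self):
        return 0
    def step(self, ac):
        return 0, 1.0 if ac == 1 else -1.0, False, {}

env = AllowBacktracking(_XDeltaEnv())
env.reset()
for ac in [1, 1, 0, 0, 1, 1, 1]:
    _, rew, _, _ = env.step(ac)
    print(ac, rew)
# right, right   -> 1, 1   (new ground, _max_x grows to 2)
# left, left     -> 0, 0   (backtracking is not punished)
# right, right   -> 0, 0   (still at or behind _max_x == 2)
# right          -> 1      (_max_x pushed from 2 to 3)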