Analysis of the common/retro_wrappers.py module in the baselines library
Code of the retro_wrappers.py module:

from collections import deque
import cv2
cv2.ocl.setUseOpenCL(False)
from .atari_wrappers import WarpFrame, ClipRewardEnv, FrameStack, ScaledFloatFrame
from .wrappers import TimeLimit
import numpy as np
import gym


class StochasticFrameSkip(gym.Wrapper):
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        done = False
        totrew = 0
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i==0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i==1:
                self.curac = ac
            if self.supports_want_render and i<self.n-1:
                ob, rew, done, info = self.env.step(self.curac, want_render=False)
            else:
                ob, rew, done, info = self.env.step(self.curac)
            totrew += rew
            if done: break
        return ob, totrew, done, info

    def seed(self, s):
        self.rng.seed(s)


class PartialFrameStack(gym.Wrapper):
    def __init__(self, env, k, channel=1):
        """
        Stack one channel (channel keyword) from previous frames
        """
        gym.Wrapper.__init__(self, env)
        shp = env.observation_space.shape
        self.channel = channel
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(shp[0], shp[1], shp[2] + k - 1),
            dtype=env.observation_space.dtype)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape

    def reset(self):
        ob = self.env.reset()
        assert ob.shape[2] > self.channel
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, ac):
        ob, reward, done, info = self.env.step(ac)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
                               for (i, frame) in enumerate(self.frames)], axis=2)


class Downsample(gym.ObservationWrapper):
    def __init__(self, env, ratio):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (oldh//ratio, oldw//ratio, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame


class Rgb2gray(gym.ObservationWrapper):
    def __init__(self, env):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, _oldc) = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(oldh, oldw, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return frame[:,:,None]


class MovieRecord(gym.Wrapper):
    def __init__(self, env, savedir, k):
        gym.Wrapper.__init__(self, env)
        self.savedir = savedir
        self.k = k
        self.epcount = 0

    def reset(self):
        if self.epcount % self.k == 0:
            self.env.unwrapped.movie_path = self.savedir
        else:
            self.env.unwrapped.movie_path = None
            self.env.unwrapped.movie = None
        self.epcount += 1
        return self.env.reset()


class AppendTimeout(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.action_space = env.action_space
        self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
        self.original_os = env.observation_space
        if isinstance(self.original_os, gym.spaces.Dict):
            import copy
            ordered_dict = copy.deepcopy(self.original_os.spaces)
            ordered_dict['value_estimation_timeout'] = self.timeout_space
            self.observation_space = gym.spaces.Dict(ordered_dict)
            self.dict_mode = True
        else:
            self.observation_space = gym.spaces.Dict({
                'original': self.original_os,
                'value_estimation_timeout': self.timeout_space
            })
            self.dict_mode = False
        self.ac_count = None
        while 1:
            if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
                env = env.env
                continue
            break
        self.timeout = env._max_episode_steps

    def step(self, ac):
        self.ac_count += 1
        ob, rew, done, info = self.env.step(ac)
        return self._process(ob), rew, done, info

    def reset(self):
        self.ac_count = 0
        return self._process(self.env.reset())

    def _process(self, ob):
        fracmissing = 1 - self.ac_count / self.timeout
        if self.dict_mode:
            ob['value_estimation_timeout'] = fracmissing
        else:
            return { 'original': ob, 'value_estimation_timeout': fracmissing }


class StartDoingRandomActionsWrapper(gym.Wrapper):
    """
    Warning: can eat info dicts, not good if you depend on them
    """
    def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
        gym.Wrapper.__init__(self, env)
        self.on_startup = on_startup
        self.every_episode = every_episode
        self.random_steps = max_random_steps
        self.last_obs = None
        if on_startup:
            self.some_random_steps()

    def some_random_steps(self):
        self.last_obs = self.env.reset()
        n = np.random.randint(self.random_steps)
        #print("running for random %i frames" % n)
        for _ in range(n):
            self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
            if done: self.last_obs = self.env.reset()

    def reset(self):
        return self.last_obs

    def step(self, a):
        self.last_obs, rew, done, info = self.env.step(a)
        if done:
            self.last_obs = self.env.reset()
            if self.every_episode:
                self.some_random_steps()
        return self.last_obs, rew, done, info


def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs):
    import retro
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env


def wrap_deepmind_retro(env, scale=True, frame_stack=4):
    """
    Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
    """
    env = WarpFrame(env)
    env = ClipRewardEnv(env)
    if frame_stack > 1:
        env = FrameStack(env, frame_stack)
    if scale:
        env = ScaledFloatFrame(env)
    return env


class SonicDiscretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()


class RewardScaler(gym.RewardWrapper):
    """
    Bring rewards to a reasonable scale for PPO.
    This is incredibly important and effects performance
    drastically.
    """
    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale


class AllowBacktracking(gym.Wrapper):
    """
    Use deltas in max(X) as the reward, rather than deltas
    in X. This way, agents are not discouraged too heavily
    from exploring backwards if there is no way to advance
    head-on in the level.
    """
    def __init__(self, env):
        super(AllowBacktracking, self).__init__(env)
        self._cur_x = 0
        self._max_x = 0

    def reset(self, **kwargs): # pylint: disable=E0202
        self._cur_x = 0
        self._max_x = 0
        return self.env.reset(**kwargs)

    def step(self, action): # pylint: disable=E0202
        obs, rew, done, info = self.env.step(action)
        self._cur_x += rew
        rew = max(0, self._cur_x - self._max_x)
        self._max_x = max(self._max_x, self._cur_x)
        return obs, rew, done, info
As its name suggests, this module provides wrappers for the retro (gym-retro) environment library.
Its wrappers are similar to, but not identical with, the Atari wrappers; the best-known retro environments are probably Super Mario, Tetris, and Sonic the Hedgehog.
Because the module processes frames with OpenCV, OpenCL is disabled right at the top of the file to avoid conflicts (e.g. with CUDA):
cv2.ocl.setUseOpenCL(False)
class StochasticFrameSkip(gym.Wrapper):
    def __init__(self, env, n, stickprob):
        gym.Wrapper.__init__(self, env)
        self.n = n
        self.stickprob = stickprob
        self.curac = None
        self.rng = np.random.RandomState()
        self.supports_want_render = hasattr(env, "supports_want_render")

    def reset(self, **kwargs):
        self.curac = None
        return self.env.reset(**kwargs)

    def step(self, ac):
        done = False
        totrew = 0
        for i in range(self.n):
            # First step after reset, use action
            if self.curac is None:
                self.curac = ac
            # First substep, delay with probability=stickprob
            elif i==0:
                if self.rng.rand() > self.stickprob:
                    self.curac = ac
            # Second substep, new action definitely kicks in
            elif i==1:
                self.curac = ac
            if self.supports_want_render and i<self.n-1:
                ob, rew, done, info = self.env.step(self.curac, want_render=False)
            else:
                ob, rew, done, info = self.env.step(self.curac)
            totrew += rew
            if done: break
        return ob, totrew, done, info

    def seed(self, s):
        self.rng.seed(s)
The key part of the StochasticFrameSkip wrapper is its step method:
The class implements frame skipping: each received action triggers n interactions with the environment. Unlike an ordinary frame skip, this one is stochastic ("sticky actions"): on the first of the n sub-steps, the previous action is kept with probability stickprob instead of switching to the newly received action.
From the second sub-step onward (i == 1 and later), the action applied to the environment is always the action passed to this call of step.
One small detail: if the environment supports the want_render flag, rendering is only performed on the last of the n sub-steps (want_render=False is passed for the earlier ones).
Since one received action leads to n environment interactions, the returned reward is the sum of the rewards collected over those n sub-steps.
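To make the sticky-action behaviour concrete, here is a small self-contained sketch. The toy _EchoEnv is my own illustration (not part of baselines): its raw reward simply echoes whichever action was actually applied, so the summed reward reveals whether the old action stuck on the first sub-step.

import gym
from baselines.common.retro_wrappers import StochasticFrameSkip

class _EchoEnv(gym.Env):
    """Toy env: the reward equals the action that was actually applied."""
    action_space = gym.spaces.Discrete(10)
    observation_space = gym.spaces.Discrete(1)
    def reset(self):
        return 0
    def step(self, ac):
        return 0, float(ac), False, {}

env = StochasticFrameSkip(_EchoEnv(), n=4, stickprob=0.25)
env.seed(0)                     # seeds the wrapper's own RandomState
env.reset()
_, totrew, _, _ = env.step(3)   # first step after reset: all 4 sub-steps apply action 3
print(totrew)                   # 12.0
_, totrew, _, _ = env.step(7)   # on sub-step 0 the old action 3 sticks with prob 0.25
print(totrew)                   # usually 28.0, sometimes 24.0 (3 + 7*3)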
class PartialFrameStack(gym.Wrapper):
    def __init__(self, env, k, channel=1):
        """
        Stack one channel (channel keyword) from previous frames
        """
        gym.Wrapper.__init__(self, env)
        shp = env.observation_space.shape
        self.channel = channel
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(shp[0], shp[1], shp[2] + k - 1),
            dtype=env.observation_space.dtype)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape

    def reset(self):
        ob = self.env.reset()
        assert ob.shape[2] > self.channel
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, ac):
        ob, reward, done, info = self.env.step(ac)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return np.concatenate([frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
                               for (i, frame) in enumerate(self.frames)], axis=2)
Stacks k game frames along the channel dimension.
Note that this is not the usual full frame stack but a partial one: of the first k-1 frames only the single specified channel is kept, and only the last (k-th) frame contributes all of its channels, i.e.:
frame if i==self.k-1 else frame[:,:,self.channel:self.channel+1]
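A quick numpy sketch of the resulting shape (the 84x84x3 frames are my own example numbers): the first k-1 frames each contribute one channel and the last frame all of its channels, giving c + k - 1 channels in total, which matches the observation_space defined above.

import numpy as np

k, channel = 4, 1
frames = [np.random.randint(0, 256, (84, 84, 3), dtype=np.uint8) for _ in range(k)]
stacked = np.concatenate(
    [f if i == k - 1 else f[:, :, channel:channel + 1] for i, f in enumerate(frames)],
    axis=2)
print(stacked.shape)   # (84, 84, 6), i.e. 3 + k - 1 channels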
class Downsample(gym.ObservationWrapper):
    def __init__(self, env, ratio):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (oldh//ratio, oldw//ratio, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame
An observation wrapper that downsamples the frame by the given ratio. Note that the observation it returns always keeps a channel dimension, i.e. it is always a 3-D np.array.
If the resized frame comes back 2-D, the channel dimension is added back:
if frame.ndim == 2:
    frame = frame[:,:,None]
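A short sketch (with my own example sizes) of why the ndim check is needed: cv2.resize drops the channel axis of a single-channel image, and Downsample puts it back.

import cv2
import numpy as np

frame = np.zeros((224, 320, 1), dtype=np.uint8)   # single-channel input, ratio = 2
small = cv2.resize(frame, (320 // 2, 224 // 2), interpolation=cv2.INTER_AREA)
print(small.shape)               # (112, 160)    -- channel axis was dropped by OpenCV
print(small[:, :, None].shape)   # (112, 160, 1) -- restored, as in Downsample.observation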
class Rgb2gray(gym.ObservationWrapper):
    def __init__(self, env):
        """
        Downsample images by a factor of ratio
        """
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, _oldc) = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(oldh, oldw, 1), dtype=np.uint8)

    def observation(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return frame[:,:,None]
Converts the RGB frame to a grayscale image with OpenCV (the docstring is a copy-paste leftover from Downsample). Again the channel dimension is restored, so the returned observation is always 3-D.
The dimension expansion:
return frame[:,:,None]
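The same point in a short sketch (sizes are my own example): cv2.cvtColor returns a 2-D array for a grayscale result, and the [:,:,None] indexing restores the channel axis.

import cv2
import numpy as np

rgb = np.random.randint(0, 256, (224, 320, 3), dtype=np.uint8)
gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
print(gray.shape)                # (224, 320)
print(gray[:, :, None].shape)    # (224, 320, 1) -- what Rgb2gray returns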
class MovieRecord(gym.Wrapper):
    def __init__(self, env, savedir, k):
        gym.Wrapper.__init__(self, env)
        self.savedir = savedir
        self.k = k
        self.epcount = 0

    def reset(self):
        if self.epcount % self.k == 0:
            self.env.unwrapped.movie_path = self.savedir
        else:
            self.env.unwrapped.movie_path = None
            self.env.unwrapped.movie = None
        self.epcount += 1
        return self.env.reset()
On every reset, sets env.unwrapped.movie_path so that recording is only enabled on every k-th episode; on the other episodes the path and movie are cleared.
What exactly setting this movie path does depends on the underlying retro environment; I could not find this wrapper being used anywhere else in the baselines project.
class AppendTimeout(gym.Wrapper):
    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.action_space = env.action_space
        self.timeout_space = gym.spaces.Box(low=np.array([0.0]), high=np.array([1.0]), dtype=np.float32)
        self.original_os = env.observation_space
        if isinstance(self.original_os, gym.spaces.Dict):
            import copy
            ordered_dict = copy.deepcopy(self.original_os.spaces)
            ordered_dict['value_estimation_timeout'] = self.timeout_space
            self.observation_space = gym.spaces.Dict(ordered_dict)
            self.dict_mode = True
        else:
            self.observation_space = gym.spaces.Dict({
                'original': self.original_os,
                'value_estimation_timeout': self.timeout_space
            })
            self.dict_mode = False
        self.ac_count = None
        while 1:
            if not hasattr(env, "_max_episode_steps"): # Looking for TimeLimit wrapper that has this field
                env = env.env
                continue
            break
        self.timeout = env._max_episode_steps

    def step(self, ac):
        self.ac_count += 1
        ob, rew, done, info = self.env.step(ac)
        return self._process(ob), rew, done, info

    def reset(self):
        self.ac_count = 0
        return self._process(self.env.reset())

    def _process(self, ob):
        fracmissing = 1 - self.ac_count / self.timeout
        if self.dict_mode:
            ob['value_estimation_timeout'] = fracmissing
        else:
            return { 'original': ob, 'value_estimation_timeout': fracmissing }
If the observation space is a gym.spaces.Dict, the wrapper adds a key 'value_estimation_timeout' whose value is the fraction of the episode's step budget that is still remaining (1 - ac_count / timeout).
If the observation is a plain np.array, it is wrapped into a dict under the key 'original', and the same 'value_estimation_timeout' entry is added alongside it.
So the class mainly wraps the observation into a dict and appends the 'value_estimation_timeout' entry.
To find the step limit, the constructor walks down the wrapper chain (env = env.env) until it finds an env that has the _max_episode_steps attribute, i.e. the TimeLimit wrapper.
The purpose of the class is therefore to record in every observation how far the current step count is from the maximum episode length.
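A minimal sketch of the extra observation entry. I use CartPole-v0 only because gym's own TimeLimit wrapper already carries _max_episode_steps (200 for this env); this assumes the older gym API that baselines targets.

import gym
from baselines.common.retro_wrappers import AppendTimeout

env = AppendTimeout(gym.make('CartPole-v0'))     # TimeLimit provides _max_episode_steps == 200
ob = env.reset()
print(sorted(ob.keys()))                         # ['original', 'value_estimation_timeout']
print(ob['value_estimation_timeout'])            # 1.0   (no steps used yet)
ob, _, _, _ = env.step(env.action_space.sample())
print(ob['value_estimation_timeout'])            # 0.995 (1 - 1/200)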
class StartDoingRandomActionsWrapper(gym.Wrapper):
    """
    Warning: can eat info dicts, not good if you depend on them
    """
    def __init__(self, env, max_random_steps, on_startup=True, every_episode=False):
        gym.Wrapper.__init__(self, env)
        self.on_startup = on_startup
        self.every_episode = every_episode
        self.random_steps = max_random_steps
        self.last_obs = None
        if on_startup:
            self.some_random_steps()

    def some_random_steps(self):
        self.last_obs = self.env.reset()
        n = np.random.randint(self.random_steps)
        #print("running for random %i frames" % n)
        for _ in range(n):
            self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
            if done: self.last_obs = self.env.reset()

    def reset(self):
        return self.last_obs

    def step(self, a):
        self.last_obs, rew, done, info = self.env.step(a)
        if done:
            self.last_obs = self.env.reset()
            if self.every_episode:
                self.some_random_steps()
        return self.last_obs, rew, done, info
Optionally performs a number of random actions at the start of an episode.
Key code:
for _ in range(n):
    self.last_obs, _, done, _ = self.env.step(self.env.action_space.sample())
    if done: self.last_obs = self.env.reset()
The wrapper can be configured to do this once when it is constructed (on_startup=True) and/or at the start of every episode (every_episode=True); the number of random steps is drawn uniformly from [0, max_random_steps).
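A hedged usage sketch (CartPole-v0 is just a stand-in for any environment; note the docstring warning that info dicts from the random warm-up phase are thrown away):

import gym
from baselines.common.retro_wrappers import StartDoingRandomActionsWrapper

env = StartDoingRandomActionsWrapper(gym.make('CartPole-v0'),
                                     max_random_steps=30,
                                     on_startup=True,     # warm up once when the wrapper is built
                                     every_episode=True)  # ...and again after every done
ob = env.reset()   # returns the observation reached after the random warm-up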
def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs):
    import retro
    if state is None:
        state = retro.State.DEFAULT
    env = retro.make(game, state, **kwargs)
    env = StochasticFrameSkip(env, n=4, stickprob=0.25)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
Combines the wrappers defined above.
The environment created by retro.make is wrapped with StochasticFrameSkip (n=4, stickprob=0.25) and, unless max_episode_steps is None, with TimeLimit.
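A usage sketch. 'Airstriker-Genesis' is, as far as I know, the free ROM bundled with gym-retro and serves only as an example; any other installed game/state pair works the same way.

from baselines.common.retro_wrappers import make_retro

env = make_retro(game='Airstriker-Genesis', max_episode_steps=4500)
ob = env.reset()
ob, rew, done, info = env.step(env.action_space.sample())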
def wrap_deepmind_retro(env, scale=True, frame_stack=4):
    """
    Configure environment for retro games, using config similar to DeepMind-style Atari in wrap_deepmind
    """
    env = WarpFrame(env)
    env = ClipRewardEnv(env)
    if frame_stack > 1:
        env = FrameStack(env, frame_stack)
    if scale:
        env = ScaledFloatFrame(env)
    return env
Wraps a retro game with the environment wrappers used for Atari games:
WarpFrame converts the frame to grayscale and resizes it to 84x84.
ClipRewardEnv clips the reward to its sign, i.e. -1, 0 or +1.
FrameStack stacks the last k frames along the channel dimension.
ScaledFloatFrame converts the pixel values from uint8 in [0, 255] to float32 in [0, 1].
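Putting the two helpers together gives the typical retro training pipeline (the game name is again only an example; the resulting observation shape is what I expect from the wrappers above):

from baselines.common.retro_wrappers import make_retro, wrap_deepmind_retro

env = make_retro(game='Airstriker-Genesis')           # retro.make + StochasticFrameSkip + TimeLimit
env = wrap_deepmind_retro(env, scale=True, frame_stack=4)
print(env.observation_space.shape)                    # (84, 84, 4), float32 in [0, 1]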
class SonicDiscretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(SonicDiscretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
                   ['DOWN', 'B'], ['B']]
        self._actions = []
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()
Wraps the action space.
The actions accepted from the outside correspond to the following button combinations:
actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
['DOWN', 'B'], ['B']]
The incoming action is an integer: 0 selects ['LEFT'], 1 selects ['RIGHT'], 2 selects ['LEFT', 'DOWN'], and so on.
So the external actions are the integers 0 to 6, while the underlying retro environment expects the 12 buttons:
buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
The action forwarded to the inner retro environment is a boolean vector over these 12 buttons. Unlike a strict one-hot encoding, a combination may press two buttons at once, which is exactly what is needed to represent the external actions 0 to 6:
for action in actions:
    arr = np.array([False] * 12)
    for button in action:
        arr[buttons.index(button)] = True
    self._actions.append(arr)
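The mapping itself can be checked with plain numpy (no emulator needed); for example, discrete action 3, i.e. ['RIGHT', 'DOWN'], becomes the following 12-button vector:

import numpy as np

buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
combo = ['RIGHT', 'DOWN']            # discrete action 3
arr = np.array([False] * 12)
for button in combo:
    arr[buttons.index(button)] = True
print(arr.astype(int))               # [0 0 0 0 0 1 0 1 0 0 0 0] -> DOWN (5) and RIGHT (7) pressed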
class RewardScaler(gym.RewardWrapper):
    """
    Bring rewards to a reasonable scale for PPO.
    This is incredibly important and effects performance
    drastically.
    """
    def __init__(self, env, scale=0.01):
        super(RewardScaler, self).__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale
Wraps the environment's reward and rescales it. According to the docstring this is mainly intended for PPO and affects performance drastically.
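A short usage sketch (scale=0.01 is the default; the game name is only an example):

from baselines.common.retro_wrappers import RewardScaler, make_retro

env = RewardScaler(make_retro(game='Airstriker-Genesis'), scale=0.01)
ob = env.reset()
_, rew, _, _ = env.step(env.action_space.sample())   # rew is the raw game reward * 0.01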
class AllowBacktracking(gym.Wrapper):
    """
    Use deltas in max(X) as the reward, rather than deltas
    in X. This way, agents are not discouraged too heavily
    from exploring backwards if there is no way to advance
    head-on in the level.
    """
    def __init__(self, env):
        super(AllowBacktracking, self).__init__(env)
        self._cur_x = 0
        self._max_x = 0

    def reset(self, **kwargs): # pylint: disable=E0202
        self._cur_x = 0
        self._max_x = 0
        return self.env.reset(**kwargs)

    def step(self, action): # pylint: disable=E0202
        obs, rew, done, info = self.env.step(action)
        self._cur_x += rew
        rew = max(0, self._cur_x - self._max_x)
        self._max_x = max(self._max_x, self._cur_x)
        return obs, rew, done, info
My understanding is that this wrapper is aimed at horizontally scrolling platformers such as Sonic (or Super Mario); by wrapping reset and step it customizes the reward.
Since this is a reward wrapper and I am not familiar with how these games define their raw reward, the following is reverse-engineered from the code:
Assume a Mario-style game in which the agent gets a positive reward for moving right (towards the goal) and a negative reward for moving left (away from it).
self._max_x records the right-most position reached so far, and self._cur_x is the running sum of the raw rewards, i.e. roughly the current x position. Whenever the agent is to the left of self._max_x, the wrapped reward is 0 no matter what action it takes, because:
rew = max(0, self._cur_x - self._max_x)
In effect, this reward shaping does not reward re-exploring ground that has already been covered (anything left of self._max_x), but it does not punish it either: as long as the agent stays behind self._max_x the reward is 0, and new reward is only handed out when self._max_x is pushed further to the right.
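A small numeric trace of the shaping, using a toy environment of my own in which the raw reward is the signed change in x per step (standing in for the real game):

import gym
from baselines.common.retro_wrappers import AllowBacktracking

class _XDeltaEnv(gym.Env):
    """Toy env: raw reward is +1 for a step right, -1 for a step left."""
    action_space = gym.spaces.Discrete(2)        # 0 = left, 1 = right
    observation_space = gym.spaces.Discrete(1)
    def reset(self):
        return 0
    def step(self, ac):
        return 0, 1.0 if ac == 1 else -1.0, False, {}

env = AllowBacktracking(_XDeltaEnv())
env.reset()
for ac in [1, 1, 0, 0, 1, 1, 1]:
    _, rew, _, _ = env.step(ac)
    print(ac, rew)
# right, right   -> 1, 1   (new ground, _max_x grows to 2)
# left, left     -> 0, 0   (backtracking is not punished)
# right, right   -> 0, 0   (still at or behind _max_x == 2)
# right          -> 1      (_max_x pushed from 2 to 3)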