Using DQN to Train an AI to Play a Game

 

 

 Experimental Results

 

 Experiment Source Code
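
The experiment is made up of five Python files: wrapped_flappy_bird.py wraps the pygame Flappy Bird game and exposes a frame_step interface that returns the raw screen pixels, a reward, and a terminal flag; flappy_bird_utils.py loads the sprites, sounds, and per-pixel hitmasks; BrainDQN_NIPS.py implements a basic DQN agent with a single Q network (in the spirit of the NIPS 2013 DQN variant); BrainDQN_Nature.py adds a separate target network that is periodically synchronized (the Nature 2015 variant); and FlappyBirdDQN.py connects the agent to the game.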

# wrapped_flappy_bird.py
import numpy as np
import sys
import random
import pygame
import flappy_bird_utils
import pygame.surfarray as surfarray
from pygame.locals import *
from itertools import cycle

FPS = 30
SCREENWIDTH  = 288
SCREENHEIGHT = 512

pygame.init()
FPSCLOCK = pygame.time.Clock()
SCREEN = pygame.display.set_mode((SCREENWIDTH, SCREENHEIGHT))
pygame.display.set_caption('Flappy Bird')

IMAGES, SOUNDS, HITMASKS = flappy_bird_utils.load()
PIPEGAPSIZE = 100 # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79

PLAYER_WIDTH = IMAGES['player'][0].get_width()
PLAYER_HEIGHT = IMAGES['player'][0].get_height()
PIPE_WIDTH = IMAGES['pipe'][0].get_width()
PIPE_HEIGHT = IMAGES['pipe'][0].get_height()
BACKGROUND_WIDTH = IMAGES['background'].get_width()

PLAYER_INDEX_GEN = cycle([0, 1, 2, 1])


class GameState:
    def __init__(self):
        self.score = self.playerIndex = self.loopIter = 0
        self.playerx = int(SCREENWIDTH * 0.2)
        self.playery = int((SCREENHEIGHT - PLAYER_HEIGHT) / 2)
        self.basex = 0
        self.baseShift = IMAGES['base'].get_width() - BACKGROUND_WIDTH

        newPipe1 = getRandomPipe()
        newPipe2 = getRandomPipe()
        self.upperPipes = [
            {'x': SCREENWIDTH, 'y': newPipe1[0]['y']},
            {'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[0]['y']},
        ]
        self.lowerPipes = [
            {'x': SCREENWIDTH, 'y': newPipe1[1]['y']},
            {'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[1]['y']},
        ]

        # player velocity, max velocity, downward acceleration, acceleration on flap
        self.pipeVelX = -4
        self.playerVelY    =  0    # player's velocity along Y, default same as playerFlapped
        self.playerMaxVelY =  10   # max vel along Y, max descend speed
        self.playerMinVelY =  -8   # min vel along Y, max ascend speed
        self.playerAccY    =   1   # player's downward acceleration
        self.playerFlapAcc =  -7   # player's velocity on flapping
        self.playerFlapped = False # True when player flaps

    def frame_step(self, input_actions):
        pygame.event.pump()

        reward = 0.1
        terminal = False

        if sum(input_actions) != 1:
            raise ValueError('Multiple input actions!')

        # input_actions[0] == 1: do nothing
        # input_actions[1] == 1: flap the bird
        if input_actions[1] == 1:
            if self.playery > -2 * PLAYER_HEIGHT:
                self.playerVelY = self.playerFlapAcc
                self.playerFlapped = True
                #SOUNDS['wing'].play()

        # check for score
        playerMidPos = self.playerx + PLAYER_WIDTH / 2
        for pipe in self.upperPipes:
            pipeMidPos = pipe['x'] + PIPE_WIDTH / 2
            if pipeMidPos <= playerMidPos < pipeMidPos + 4:
                self.score += 1
                #SOUNDS['point'].play()
                reward = 1

        # playerIndex basex change
        if (self.loopIter + 1) % 3 == 0:
            self.playerIndex = next(PLAYER_INDEX_GEN)
        self.loopIter = (self.loopIter + 1) % 30
        self.basex = -((-self.basex + 100) % self.baseShift)

        # player's movement
        if self.playerVelY < self.playerMaxVelY and not self.playerFlapped:
            self.playerVelY += self.playerAccY
        if self.playerFlapped:
            self.playerFlapped = False
        self.playery += min(self.playerVelY, BASEY - self.playery - PLAYER_HEIGHT)
        if self.playery < 0:
            self.playery = 0

        # move pipes to left
        for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
            uPipe['x'] += self.pipeVelX
            lPipe['x'] += self.pipeVelX

        # add new pipe when first pipe is about to touch left of screen
        if 0 < self.upperPipes[0]['x'] < 5:
            newPipe = getRandomPipe()
            self.upperPipes.append(newPipe[0])
            self.lowerPipes.append(newPipe[1])

        # remove first pipe if it's out of the screen
        if self.upperPipes[0]['x'] < -PIPE_WIDTH:
            self.upperPipes.pop(0)
            self.lowerPipes.pop(0)

        # check if crash here
        isCrash = checkCrash({'x': self.playerx, 'y': self.playery,
                              'index': self.playerIndex},
                             self.upperPipes, self.lowerPipes)
        if isCrash:
            #SOUNDS['hit'].play()
            #SOUNDS['die'].play()
            terminal = True
            self.__init__()
            reward = -1

        # draw sprites
        SCREEN.blit(IMAGES['background'], (0,0))

        for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
            SCREEN.blit(IMAGES['pipe'][0], (uPipe['x'], uPipe['y']))
            SCREEN.blit(IMAGES['pipe'][1], (lPipe['x'], lPipe['y']))

        SCREEN.blit(IMAGES['base'], (self.basex, BASEY))
        # print score so player overlaps the score
        # showScore(self.score)
        SCREEN.blit(IMAGES['player'][self.playerIndex],
                    (self.playerx, self.playery))

        image_data = pygame.surfarray.array3d(pygame.display.get_surface())
        pygame.display.update()
        FPSCLOCK.tick(FPS)
        #print self.upperPipes[0]['y'] + PIPE_HEIGHT - int(BASEY * 0.2)
        return image_data, reward, terminal

def getRandomPipe():
    """returns a randomly generated pipe"""
    # y of gap between upper and lower pipe
    gapYs = [20, 30, 40, 50, 60, 70, 80, 90]
    index = random.randint(0, len(gapYs)-1)
    gapY = gapYs[index]

    gapY += int(BASEY * 0.2)
    pipeX = SCREENWIDTH + 10

    return [
        {'x': pipeX, 'y': gapY - PIPE_HEIGHT},  # upper pipe
        {'x': pipeX, 'y': gapY + PIPEGAPSIZE},  # lower pipe
    ]


def showScore(score):
    """displays score in center of screen"""
    scoreDigits = [int(x) for x in list(str(score))]
    totalWidth = 0 # total width of all numbers to be printed

    for digit in scoreDigits:
        totalWidth += IMAGES['numbers'][digit].get_width()

    Xoffset = (SCREENWIDTH - totalWidth) / 2

    for digit in scoreDigits:
        SCREEN.blit(IMAGES['numbers'][digit], (Xoffset, SCREENHEIGHT * 0.1))
        Xoffset += IMAGES['numbers'][digit].get_width()


def checkCrash(player, upperPipes, lowerPipes):
    """returns True if player collders with base or pipes."""
    pi = player['index']
    player['w'] = IMAGES['player'][0].get_width()
    player['h'] = IMAGES['player'][0].get_height()

    # if player crashes into ground
    if player['y'] + player['h'] >= BASEY - 1:
        return True
    else:

        playerRect = pygame.Rect(player['x'], player['y'],
                      player['w'], player['h'])

        for uPipe, lPipe in zip(upperPipes, lowerPipes):
            # upper and lower pipe rects
            uPipeRect = pygame.Rect(uPipe['x'], uPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
            lPipeRect = pygame.Rect(lPipe['x'], lPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)

            # player and upper/lower pipe hitmasks
            pHitMask = HITMASKS['player'][pi]
            uHitmask = HITMASKS['pipe'][0]
            lHitmask = HITMASKS['pipe'][1]

            # if bird collided with upipe or lpipe
            uCollide = pixelCollision(playerRect, uPipeRect, pHitMask, uHitmask)
            lCollide = pixelCollision(playerRect, lPipeRect, pHitMask, lHitmask)

            if uCollide or lCollide:
                return True

    return False

def pixelCollision(rect1, rect2, hitmask1, hitmask2):
    """Checks if two objects collide and not just their rects"""
    rect = rect1.clip(rect2)

    if rect.width == 0 or rect.height == 0:
        return False

    x1, y1 = rect.x - rect1.x, rect.y - rect1.y
    x2, y2 = rect.x - rect2.x, rect.y - rect2.y

    for x in range(rect.width):
        for y in range(rect.height):
            if hitmask1[x1+x][y1+y] and hitmask2[x2+x][y2+y]:
                return True
    return False
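
The wrapper above gives the agent a dense reward: +0.1 for surviving a frame, +1 for passing a pipe, and -1 on a crash (the game then resets itself). Actions are one-hot vectors of length 2, where index 0 means "do nothing" and index 1 means "flap". A minimal usage sketch (variable names are hypothetical; it assumes the assets/ directory that flappy_bird_utils.py expects is present):

import numpy as np
import wrapped_flappy_bird as game

flappy_bird = game.GameState()
do_nothing = np.array([1, 0])   # [do nothing, flap]
image, reward, terminal = flappy_bird.frame_step(do_nothing)
# image: 288x512x3 array of screen pixels, reward: 0.1 / 1 / -1, terminal: True on crash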

# flappy_bird_utils.py
import pygame
import sys
def load():
    # path of player with different states
    PLAYER_PATH = (
            'assets/sprites/redbird-upflap.png',
            'assets/sprites/redbird-midflap.png',
            'assets/sprites/redbird-downflap.png'
    )

    # path of background
    BACKGROUND_PATH = 'assets/sprites/background-black.png'

    # path of pipe
    PIPE_PATH = 'assets/sprites/pipe-green.png'

    IMAGES, SOUNDS, HITMASKS = {}, {}, {}

    # numbers sprites for score display
    IMAGES['numbers'] = (
        pygame.image.load('assets/sprites/0.png').convert_alpha(),
        pygame.image.load('assets/sprites/1.png').convert_alpha(),
        pygame.image.load('assets/sprites/2.png').convert_alpha(),
        pygame.image.load('assets/sprites/3.png').convert_alpha(),
        pygame.image.load('assets/sprites/4.png').convert_alpha(),
        pygame.image.load('assets/sprites/5.png').convert_alpha(),
        pygame.image.load('assets/sprites/6.png').convert_alpha(),
        pygame.image.load('assets/sprites/7.png').convert_alpha(),
        pygame.image.load('assets/sprites/8.png').convert_alpha(),
        pygame.image.load('assets/sprites/9.png').convert_alpha()
    )

    # base (ground) sprite
    IMAGES['base'] = pygame.image.load('assets/sprites/base.png').convert_alpha()

    # sounds
    if sys.platform.startswith('win'):  # note: 'win' in sys.platform would also match 'darwin'
        soundExt = '.wav'
    else:
        soundExt = '.ogg'

    SOUNDS['die']    = pygame.mixer.Sound('assets/audio/die' + soundExt)
    SOUNDS['hit']    = pygame.mixer.Sound('assets/audio/hit' + soundExt)
    SOUNDS['point']  = pygame.mixer.Sound('assets/audio/point' + soundExt)
    SOUNDS['swoosh'] = pygame.mixer.Sound('assets/audio/swoosh' + soundExt)
    SOUNDS['wing']   = pygame.mixer.Sound('assets/audio/wing' + soundExt)

    # background sprite
    IMAGES['background'] = pygame.image.load(BACKGROUND_PATH).convert()

    # player (bird) sprites
    IMAGES['player'] = (
        pygame.image.load(PLAYER_PATH[0]).convert_alpha(),
        pygame.image.load(PLAYER_PATH[1]).convert_alpha(),
        pygame.image.load(PLAYER_PATH[2]).convert_alpha(),
    )

    # pipe sprites (the upper pipe is the lower pipe rotated 180 degrees)
    IMAGES['pipe'] = (
        pygame.transform.rotate(
            pygame.image.load(PIPE_PATH).convert_alpha(), 180),
        pygame.image.load(PIPE_PATH).convert_alpha(),
    )

    # hitmasks for pipes
    HITMASKS['pipe'] = (
        getHitmask(IMAGES['pipe'][0]),
        getHitmask(IMAGES['pipe'][1]),
    )

    # hitmask for player
    HITMASKS['player'] = (
        getHitmask(IMAGES['player'][0]),
        getHitmask(IMAGES['player'][1]),
        getHitmask(IMAGES['player'][2]),
    )

    return IMAGES, SOUNDS, HITMASKS

def getHitmask(image):
    """returns a hitmask using an image's alpha."""
    mask = []
    for x in range(image.get_width()):
        mask.append([])
        for y in range(image.get_height()):
            mask[x].append(bool(image.get_at((x,y))[3]))
    return mask
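
getHitmask records, for every pixel of a sprite, whether its alpha channel is non-zero. checkCrash and pixelCollision in wrapped_flappy_bird.py then intersect the bird and pipe rectangles and report a collision only when some overlapping pixel is opaque in both masks, so the crash test is pixel-accurate rather than bounding-box based.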

# BrainDQN_NIPS.py

import tensorflow as tf 
import numpy as np 
import random
from collections import deque 

# Hyper Parameters:
FRAME_PER_ACTION = 1
GAMMA = 0.99 # discount factor for future rewards
OBSERVE = 100. # timesteps to observe before training
EXPLORE = 150000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.0 # final value of epsilon
INITIAL_EPSILON = 0.9 # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH_SIZE = 32 # size of minibatch

class BrainDQN:

   def __init__(self,actions):
      # init replay memory
      self.replayMemory = deque()
      # init some parameters
      self.timeStep = 0
      self.epsilon = INITIAL_EPSILON
      self.actions = actions
      # init Q network
      self.createQNetwork()

   def createQNetwork(self):
      # network weights
      W_conv1 = self.weight_variable([8,8,4,32])
      b_conv1 = self.bias_variable([32])

      W_conv2 = self.weight_variable([4,4,32,64])
      b_conv2 = self.bias_variable([64])

      W_conv3 = self.weight_variable([3,3,64,64])
      b_conv3 = self.bias_variable([64])

      W_fc1 = self.weight_variable([1600,512])
      b_fc1 = self.bias_variable([512])

      W_fc2 = self.weight_variable([512,self.actions])
      b_fc2 = self.bias_variable([self.actions])

      # input layer

      self.stateInput = tf.placeholder("float",[None,80,80,4])

      # hidden layers
      h_conv1 = tf.nn.relu(self.conv2d(self.stateInput,W_conv1,4) + b_conv1)
      h_pool1 = self.max_pool_2x2(h_conv1)

      h_conv2 = tf.nn.relu(self.conv2d(h_pool1,W_conv2,2) + b_conv2)

      h_conv3 = tf.nn.relu(self.conv2d(h_conv2,W_conv3,1) + b_conv3)

      h_conv3_flat = tf.reshape(h_conv3,[-1,1600])
      h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat,W_fc1) + b_fc1)

      # Q Value layer
      self.QValue = tf.matmul(h_fc1,W_fc2) + b_fc2

      self.actionInput = tf.placeholder("float",[None,self.actions])
      self.yInput = tf.placeholder("float", [None]) 
      Q_action = tf.reduce_sum(tf.multiply(self.QValue, self.actionInput), reduction_indices = 1) # tf.mul was removed in newer TensorFlow versions
      self.cost = tf.reduce_mean(tf.square(self.yInput - Q_action))
      self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost)

      # saving and loading networks
      self.saver = tf.train.Saver()
      self.session = tf.InteractiveSession()
      self.session.run(tf.initialize_all_variables())
      checkpoint = tf.train.get_checkpoint_state("saved_networks")
      if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print ("Successfully loaded:", checkpoint.model_checkpoint_path)
      else:
            print ("Could not find old network weights")

   def trainQNetwork(self):
      # Step 1: obtain random minibatch from replay memory
      minibatch = random.sample(self.replayMemory,BATCH_SIZE)
      state_batch = [data[0] for data in minibatch]
      action_batch = [data[1] for data in minibatch]
      reward_batch = [data[2] for data in minibatch]
      nextState_batch = [data[3] for data in minibatch]

      # Step 2: calculate y 
      y_batch = []
      QValue_batch = self.QValue.eval(feed_dict={self.stateInput:nextState_batch})
      for i in range(0,BATCH_SIZE):
         terminal = minibatch[i][4]
         if terminal:
            y_batch.append(reward_batch[i])
         else:
            y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i]))

      self.trainStep.run(feed_dict={
         self.yInput : y_batch,
         self.actionInput : action_batch,
         self.stateInput : state_batch
         })

      # save network every 10000 iterations
      if self.timeStep % 10000 == 0:
         self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step = self.timeStep)

      
   def setPerception(self,nextObservation,action,reward,terminal):
      #newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2)
      newState = np.append(self.currentState[:,:,1:],nextObservation,axis = 2)
      self.replayMemory.append((self.currentState,action,reward,newState,terminal))
      if len(self.replayMemory) > REPLAY_MEMORY:
         self.replayMemory.popleft()
      if self.timeStep > OBSERVE:
         # Train the network
         self.trainQNetwork()

      self.currentState = newState
      self.timeStep += 1

   def getAction(self):
      QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]})[0]
      action = np.zeros(self.actions)
      action_index = 0
      if self.timeStep % FRAME_PER_ACTION == 0:
         if random.random() <= self.epsilon:
            action_index = random.randrange(self.actions)
            action[action_index] = 1
         else:
            action_index = np.argmax(QValue)
            action[action_index] = 1
      else:
         action[0] = 1 # do nothing

      # anneal epsilon
      if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
         self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORE

      return action

   def setInitState(self,observation):
      self.currentState = np.stack((observation, observation, observation, observation), axis = 2)

   def weight_variable(self,shape):
      initial = tf.truncated_normal(shape, stddev = 0.01)
      return tf.Variable(initial)

   def bias_variable(self,shape):
      initial = tf.constant(0.01, shape = shape)
      return tf.Variable(initial)

   def conv2d(self,x, W, stride):
      return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")

   def max_pool_2x2(self,x):
      return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")
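
In trainQNetwork above, each sampled transition (s_i, a_i, r_i, s'_i, terminal_i) is fit to the standard one-step Q-learning target:

    y_i = r_i                                  if terminal_i
    y_i = r_i + GAMMA * max_a' Q(s'_i, a')     otherwise

The loss is the mean squared error between y_i and Q(s_i, a_i), where the one-hot actionInput mask selects the Q value of the action actually taken. Exploration is epsilon-greedy, with epsilon annealed linearly from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE steps once the OBSERVE phase (the first 100 steps, used only to fill the replay memory) has passed.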
      

# BrainDQN_Nature.py

import tensorflow as tf 
import numpy as np 
import random
from collections import deque 

# Hyper Parameters:
FRAME_PER_ACTION = 1
GAMMA = 0.99 # discount factor for future rewards
OBSERVE = 100. # timesteps to observe before training
EXPLORE = 200000. # frames over which to anneal epsilon
FINAL_EPSILON = 0 # final value of epsilon (alternative: 0.001)
INITIAL_EPSILON = 0 # starting value of epsilon (alternative: 0.01)
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH_SIZE = 32 # size of minibatch
UPDATE_TIME = 100

# tf.mul was removed in newer versions of TensorFlow; alias it to tf.multiply
try:
    tf.mul
except AttributeError:
    tf.mul = tf.multiply

class BrainDQN:

   def __init__(self,actions):
      # init replay memory
      self.replayMemory = deque()
      # init some parameters
      self.timeStep = 0
      self.epsilon = INITIAL_EPSILON
      self.actions = actions
      # init Q network
      self.stateInput,self.QValue,self.W_conv1,self.b_conv1,self.W_conv2,self.b_conv2,self.W_conv3,self.b_conv3,self.W_fc1,self.b_fc1,self.W_fc2,self.b_fc2 = self.createQNetwork()

      # init Target Q Network
      self.stateInputT,self.QValueT,self.W_conv1T,self.b_conv1T,self.W_conv2T,self.b_conv2T,self.W_conv3T,self.b_conv3T,self.W_fc1T,self.b_fc1T,self.W_fc2T,self.b_fc2T = self.createQNetwork()

      self.copyTargetQNetworkOperation = [self.W_conv1T.assign(self.W_conv1),self.b_conv1T.assign(self.b_conv1),self.W_conv2T.assign(self.W_conv2),self.b_conv2T.assign(self.b_conv2),self.W_conv3T.assign(self.W_conv3),self.b_conv3T.assign(self.b_conv3),self.W_fc1T.assign(self.W_fc1),self.b_fc1T.assign(self.b_fc1),self.W_fc2T.assign(self.W_fc2),self.b_fc2T.assign(self.b_fc2)]

      self.createTrainingMethod()

      # saving and loading networks
      self.saver = tf.train.Saver()
      self.session = tf.InteractiveSession()
      self.session.run(tf.initialize_all_variables())
      checkpoint = tf.train.get_checkpoint_state("saved_networks")
      if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print ("Successfully loaded:", checkpoint.model_checkpoint_path)
      else:
            print ("Could not find old network weights")


   def createQNetwork(self):
      # network weights
      W_conv1 = self.weight_variable([8,8,4,32])
      b_conv1 = self.bias_variable([32])

      W_conv2 = self.weight_variable([4,4,32,64])
      b_conv2 = self.bias_variable([64])

      W_conv3 = self.weight_variable([3,3,64,64])
      b_conv3 = self.bias_variable([64])

      W_fc1 = self.weight_variable([1600,512])
      b_fc1 = self.bias_variable([512])

      W_fc2 = self.weight_variable([512,self.actions])
      b_fc2 = self.bias_variable([self.actions])

      # input layer

      stateInput = tf.placeholder("float",[None,80,80,4])

      # hidden layers
      h_conv1 = tf.nn.relu(self.conv2d(stateInput,W_conv1,4) + b_conv1)
      h_pool1 = self.max_pool_2x2(h_conv1)

      h_conv2 = tf.nn.relu(self.conv2d(h_pool1,W_conv2,2) + b_conv2)

      h_conv3 = tf.nn.relu(self.conv2d(h_conv2,W_conv3,1) + b_conv3)

      h_conv3_flat = tf.reshape(h_conv3,[-1,1600])
      h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat,W_fc1) + b_fc1)

      # Q Value layer
      QValue = tf.matmul(h_fc1,W_fc2) + b_fc2

      return stateInput,QValue,W_conv1,b_conv1,W_conv2,b_conv2,W_conv3,b_conv3,W_fc1,b_fc1,W_fc2,b_fc2

   def copyTargetQNetwork(self):
      self.session.run(self.copyTargetQNetworkOperation)

   def createTrainingMethod(self):
      self.actionInput = tf.placeholder("float",[None,self.actions])
      self.yInput = tf.placeholder("float", [None]) 
      Q_Action = tf.reduce_sum(tf.mul(self.QValue, self.actionInput), reduction_indices = 1)
      self.cost = tf.reduce_mean(tf.square(self.yInput - Q_Action))
      self.trainStep = tf.train.AdamOptimizer(1e-6).minimize(self.cost)


   def trainQNetwork(self):

      
      # Step 1: obtain random minibatch from replay memory
      minibatch = random.sample(self.replayMemory,BATCH_SIZE)
      state_batch = [data[0] for data in minibatch]
      action_batch = [data[1] for data in minibatch]
      reward_batch = [data[2] for data in minibatch]
      nextState_batch = [data[3] for data in minibatch]

      # Step 2: calculate y 
      y_batch = []
      QValue_batch = self.QValueT.eval(feed_dict={self.stateInputT:nextState_batch})
      for i in range(0,BATCH_SIZE):
         terminal = minibatch[i][4]
         if terminal:
            y_batch.append(reward_batch[i])
         else:
            y_batch.append(reward_batch[i] + GAMMA * np.max(QValue_batch[i]))

      self.trainStep.run(feed_dict={
         self.yInput : y_batch,
         self.actionInput : action_batch,
         self.stateInput : state_batch
         })

      # save network every 10000 iterations
      if self.timeStep % 10000 == 0:
         self.saver.save(self.session, 'saved_networks/' + 'network' + '-dqn', global_step = self.timeStep)

      if self.timeStep % UPDATE_TIME == 0:
         self.copyTargetQNetwork()

      
   def setPerception(self,nextObservation,action,reward,terminal):
      #newState = np.append(nextObservation,self.currentState[:,:,1:],axis = 2)
      newState = np.append(self.currentState[:,:,1:],nextObservation,axis = 2)
      self.replayMemory.append((self.currentState,action,reward,newState,terminal))
      if len(self.replayMemory) > REPLAY_MEMORY:
         self.replayMemory.popleft()
      if self.timeStep > OBSERVE:
         # Train the network
         self.trainQNetwork()

      # print info
      state = ""
      if self.timeStep <= OBSERVE:
         state = "observe"
      elif self.timeStep > OBSERVE and self.timeStep <= OBSERVE + EXPLORE:
         state = "explore"
      else:
         state = "train"

      print ("TIMESTEP", self.timeStep, "/ STATE", state, \
            "/ EPSILON", self.epsilon)

      self.currentState = newState
      self.timeStep += 1

   def getAction(self):
      QValue = self.QValue.eval(feed_dict= {self.stateInput:[self.currentState]})[0]
      action = np.zeros(self.actions)
      action_index = 0
      if self.timeStep % FRAME_PER_ACTION == 0:
         if random.random() <= self.epsilon:
            action_index = random.randrange(self.actions)
            action[action_index] = 1
         else:
            action_index = np.argmax(QValue)
            action[action_index] = 1
      else:
         action[0] = 1 # do nothing

      # anneal epsilon
      if self.epsilon > FINAL_EPSILON and self.timeStep > OBSERVE:
         self.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON)/EXPLORE

      return action

   def setInitState(self,observation):
      self.currentState = np.stack((observation, observation, observation, observation), axis = 2)

   def weight_variable(self,shape):
      initial = tf.truncated_normal(shape, stddev = 0.01)
      return tf.Variable(initial)

   def bias_variable(self,shape):
      initial = tf.constant(0.01, shape = shape)
      return tf.Variable(initial)

   def conv2d(self,x, W, stride):
      return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME")

   def max_pool_2x2(self,x):
      return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")
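
The Nature-style agent differs from the NIPS version mainly in that the bootstrap target is computed with a separate target network (stateInputT / QValueT) whose weights are copied from the online network every UPDATE_TIME = 100 training steps:

    y_i = r_i + GAMMA * max_a' Q_target(s'_i, a')     for non-terminal transitions

Keeping the regression target fixed between copies stabilizes training. Note also that INITIAL_EPSILON and FINAL_EPSILON are both 0 in this file, so the agent acts greedily from the very first step; this only makes sense when a trained checkpoint is restored from saved_networks/. To train from scratch, one would typically start epsilon at a small positive value (for example the alternative 0.01 noted next to INITIAL_EPSILON) so that some random exploration happens during the EXPLORE phase.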
      

# FlappyBirdDQN.py
import cv2
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
from BrainDQN_Nature import BrainDQN
import numpy as np

# preprocess raw image to 80*80 gray image
def preprocess(observation):
   observation = cv2.cvtColor(cv2.resize(observation, (80, 80)), cv2.COLOR_BGR2GRAY)
   ret, observation = cv2.threshold(observation,1,255,cv2.THRESH_BINARY)
   return np.reshape(observation,(80,80,1))

def playFlappyBird():
   # Step 1: init BrainDQN
   actions = 2
   brain = BrainDQN(actions)
   # Step 2: init Flappy Bird Game
   flappyBird = game.GameState()
   # Step 3: play game
   # Step 3.1: obtain init state
   action0 = np.array([1,0])  # do nothing
   observation0, reward0, terminal = flappyBird.frame_step(action0)
   observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
   ret, observation0 = cv2.threshold(observation0,1,255,cv2.THRESH_BINARY)
   brain.setInitState(observation0)

   # Step 3.2: run the game
   while True:
      action = brain.getAction()
      nextObservation,reward,terminal = flappyBird.frame_step(action)
      nextObservation = preprocess(nextObservation)
      brain.setPerception(nextObservation,action,reward,terminal)

def main():
   playFlappyBird()

if __name__ == '__main__':
   main()
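
To reproduce the experiment, FlappyBirdDQN.py is run directly (python FlappyBirdDQN.py). This assumes TensorFlow 1.x, OpenCV (cv2), and pygame are installed, the assets/ sprite and audio files are available, and the two game modules live under game/ as implied by sys.path.append("game/"). The play loop never exits on its own; the network is saved to saved_networks/ every 10000 steps, and the process is simply interrupted when training is done.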