DQN for the CartPole problem: using reinforcement learning, which at its core means training an MLP to predict a score for each action
Code:
# -*- coding: utf-8 -*-
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)   # replay buffer
        self.gamma = 0.95                  # discount rate
        #self.epsilon = 1.0                # exploration rate
        self.epsilon = 0.4                 # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        # Visualize the MLP structure
        plot_model(self.model, to_file='dqn-cartpole-v0-mlp.png', show_shapes=False)

    def _build_model(self):
        # Neural Net for Deep-Q learning Model: state in, one Q-value per action out
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        # Store one transition in the replay buffer
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise take the best predicted action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        #print("act_values:")
        #print(act_values)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # Train on a random minibatch of stored transitions
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        #if self.epsilon > self.epsilon_min:
        #    self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    #print(state_size)
    #print(action_size)
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32
    avg = 0

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10   # penalize the terminal step
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                avg += time
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
    print("Avg score:{}".format(avg / 1000))
Basic idea:
Let the agent teach itself to play the game (deciding at each step whether to push the cart left or right). The basic approach is:
essentially, an MLP is trained to map the current state to a score (Q-value) for each action.
That score reflects how long the pole can be kept balanced: the longer the episode lasts, the higher the score, as made concrete in the sketch below.
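Note that the network is never given the episode length as a label directly. Each stored transition is pushed toward a one-step Bellman target, which is how the replay() method above propagates "how much longer can I survive from here" back through the Q-values. Below is a minimal sketch of just that target computation; model, reward, next_state and done are placeholders standing in for the agent's fields.

import numpy as np

def bellman_target(model, reward, next_state, done, gamma=0.95):
    # If the episode ended there is no future to discount: the target is the
    # immediate reward alone (here that is the -10 penalty from the main loop).
    if done:
        return reward
    # Otherwise pull the Q-value toward reward + gamma * best Q of the next state.
    return reward + gamma * np.amax(model.predict(next_state)[0])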
That said, gym itself seems to do quite a lot of work behind the scenes, such as computing the per-step reward and defining the action space. Worth digging into further! A quick way to start is the sketch below.
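One way to see what gym provides for free is to poke at the environment directly. This quick sketch (using the same old-style gym step API as the code above) just prints the observation space, the action space, and the per-step reward for CartPole-v0:

import gym

env = gym.make('CartPole-v0')
print(env.observation_space)   # Box(4,): cart position, cart velocity, pole angle, pole angular velocity
print(env.action_space)        # Discrete(2): 0 = push cart left, 1 = push cart right

state = env.reset()
next_state, reward, done, info = env.step(env.action_space.sample())
print(reward)                  # +1 for every step the pole stays upright
print(done)                    # True once the pole falls over or the cart leaves the track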