# jkarnows-cartpole-v0-dqn
"""
Solve OpenAI Gym CartPole-v0 with a DQN.
TensorFlow code by TiehHung Chuang (imironhead), 2016-09-06
tf-slim code by Jeremy Karnowski (jkarnows), 2016-09-07
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import numpy as np
import random
import gym
import tensorflow as tf
import tensorflow.contrib.slim as slim
class DeepQLearningAgent(object):
    def __init__(self, state_space, action_space, network_layers):
        self._action_space = action_space
        self._dim_state = state_space.shape[0]
        self._dim_action = action_space.n
        self._batch_size = 200
        self._gamma = 0.95
        self._prev_state = None
        self._prev_action = None
        self._prev_reward = 0
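        # Build the Q-network: states feed a stack of fully-connected ReLU
        # layers that output one Q-value per action. The one-hot action mask
        # below selects Q(s, a) for the action actually taken, and the
        # training target is r + gamma * max_a' Q(s', a'), computed with the
        # same (reused) weights.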
        prev_states = tf.placeholder(tf.float32, [None, self._dim_state])
        net = slim.stack(prev_states, slim.fully_connected, network_layers, activation_fn=tf.nn.relu, scope='fc')
        prev_action_values = slim.fully_connected(net, self._dim_action, activation_fn=None, scope='qvalues')
        prev_action_masks = tf.placeholder(tf.float32, [None, self._dim_action])
        prev_values = tf.reduce_sum(tf.mul(prev_action_values, prev_action_masks), reduction_indices=1)
        prev_rewards = tf.placeholder(tf.float32, [None, ])
        next_states = tf.placeholder(tf.float32, [None, self._dim_state])
        # Reuse the same weights (scope='fc'/'qvalues', reuse=True) to evaluate the
        # next states, so the architecture must match the one built from network_layers above.
        net = slim.stack(next_states, slim.fully_connected, network_layers, activation_fn=tf.nn.relu, scope='fc', reuse=True)
        next_action_values = slim.fully_connected(net, self._dim_action, activation_fn=None, scope='qvalues', reuse=True)
        next_values = prev_rewards + self._gamma * tf.reduce_max(next_action_values, reduction_indices=1)
        loss = tf.reduce_mean(tf.square(prev_values - next_values))
        training = tf.train.AdamOptimizer(1e-4).minimize(loss)
        self._tf_action_value_predict = prev_action_values
        self._tf_prev_states = prev_states
        self._tf_prev_action_masks = prev_action_masks
        self._tf_prev_rewards = prev_rewards
        self._tf_next_states = next_states
        self._tf_training = training
        self._tf_loss = loss
        self._tf_session = tf.InteractiveSession()
        self._tf_session.run(tf.initialize_all_variables())
        # Build the replay memory D, which stores experiences for training.
        self._time = 0
        self._epsilon = 1.0
        self._epsilon_decay_time = 100
        self._epsilon_decay_rate = 0.9
        self._experiences_max = 1000
        self._experiences_num = 0
        self._experiences_prev_states = np.zeros((self._experiences_max, self._dim_state))
        self._experiences_next_states = np.zeros((self._experiences_max, self._dim_state))
        self._experiences_rewards = np.zeros((self._experiences_max))
        self._experiences_actions_mask = np.zeros((self._experiences_max, self._dim_action))
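    # Experiences are stored in fixed-size numpy arrays; actions are kept as
    # one-hot masks so the graph can select Q(s, a) with an element-wise product.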
    def create_experience(self, prev_state, prev_action, reward, next_state):
        """
        Keep an experience for later training.
        """
        if self._experiences_num >= self._experiences_max:
            # The memory is full: overwrite a randomly chosen slot.
            idx = np.random.choice(self._experiences_max)
        else:
            idx = self._experiences_num
            self._experiences_num += 1
        self._experiences_prev_states[idx] = np.array(prev_state)
        self._experiences_next_states[idx] = np.array(next_state)
        self._experiences_rewards[idx] = reward
        self._experiences_actions_mask[idx] = np.zeros(self._dim_action)
        self._experiences_actions_mask[idx, prev_action] = 1.0
    def train(self):
        """
        Train the deep q-learning network.
        """
        # Start training only when there are enough experiences.
        if self._experiences_num < self._experiences_max:
            return
        ixs = np.random.choice(self._experiences_max, self._batch_size, replace=True)
        fetches = [self._tf_loss, self._tf_training]
        feed = {
            self._tf_prev_states: self._experiences_prev_states[ixs],
            self._tf_prev_action_masks: self._experiences_actions_mask[ixs],
            self._tf_prev_rewards: self._experiences_rewards[ixs],
            self._tf_next_states: self._experiences_next_states[ixs]
        }
        loss, _ = self._tf_session.run(fetches, feed_dict=feed)
    def act(self, observation, reward, done):
        """
        Ask the agent for its next action given the current observation.
        """
        self._time += 1
        # Decay epsilon on a fixed schedule.
        if self._time % self._epsilon_decay_time == 0:
            self._epsilon *= self._epsilon_decay_rate
        # Epsilon-greedy action selection.
        if np.random.rand() > self._epsilon:
            states = np.array([observation])
            action_values = self._tf_action_value_predict.eval(
                feed_dict={self._tf_prev_states: states})
            action = np.argmax(action_values)
        else:
            action = self._action_space.sample()
        if self._prev_state is not None:
            if done:
                # Penalize the terminal transition and zero out the next state.
                reward = -500.0
                observation = np.zeros_like(observation)
            self.create_experience(
                self._prev_state, self._prev_action, reward, observation)
        self._prev_state = None if done else observation
        self._prev_action = None if done else action
        self._prev_reward = 0 if done else self._prev_reward + reward
        self.train()
        return action
if __name__ == '__main__':
    # Environment settings
    max_episodes = 1000
    max_steps = 200
    # Agent settings
    network_layers = [128, 128, 128]
    # Recording settings
    record = True
    save_filename = ''
    api_key = ''
    algorithm_id = ''  # Deep Q-learning
    # Initialize simulation
    env = gym.make('CartPole-v0')
    # Start recording
    if record:
        env.monitor.start(save_filename, force=True)
    # Create agent
    agent = DeepQLearningAgent(env.observation_space, env.action_space, network_layers)
    # Run simulation
    for episode in xrange(max_episodes):
        observation, reward, done = env.reset(), 0.0, False
        for step in xrange(max_steps):
            action = agent.act(observation, reward, done)
            if done or step + 1 == max_steps:
                print("{} - {}".format(episode, step))
                break
            observation, reward, done, _ = env.step(action)
    # Stop recording and upload to gym
    if record:
        env.monitor.close()
        gym.upload(save_filename, api_key=api_key, algorithm_id=algorithm_id)