Explanation of the Fundamental Functions Involved in the A3C Algorithm
Although any implementation of the Asynchronous Advantage Actor-Critic (A3C) algorithm is bound to be complex, all implementations have one thing in common: a global network class and a worker class.
- Global Network class: Contains all the TensorFlow operations required to build the shared (global) neural network.
- Worker class: Simulates the learning process of a single worker, which has its own copy of the environment and a "personal" neural network.
The following implementation requires these modules:
- NumPy
- TensorFlow (including the tf.contrib.slim module, imported as slim)
- Multiprocessing
- Threading
- VizDoom (which provides the DoomGame environment used by the workers)
The following snippets show the fundamental functions required to build each class.
Global Network class:
# Defining the Network class
class AC_Network():
The following snippets describe the member functions of the class defined above.
Initializing the class:
# Initializing the class
def __init__(self, s_size, a_size, scope, trainer):
    with tf.variable_scope(scope):

        # Input and visual encoding layers
        self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
        self.imageIn = tf.reshape(self.inputs, shape=[-1, 84, 84, 1])
        self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                                 inputs=self.imageIn, num_outputs=16,
                                 kernel_size=[8, 8], stride=[4, 4],
                                 padding='VALID')
        self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                                 inputs=self.conv1, num_outputs=32,
                                 kernel_size=[4, 4], stride=[2, 2],
                                 padding='VALID')
        hidden = slim.fully_connected(slim.flatten(self.conv2), 256,
                                      activation_fn=tf.nn.elu)
-> tf.placeholder() - Inserts a placeholder for a tensor that will always be fed.
-> tf.reshape() - Reshapes the input tensor.
-> slim.conv2d() - Adds a convolutional layer.
-> slim.fully_connected() - Adds a fully connected layer.
Note the following definitions (a short sketch after this list shows how they determine the size of each convolutional output):
- Filter: A small matrix of weights that is slid over the image to apply an effect or extract a feature.
- Padding: The process of adding extra rows or columns of pixels along the boundaries of an image so that the filter convolution can be computed over the entire image.
- Stride: The number of pixels by which the filter is shifted in a given direction at each step.
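As a quick check of how these quantities interact, the helper below (an illustrative sketch, not part of the original code) computes the spatial output size of a convolution with 'VALID' padding, which is how the 84 x 84 input shrinks through conv1 and conv2 above.

def valid_conv_output_size(input_size, kernel_size, stride):
    # With 'VALID' padding no pixels are added, so the filter must fit
    # entirely inside the image: floor((input - kernel) / stride) + 1.
    return (input_size - kernel_size) // stride + 1

# conv1: 84x84 input, 8x8 kernel, stride 4  -> 20x20 feature maps
print(valid_conv_output_size(84, 8, 4))   # 20
# conv2: 20x20 input, 4x4 kernel, stride 2 -> 9x9 feature maps
print(valid_conv_output_size(20, 4, 2))   # 9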
Building the Recurrent network:
def __init__(self, s_size, a_size, scope, trainer):
    with tf.variable_scope(scope):
        ...

        # Building the Recurrent network for temporal dependencies
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(256, state_is_tuple=True)
        c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        self.state_in = (c_in, h_in)
        rnn_in = tf.expand_dims(hidden, [0])
        step_size = tf.shape(self.imageIn)[:1]
        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm_cell, rnn_in,
                                                     initial_state=state_in,
                                                     sequence_length=step_size,
                                                     time_major=False)
        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        rnn_out = tf.reshape(lstm_outputs, [-1, 256])
-> tf.nn.rnn_cell.BasicLSTMCell() - Builds a basic LSTM recurrent network cell.
-> tf.expand_dims() - Inserts a dimension of 1 at the given axis of the input's shape.
-> tf.shape() - Returns the shape of the tensor.
-> tf.nn.rnn_cell.LSTMStateTuple() - Creates the (c, h) state tuple used by the LSTM cell for its state_size, zero_state and output state.
-> tf.nn.dynamic_rnn() - Builds a recurrent network from the given recurrent network cell.
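To see why tf.expand_dims() is needed here, note that the batch of convolutional features has shape (N, 256); adding a leading dimension turns it into a single sequence of N time steps with batch size 1, which is the (batch, time, features) layout dynamic_rnn expects when time_major=False. A small NumPy illustration (not part of the original code):

import numpy as np

hidden = np.zeros((32, 256), dtype=np.float32)   # 32 frames, 256 features each

# Insert a leading axis: the 32 frames become one sequence of 32 time steps
# for a single "batch" element, matching time_major=False in dynamic_rnn.
rnn_in = np.expand_dims(hidden, axis=0)
print(rnn_in.shape)   # (1, 32, 256)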
Building the output layers for value and policy estimation:
def __init__(self, s_size, a_size, scope, trainer):
    with tf.variable_scope(scope):
        ...

        # Building the output layers for value and policy estimations
        self.policy = slim.fully_connected(rnn_out, a_size,
                                           activation_fn=tf.nn.softmax,
                                           weights_initializer=normalized_columns_initializer(0.01),
                                           biases_initializer=None)
        self.value = slim.fully_connected(rnn_out, 1,
                                          activation_fn=None,
                                          weights_initializer=normalized_columns_initializer(1.0),
                                          biases_initializer=None)
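The helper normalized_columns_initializer() used above is not shown in these snippets. Treat the following as a sketch of the assumed helper, in the form commonly used in A3C Doom implementations: it draws random weights and rescales each column to a fixed norm, with a small scale (0.01) for the policy head and 1.0 for the value head.

import numpy as np
import tensorflow as tf

def normalized_columns_initializer(std=1.0):
    # Returns an initializer whose weight columns all have L2 norm equal to `std`.
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer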
Building the master network and deploying the workers (this step belongs to the main training script rather than the AC_Network class):
with tf.device("/cpu:0"):

    # Generating the global network
    master_network = AC_Network(s_size, a_size, 'global', None)

    # Keeping the number of workers
    # as the number of available CPU threads
    num_workers = multiprocessing.cpu_count()

    # Creating and deploying the workers
    workers = []
    for i in range(num_workers):
        workers.append(Worker(DoomGame(), i, s_size, a_size,
                              trainer, saver, model_path))
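The snippet above assumes that s_size, a_size, trainer, saver and model_path have already been defined earlier in the script. For illustration only, a typical setup for the basic Doom scenario might look like the sketch below; the concrete values are assumptions, not part of the original code.

import tensorflow as tf

# Assumed hyperparameters (illustrative values only)
model_path = './model'      # directory used for checkpoints
max_episode_length = 300    # cap on the length of an episode
gamma = 0.99                # discount factor for future rewards
s_size = 7056               # 84 * 84 greyscale pixels, flattened
a_size = 3                  # number of discrete actions available to the agent
load_model = False          # whether to resume from a saved checkpoint

# Shared optimizer applied to the global network's parameters
trainer = tf.train.AdamOptimizer(learning_rate=1e-4)

# The saver is created once the network variables exist,
# e.g. saver = tf.train.Saver(max_to_keep=5) after AC_Network has been built.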
Running the parallel TensorFlow operations (also in the main training script):
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model == True:
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    # Starting one thread per worker
    worker_threads = []
    for worker in workers:
        # Binding `worker` as a default argument so each thread
        # uses its own worker instance
        worker_work = lambda worker=worker: worker.work(max_episode_length, gamma,
                                                        master_network, sess, coord)
        t = threading.Thread(target=worker_work)
        t.start()
        worker_threads.append(t)
    coord.join(worker_threads)
-> tf.Session() - A class for running TensorFlow operations.
-> tf.train.Coordinator() - Returns a coordinator for the multiple worker threads.
-> tf.train.get_checkpoint_state() - Returns a valid checkpoint state from the "checkpoint" file.
-> saver.restore() - Restores the previously saved model variables into the session.
-> sess.run() - Runs the requested operations and returns the resulting tensors and metadata.
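The Coordinator is what lets all worker threads be stopped and joined cleanly. The toy example below is unrelated to A3C itself and only illustrates the pattern: each thread polls coord.should_stop(), any thread can call coord.request_stop(), and coord.join() waits for all of them to finish.

import threading
import tensorflow as tf

coord = tf.train.Coordinator()

def toy_worker(worker_id):
    steps = 0
    while not coord.should_stop():
        steps += 1
        if steps >= 1000:
            # Any thread can ask every other thread to stop.
            coord.request_stop()
    print("worker", worker_id, "stopped after", steps, "steps")

threads = [threading.Thread(target=toy_worker, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
coord.join(threads)   # blocks until all threads have finished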
Updating the global network parameters:
def __init__(self, s_size, a_size, scope, trainer):
    with tf.variable_scope(scope):
        ...

        if scope != 'global':
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
            self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)
            self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

            # Computing the error
            self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v -
                                                            tf.reshape(self.value, [-1])))
            self.entropy = -tf.reduce_sum(self.policy * tf.log(self.policy))
            self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs) * self.advantages)
            self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

            # Get gradients from the local network
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)
            grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, 40.0)

            # Apply the local gradients to the global network
            global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
-> tf.one_hot() - Returns a one-hot encoded tensor.
-> tf.reduce_sum() - Sums the elements of the input tensor along the given dimensions.
-> tf.gradients() - Constructs the symbolic derivatives of the loss with respect to the local variables.
-> tf.clip_by_global_norm() - Clips the values of multiple tensors by the ratio of the sum of their norms.
-> trainer.apply_gradients() - Performs the update step according to the optimizer.
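The combination of tf.one_hot() and tf.reduce_sum() above simply picks out, for each time step, the probability the policy assigned to the action that was actually taken. A NumPy illustration of the same computation, with made-up values:

import numpy as np

policy = np.array([[0.2, 0.5, 0.3],     # action probabilities at step 0
                   [0.7, 0.1, 0.2]])    # action probabilities at step 1
actions = np.array([1, 0])              # actions actually taken

actions_onehot = np.eye(3)[actions]     # one-hot encoding, shape (2, 3)
responsible_outputs = np.sum(policy * actions_onehot, axis=1)
print(responsible_outputs)              # [0.5  0.7]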
Defining a utility function to copy the parameters from one network to another:
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var, to_var in zip(from_vars, to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder
-> tf.get_collection() - Returns the list of values in the collection with the given name.
Worker Class:
Defining the Class:
# Defining the Worker Class
class Worker():
The following snippets describe the member functions of the class defined above.
Initializing the class:
# Initializing the class
def __init__(self, game, name, s_size, a_size, trainer, saver, model_path):

    # Creating a copy of the environment and the network
    self.local_AC = AC_Network(s_size, a_size, self.name, trainer)
    self.update_local_ops = update_target_graph('global', self.name)
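The constructor shown above is abridged: the work() function below also relies on self.name, self.env and self.actions, none of which are set in the snippet. A fuller sketch of the constructor, with those assignments filled in as assumptions inferred from that later usage, could look like:

def __init__(self, game, name, s_size, a_size, trainer, saver, model_path):
    self.name = "worker_" + str(name)   # scope name for this worker's local network

    # Creating a copy of the environment and the network
    self.env = game
    self.local_AC = AC_Network(s_size, a_size, self.name, trainer)
    self.update_local_ops = update_target_graph('global', self.name)

    # One boolean "button vector" per discrete action, as expected by
    # DoomGame.make_action() in work() (assumed; follows the usual Doom setup).
    self.actions = np.identity(a_size, dtype=bool).tolist()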
Defining the function for the worker to interact with its environment:
def work(self, max_episode_length, gamma, global_AC, sess, coord):
    episode_count = 0
    total_steps = 0
    with sess.as_default(), sess.graph.as_default():
        while not coord.should_stop():
            sess.run(self.update_local_ops)
            episode_buffer = []
            episode_values = []
            episode_frames = []
            episode_reward = 0
            episode_step_count = 0
            d = False

            # Starting a new episode and reading the first frame
            # into the frame buffer
            self.env.new_episode()
            s = self.env.get_state().screen_buffer
            episode_frames.append(s)
            s = process_frame(s)
            rnn_state = self.local_AC.state_init

            while self.env.is_episode_finished() == False:

                # Sampling an action from the current policy
                a_dist, v, rnn_state = sess.run(
                    [self.local_AC.policy,
                     self.local_AC.value,
                     self.local_AC.state_out],
                    feed_dict={self.local_AC.inputs: [s],
                               self.local_AC.state_in[0]: rnn_state[0],
                               self.local_AC.state_in[1]: rnn_state[1]})
                a = np.random.choice(a_dist[0], p=a_dist[0])
                a = np.argmax(a_dist == a)

                # Computing the reward
                r = self.env.make_action(self.actions[a]) / 100.0
                d = self.env.is_episode_finished()
                if d == False:
                    s1 = self.env.get_state().screen_buffer
                    episode_frames.append(s1)
                    s1 = process_frame(s1)
                else:
                    s1 = s

                episode_buffer.append([s, a, r, s1, d, v[0, 0]])
                episode_values.append(v[0, 0])
                episode_reward += r
                s = s1
                total_steps += 1
                episode_step_count += 1
-> sess.as_default() - Sets the given session as the default session.
-> self.env.new_episode() - Initializes a new training episode for the worker.
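The helper process_frame() used in work() is not shown here. Its job is to turn the raw Doom screen buffer into the flat 84 * 84 vector that the network's input placeholder expects. A minimal sketch, which only resizes, flattens and normalises the frame (the original tutorial's version may also crop it, and OpenCV is an assumption here, used purely for resizing):

import numpy as np
import cv2  # assumption: OpenCV is available for image resizing

def process_frame(frame):
    # Downscale the greyscale screen buffer to 84x84, flatten it and
    # normalise pixel values to [0, 1] so it matches s_size = 84 * 84.
    s = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
    return s.astype(np.float32).reshape(-1) / 255.0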
Defining the training function for the worker:
def train(self, global_AC, rollout, sess, gamma, bootstrap_value):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    next_observations = rollout[:, 3]
    values = rollout[:, 5]

    # Calculating the discounted rewards and the advantage function
    self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
    self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
    advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
    advantages = discount(advantages, gamma)

    # Updating the global network parameters
    rnn_state = self.local_AC.state_init
    feed_dict = {self.local_AC.target_v: discounted_rewards,
                 self.local_AC.inputs: np.vstack(observations),
                 self.local_AC.actions: actions,
                 self.local_AC.advantages: advantages,
                 self.local_AC.state_in[0]: rnn_state[0],
                 self.local_AC.state_in[1]: rnn_state[1]}
    v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                           self.local_AC.policy_loss,
                                           self.local_AC.entropy,
                                           self.local_AC.grad_norms,
                                           self.local_AC.var_norms,
                                           self.local_AC.apply_grads],
                                          feed_dict=feed_dict)
    return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n
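Similarly, the discount() helper used above is not defined in these snippets. It computes the discounted cumulative sum of a sequence, i.e. out[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...; implementations commonly use scipy.signal.lfilter for this, but a plain NumPy sketch of such a helper is:

import numpy as np

def discount(x, gamma):
    # out[t] = x[t] + gamma * x[t + 1] + gamma^2 * x[t + 2] + ...
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

print(discount([1.0, 1.0, 1.0], 0.99))   # [2.9701  1.99  1.]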