Retraining InceptionV4 final layer for new categories: variable not initialized

I'm still new to TensorFlow, so I'm sorry if this is a naive question. I am trying to fine-tune the pretrained InceptionV4 model, and I am using their network as is, I mean the slim implementation published on their site.

This is what I call the network:

    def network(images_op, keep_prob):
        width_needed_InceptionV4Net = 342
        shape = images_op.get_shape().as_list()
        H = int(round(width_needed_InceptionV4Net * shape[1] / shape[2], 2))
        resized_images = tf.image.resize_images(images_op, [width_needed_InceptionV4Net, H],
                                                tf.image.ResizeMethod.BILINEAR)
        with slim.arg_scope(inception.inception_v4_arg_scope()):
            logits, _ = inception.inception_v4(resized_images, num_classes=20, is_training=True,
                                               dropout_keep_prob=keep_prob)
        return logits

Since I need to retrain the final InceptionV4 layer for my own categories, I changed the number of classes to 20, as you can see in the call to inception.inception_v4.
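For context, my understanding is that the usual TF-slim way to restore everything except the retrained layers looks roughly like the sketch below. This is not my actual code: it assumes TF 1.x with tf.contrib.slim, assumes the inception graph has already been built (e.g. via the network function above), and the checkpoint path is a placeholder.

    import tensorflow as tf
    import tensorflow.contrib.slim as slim

    # Sketch: build the InceptionV4 graph with num_classes=20 first, then restore
    # every pretrained variable except the class-dependent logits scopes.
    variables_to_restore = slim.get_variables_to_restore(
        exclude=['InceptionV4/Logits', 'InceptionV4/AuxLogits'])
    restorer = tf.train.Saver(variables_to_restore)

    with tf.Session() as sess:
        # Initialize everything (including the new 20-class logits), then overwrite
        # the restorable weights with the pretrained values.
        sess.run(tf.global_variables_initializer())
        restorer.restore(sess, 'inception_v4.ckpt')  # placeholder checkpoint path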

Here are the restore and train methods I currently have:

    def optimistic_restore(session, save_file, flags):
        reader = tf.train.NewCheckpointReader(save_file)
        saved_shapes = reader.get_variable_to_shape_map()
        var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                            if var.name.split(':')[0] in saved_shapes])
        restore_vars = []
        name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()),
                            tf.global_variables()))
        if flags.checkpoint_exclude_scopes is not None:
            exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]
        with tf.variable_scope('', reuse=True):
            variables_to_init = []
            for var_name, saved_var_name in var_names:
                curr_var = name2var[saved_var_name]
                var_shape = curr_var.get_shape().as_list()
                if var_shape == saved_shapes[saved_var_name]:
                    print(saved_var_name)
                    excluded = False
                    for exclusion in exclusions:
                        if saved_var_name.startswith(exclusion):
                            variables_to_init.append(curr_var)
                            excluded = True
                            break
                    if not excluded:
                        restore_vars.append(curr_var)
        saver = tf.train.Saver(restore_vars)
        saver.restore(session, save_file)


    def train(images, ids, labels, total_num_examples, batch_size, train_dir, network, flags,
              optimizer, log_periods, resume):
        """!@brief Trains the network for a number of steps.

        @param images              image tensor
        @param ids                 id tensor
        @param labels              label tensor
        @param total_num_examples  total number of training examples
        @param batch_size          batch size
        @param train_dir           directory where checkpoints should be saved
        @param network             pointer to a function describing the network
        @param flags               command-line arguments
        @param optimizer           pointer to the optimization class
        @param log_periods         list containing the step intervals at which 1) logs should be
                                   printed, 2) logs should be saved for TensorBoard and
                                   3) variables should be saved
        @param resume              should training be resumed (or restarted from scratch)?

        @return the number of training steps performed since the first call to 'train'
        """

        # clearing the training directory
        if not resume:
            if tf.gfile.Exists(train_dir):
                tf.gfile.DeleteRecursively(train_dir)
            tf.gfile.MakeDirs(train_dir)
        print('Training the network in directory %s...' % train_dir)

        global_step = tf.Variable(0, trainable=False)

        # creating a placeholder, set to ones, used to assess the importance of each pixel
        mask, ones = _mask(images, batch_size, flags)

        # building a Graph that computes the logits predictions from the inference model
        keep_prob = tf.placeholder_with_default(0.5, [])
        logits = network(images * mask, keep_prob)

        # creating the optimizer
        if optimizer == tf.train.MomentumOptimizer:
            opt = optimizer(flags.learning_rate, flags.momentum)
        else:
            opt = optimizer(flags.learning_rate)

        # calculating the semantic loss, defined as the classification or regression loss
        if flags.boosting_weights is not None and os.path.isfile(flags.boosting_weights):
            boosting_weights_value = np.loadtxt(flags.boosting_weights, dtype=np.float32,
                                                delimiter=',')
            boosting_weights = tf.placeholder_with_default(boosting_weights_value,
                                                           list(boosting_weights_value.shape),
                                                           name='boosting_weights')
            semantic_loss = _boosting_loss(logits, ids, boosting_weights, flags)
        else:
            semantic_loss = _loss(logits, labels, flags)
        tf.add_to_collection('losses', semantic_loss)

        # computing the loss gradient with respect to the mask (ie the insight tensor) and
        # penalizing its L1-norm
        # replace 'semantic_loss' with 'tf.reduce_sum(logits)'?
        insight = tf.gradients(semantic_loss, [mask])[0]
        insight_loss = tf.reduce_sum(tf.abs(insight))
        if flags.insight_loss > 0.0:
            with tf.control_dependencies([semantic_loss]):
                tf.add_to_collection('losses',
                                     tf.multiply(flags.insight_loss, insight_loss,
                                                 name='insight_loss'))
        else:
            tf.summary.scalar('insight_loss_raw', insight_loss)

        # summing all loss factors and computing the moving average of all individual losses and of
        # the sum
        loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        loss_averages_op = tf.train.ExponentialMovingAverage(0.9, name='avg')
        losses = tf.get_collection('losses')
        loss_averages = loss_averages_op.apply(losses + [loss])

        # attaching a scalar summary to all individual losses and the total loss;
        # do the same for the averaged version of the losses
        for l in losses + [loss]:
            tf.summary.scalar(l.op.name + '_raw', l)
            tf.summary.scalar(l.op.name + '_avg', loss_averages_op.average(l))

        # computing and applying gradients
        with tf.control_dependencies([loss_averages]):
            grads = opt.compute_gradients(loss)
        apply_gradient = opt.apply_gradients(grads, global_step=global_step)

        # adding histograms for trainable variables and gradients
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)
        tf.summary.histogram('insight', insight)

        # tracking the moving averages of all trainable variables
        variable_averages_op = tf.train.ExponentialMovingAverage(flags.moving_average_decay,
                                                                 global_step)
        variable_averages = variable_averages_op.apply(tf.trainable_variables())

        # building a Graph that trains the model with one batch of examples and
        # updates the model parameters
        with tf.control_dependencies([apply_gradient, variable_averages]):
            train_op = tf.no_op(name='train')

        # creating a saver
        saver = tf.train.Saver(tf.global_variables())

        # building the summary operation based on the TF collection of Summaries
        summary_op = tf.summary.merge_all()

        # creating a session
        current_global_step = -1
        with tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                              inter_op_parallelism_threads=flags.num_cpus,
                                              device_count={'GPU': flags.num_gpus})) as sess:

            # initializing variables
            if flags.checkpoint_exclude_scopes is not None:
                optimistic_restore(sess, os.path.join(train_dir, 'inception_V4.ckpt'), flags)

            # starting the queue runners
            ..

            # creating a summary writer
            ..

            # training itself
            ..

            # saving the model checkpoint
            checkpoint_path = os.path.join(train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=current_global_step)

            # stopping the queue runners
            ..

        return current_global_step

I added a flag called checkpoint_exclude_scopes to the Python script, which specifies exactly which tensors should not be restored from the checkpoint. This is necessary because I changed the number of classes in the last layer of the network. This is how I call the Python script:

 ./toolDetectionInceptions.py --batch_size=32 --running_mode=resume --checkpoint_exclude_scopes=InceptionV4/Logits,InceptionV4/AuxLogits 
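For completeness, the flag itself is nothing special; it could be declared with tf.app.flags roughly as in the sketch below (these are not necessarily my exact definitions, just an illustration of the command line above):

    import tensorflow as tf

    # Sketch of the flag definitions matching the command line above (names are assumptions).
    tf.app.flags.DEFINE_integer('batch_size', 32, 'Number of images per training batch.')
    tf.app.flags.DEFINE_string('running_mode', 'restart', "Either 'restart' or 'resume'.")
    tf.app.flags.DEFINE_string('checkpoint_exclude_scopes', None,
                               'Comma-separated list of variable scopes to exclude when restoring '
                               'from the pretrained checkpoint (here the retrained logits layers).')
    FLAGS = tf.app.flags.FLAGS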

My first attempts failed with errors like:

 tensorflow.python.framework.errors.NotFoundError: Tensor name "InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/weights/read:0" not found in checkpoint files 

After some googling, I found a workaround (on this very site) suggesting the optimistic_restore function shown in the code above, with a few modifications of my own.

But now the problem is different:

 W tensorflow/core/framework/op_kernel.cc:993] Failed precondition: Attempting to use uninitialized value Variable [[Node: Variable/read = Identity[T=DT_INT32, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable)]] 

There seems to be a variable that has not been initialized, but I cannot figure out which one. Can you please help?
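If it helps, one way to list exactly which variables a session still considers uninitialized is tf.report_uninitialized_variables(); a minimal sketch (the helper name is mine), to be called after the restore:

    import tensorflow as tf

    def print_uninitialized(sess):
        # Names of all global variables that have not been initialized (or restored)
        # in this session.
        names = sess.run(tf.report_uninitialized_variables())
        print('uninitialized variables: %s' % sorted(n.decode('utf-8') for n in names))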

Edit:

To debug this problem, I counted the variables that need to be initialized and restored by adding some logging to the optimistic_restore function. Here is a quick summary:

    # saved_shapes 609
    # var_names 608
    # name2var 1519
    # variables_to_init: 7
    # restore_vars: 596
    # global_variables: 1519

For reference, CheckpointReader.get_variable_to_shape_map() returns a dict mapping tensor names to lists of ints representing the shape of the corresponding tensor in the checkpoint. This means that the number of variables stored in this checkpoint is 609, while the total number of global variables in my graph is 1519.

There seems to be a huge gap between the tensors in the pretrained checkpoint and the variables used by the network architecture (which is their own architecture, used as is). Is there some kind of compression applied to the checkpoint, or am I misreading this? I now know what was missing: it is simply the initialization of the variables that were not restored. However, I would still like to know why there is such a large difference between the InceptionV4 network architecture and its pretrained checkpoint.
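To investigate the gap, the graph variables that have no counterpart in the checkpoint can be grouped by the last component of their name; a rough diagnostic sketch (my guess is that most of them are ExponentialMovingAverage copies and optimizer slots created in train(), plus global_step, but I have not verified this):

    import collections
    import tensorflow as tf

    def diff_graph_vs_checkpoint(save_file):
        # Count the graph variables missing from the checkpoint, grouped by name suffix
        # (e.g. 'ExponentialMovingAverage', 'Momentum', 'Variable').
        reader = tf.train.NewCheckpointReader(save_file)
        saved_shapes = reader.get_variable_to_shape_map()
        missing = [v.name.split(':')[0] for v in tf.global_variables()
                   if v.name.split(':')[0] not in saved_shapes]
        counts = collections.Counter(name.split('/')[-1] for name in missing)
        for suffix, count in counts.most_common():
            print('%6d  %s' % (count, suffix))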

2 answers

Here is how I had to define the optimistic_restore function to make it work properly:

    def optimistic_restore(session, save_file, flags):
        if flags.checkpoint_exclude_scopes is not None:
            exclusions = [scope.strip() for scope in flags.checkpoint_exclude_scopes.split(',')]

        reader = tf.train.NewCheckpointReader(save_file)
        saved_shapes = reader.get_variable_to_shape_map()
        print('saved_shapes %d' % len(saved_shapes))

        var_names = sorted([(var.name, var.name.split(':')[0]) for var in tf.global_variables()
                            if var.name.split(':')[0] in saved_shapes])
        var_names_to_be_initialized = sorted([(var.name, var.name.split(':')[0])
                                              for var in tf.global_variables()
                                              if var.name.split(':')[0] not in saved_shapes])
        print('var_names %d' % len(var_names))
        print('var_names_to_be_initialized %d' % len(var_names_to_be_initialized))

        restore_vars = []
        name2var = dict(zip(map(lambda x: x.name.split(':')[0], tf.global_variables()),
                            tf.global_variables()))
        print('name2var %d' % len(name2var))

        with tf.variable_scope('', reuse=True):
            variables_to_init = []
            for var_name, saved_var_name in var_names:
                curr_var = name2var[saved_var_name]
                var_shape = curr_var.get_shape().as_list()
                if var_shape == saved_shapes[saved_var_name]:
                    excluded = False
                    for exclusion in exclusions:
                        if saved_var_name.startswith(exclusion):
                            variables_to_init.append(curr_var)
                            excluded = True
                            break
                    if not excluded:
                        restore_vars.append(curr_var)
                else:
                    variables_to_init.append(curr_var)
            for var_name, saved_var_name in var_names_to_be_initialized:
                curr_var = name2var[saved_var_name]
                variables_to_init.append(curr_var)

        print('variables_to_init: %d' % len(variables_to_init))
        print('global_variables: %d' % len(tf.global_variables()))
        print('restore_vars: %d' % len(restore_vars))

        saver = tf.train.Saver(restore_vars)
        saver.restore(session, save_file)

        # variables that were excluded or are absent from the checkpoint are initialized here
        session.run(tf.variables_initializer(variables_to_init))
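The idea, as I understand it, is to restore only the variables whose name and shape both match an entry in the checkpoint, and to explicitly initialize everything else: the excluded InceptionV4/Logits and InceptionV4/AuxLogits variables, plus every graph variable that simply does not exist in the checkpoint, which in the train() code above includes global_step and the variables created by the ExponentialMovingAverage and the optimizer. The final session.run(tf.variables_initializer(variables_to_init)) line is what makes the "Attempting to use uninitialized value" error go away.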

Variables that are not restored by the saver must be initialized. To that end, you can run v.initializer.run() for every variable v that you do not restore.
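A minimal sketch of that advice, assuming restored_vars is the list of variables that was passed to tf.train.Saver in the question's optimistic_restore:

    import tensorflow as tf

    def init_unrestored(sess, restored_vars):
        # Initialize every global variable that was not restored from the checkpoint.
        restored = set(restored_vars)
        for v in tf.global_variables():
            if v not in restored:
                v.initializer.run(session=sess)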

