Loss plateaus in neural style transfer

I am writing an implementation of neural style transfer by loading a VGG16 model from Keras and using its layers inside a TensorFlow graph.

I am using the Adam optimizer. The loss is decreasing, but very slowly, and it plateaus at about 10^8. Additionally, the colors of the generated image seem to be shifting in the right direction, but the image itself is still clearly noise.

Also, the style loss is huge (on the order of 10^8) whereas the content loss is much smaller (on the order of 10^5). This is odd, because the style transfer paper says to scale the content loss down by a factor of 100 or 1000 when computing the total loss.
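To make the weighting explicit, the total loss I understand the paper to describe looks roughly like this (a minimal sketch with illustrative weights and placeholder names, not the exact values from my notebook):

# content term scaled down by roughly 1000x relative to the style term
style_weight = 1.0
content_weight = 1e-3  # i.e. a content/style weight ratio of about 1/1000
total = style_weight * style_loss_value + content_weight * content_loss_value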

I tried increasing the learning rate, but that only makes the gradient steps overshoot.
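(To be concrete about the overshoot: the alternative I understand people use instead of a larger learning rate is clipping the gradients before applying them. A sketch using the standard TF 1.x compute_gradients/apply_gradients API and the variable names from the code below; the learning rate and clip norm here are arbitrary guesses, not values I have tuned:)

# sketch: clip gradients on the generated image instead of raising the learning rate
opt = tf.train.AdamOptimizer(2.0)  # learning rate is a guess
grads_and_vars = opt.compute_gradients(total_loss, var_list=[o_input_var])
clipped = [(tf.clip_by_norm(g, 10.0), v) for g, v in grads_and_vars]
train_step = opt.apply_gradients(clipped)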

I suspect there must be a bug in my implementation, but despite searching endlessly I have been unable to find what's wrong.

Here's the code:

# coding: utf-8
# In[1]:

from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


content_image_path = './skyline.jpg'
style_image_path = './starry_night.jpg'
output_image_path = './output.jpg'

# In[4]:

from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input

# In[5]:

content_image = image.load_img(content_image_path, target_size=(224, 224))
#plt.imshow(content_image)
content_arr = image.img_to_array(content_image)
content_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(content_arr, axis=0)), tf.float64)
print(content_arr.get_shape())  # sanity-check the shape (no session exists at this point)

# In[6]:

style_image = image.load_img(style_image_path, target_size=(224, 224))
#plt.imshow(style_image)
style_arr = image.img_to_array(style_image)
style_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(style_arr, axis=0)), tf.float64)
print(style_arr.get_shape())  # sanity-check the shape (no session exists at this point)

# In[7]:

#generate random image with pixel values b/w 0 - 255
o_input = np.random.randint(low=0, high=256, size=(224, 224, 3)).astype('float64')
plt.imshow(o_input.astype('uint8'))  # imshow expects uint8 values in 0-255 (or floats in 0-1)
o_input_old = np.copy(o_input)
o_input = preprocess_input(np.expand_dims(o_input, axis=0))
print(o_input_old)

o_input_var = tf.Variable(o_input, name='gen_img_vector', trainable=True)

# In[8]:

content_model = VGG16(include_top=False, weights='imagenet', input_tensor=content_arr, input_shape=(224, 224, 3))
style_model = VGG16(include_top=False, weights='imagenet', input_tensor=style_arr, input_shape=(224, 224, 3))
train_model = VGG16(include_top=False, weights='imagenet', input_tensor=o_input_var, input_shape=(224, 224, 3))

# In[10]:

content_model.summary()

# In[11]:

def get_feature_rep(layer_type, layer_names, model):

    outputs = []
    for name in layer_names:
        out = model.get_layer(name=name).output

        N = tf.shape(out)[3]  # number of channels
        M = tf.multiply(tf.shape(out)[1], tf.shape(out)[2])  # pixels per channel (H * W)

        out = tf.transpose(tf.reshape(out, (M, N)))  # flatten each channel into a row: shape (N, M)
        if layer_type == 'style':
            out = get_gram_matrix(out)
        print(out)
        outputs.append(out)
    return outputs

# In[12]:

def get_gram_matrix(F):
    G = tf.matmul(F, tf.transpose(F))
    return G


# In[13]:


def style_loss(Gs, As):

    total = tf.Variable(tf.constant(0.0, tf.float64), name='style_loss', trainable=False)
    style_reps = list(zip(Gs, As))

    for layer in style_reps:
        loss = tf.reduce_sum(tf.cast(tf.squared_difference(layer[0], layer[1]), tf.float64), [0, 1])
        N_layer = tf.shape(layer[0])[0]
        M_layer = tf.shape(layer[0])[1]
        den = tf.square(tf.cast(tf.multiply(N_layer, M_layer), tf.float64))
        loss = loss/den
        loss = loss*0.2/4.0  # layer weight of 0.2 and the 1/4 factor from the style loss definition
        total = total + loss

    return total


# In[14]:

def content_loss(P, F):
#     loss = tf.Variable(tf.constant(0.0, tf.float64), name=content_loss, trainable=False)
    loss = tf.reduce_sum(tf.cast(tf.squared_difference(P, F), tf.float64), [0, 1])
    loss = loss/2.0
    return loss

# In[15]:

content_layer_names = ['block4_conv2']
style_layer_names = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1']

# In[32]:

P = tf.squeeze(get_feature_rep('content', content_layer_names, content_model))

# In[34]:

F = tf.squeeze(get_feature_rep('content', content_layer_names, train_model))

# In[18]:

# Each member of As is the Gram matrix of one style layer (dim. channels x channels)
As = get_feature_rep('style', style_layer_names, style_model)

# In[19]:

Gs = get_feature_rep('style', style_layer_names, train_model)

# In[20]:

styleloss = style_loss(Gs, As)

# In[21]:

contentloss = content_loss(P, F)

# In[22]:

total_loss = tf.add(styleloss, tf.multiply(tf.constant(0.01, tf.float64), contentloss))


# In[23]:

optimizer = tf.train.AdamOptimizer(5).minimize(total_loss, var_list=[o_input_var])

# In[26]:

def reprocess(x):
    # ImageNet channel means in BGR order (matching the output of preprocess_input)
    VGG_MEAN = [103.939, 116.779, 123.68]
    means = tf.reshape(tf.constant(VGG_MEAN, tf.float64), [1, 1, 3])
    # undo the mean subtraction done during preprocessing
    x = tf.add(x, means)
    x = tf.clip_by_value(x, 0, 255)
    # bgr to rgb
    x = x[..., ::-1]
    return x

# In[27]:

saver = tf.train.Saver(tf.global_variables())

# In[28]:

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

#     saver.restore(sess, './model/nst_model.ckpt')

    for epoch in range(100):
        _, styleloss_curr, contentloss_curr, loss_curr, new_arr = sess.run([optimizer, styleloss, contentloss, total_loss, o_input_var])

        print('Epoch: %i    Content Loss: %.2f    Style Loss: %.2f    Total Loss: %.2f' % (epoch, contentloss_curr, styleloss_curr, loss_curr))

        if epoch % 15 == 0:
            saver.save(sess, './model/nst_model.ckpt')

# In[30]:

with tf.Session() as sess:
    new_arr = reprocess(new_arr)
    new_im = sess.run(tf.cast(tf.round(tf.squeeze(new_arr)), tf.uint8))
#     new_im = new_im[...,::-1]
#     print(sess.run(new_arr[0]/255))
    print(sess.run(tf.shape(new_im)))
    plt.imshow(new_im)

Here are plots of the style (blue) and content (red) losses after 150 iterations (6-7 minutes):

Typical implementations are known to converge within 15-20 minutes, with the loss dropping drastically at first. In my case the generated image is still basically colored noise even after 500 iterations.
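For reference, my understanding of the per-layer style loss from the paper, written out as a standalone NumPy check (shapes and values are illustrative only, not taken from my runs):

import numpy as np

N, M = 64, 224 * 224      # channels, pixels per channel (illustrative)
F = np.random.rand(N, M)  # flattened features of the generated image
A = np.random.rand(N, M)  # flattened features of the style image

G_f = F @ F.T             # Gram matrix of the generated image, shape (N, N)
G_a = A @ A.T             # Gram matrix of the style image, shape (N, N)

# per-layer style loss: sum of squared differences / (4 * N^2 * M^2), layer weight 0.2
layer_loss = 0.2 * np.sum((G_f - G_a) ** 2) / (4.0 * N**2 * M**2)
print(layer_loss)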

Tags: neural-style-transfer, keras, tensorflow, implementation, deep-learning
