Loss plateaus in neural style transfer

I am writing an implementation of neural style transfer by loading a VGG16 model from Keras and using its layers inside a TensorFlow graph.

I am using the Adam optimizer. The loss is decreasing, but very slowly, and it plateaus at about 10^8. Additionally, the colors of the generated image seem to be shifting in the right direction, but the image itself is still clearly noise.

Also, the style loss is huge (on the order of 10^8) whereas the content loss is much smaller (on the order of 10^5). This is odd, because the style transfer paper says to scale the content loss down by a factor of 100 or 1000 when computing the total loss.
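To make the weighting explicit, the total loss I understand the paper to describe looks roughly like this (a minimal sketch with illustrative weights and placeholder names, not the exact values from my notebook):

# content term scaled down by roughly 1000x relative to the style term
style_weight = 1.0
content_weight = 1e-3  # i.e. a content/style weight ratio of about 1/1000
total = style_weight * style_loss_value + content_weight * content_loss_value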

I tried increasing the learning rate, but that only makes the gradient steps overshoot.
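(To be concrete about the overshoot: the alternative I understand people use instead of a larger learning rate is clipping the gradients before applying them. A sketch using the standard TF 1.x compute_gradients/apply_gradients API and the variable names from the code below; the learning rate and clip norm here are arbitrary guesses, not values I have tuned:)

# sketch: clip gradients on the generated image instead of raising the learning rate
opt = tf.train.AdamOptimizer(2.0)  # learning rate is a guess
grads_and_vars = opt.compute_gradients(total_loss, var_list=[o_input_var])
clipped = [(tf.clip_by_norm(g, 10.0), v) for g, v in grads_and_vars]
train_step = opt.apply_gradients(clipped)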

I suspect there must be a bug in my implementation, but despite searching endlessly I have been unable to find what's wrong.

Here's the code:

# coding: utf-8
# In[1]:

from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


content_image_path = './skyline.jpg'
style_image_path = './starry_night.jpg'
output_image_path = './output.jpg'

# In[4]:

from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input

# In[5]:

content_image = image.load_img(content_image_path, target_size=(224, 224))
#plt.imshow(content_image)
content_arr = image.img_to_array(content_image)
content_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(content_arr, axis=0)), tf.float64)
print(content_arr.get_shape())  # sanity-check the shape (no session exists at this point)

# In[6]:

style_image = image.load_img(style_image_path, target_size=(224, 224))
#plt.imshow(style_image)
style_arr = image.img_to_array(style_image)
style_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(style_arr, axis=0)), tf.float64)
print(style_arr.get_shape())  # sanity-check the shape (no session exists at this point)

# In[7]:

#generate random image with pixel values b/w 0 - 255
o_input = np.random.randint(low=0, high=256, size=(224, 224, 3)).astype('float64')
plt.imshow(o_input.astype('uint8'))  # imshow expects uint8 values in 0-255 (or floats in 0-1)
o_input_old = np.copy(o_input)
o_input = preprocess_input(np.expand_dims(o_input, axis=0))
print(o_input_old)

o_input_var = tf.Variable(o_input, name='gen_img_vector', trainable=True)

# In[8]:

content_model = VGG16(include_top=False, weights='imagenet', input_tensor=content_arr, input_shape=(224, 224, 3))
style_model = VGG16(include_top=False, weights='imagenet', input_tensor=style_arr, input_shape=(224, 224, 3))
train_model = VGG16(include_top=False, weights='imagenet', input_tensor=o_input_var, input_shape=(224, 224, 3))

# In[10]:

content_model.summary()

# In[11]:

def get_feature_rep(layer_type, layer_names, model):

    outputs = []
    for name in layer_names:
        out = model.get_layer(name=name).output

        N = tf.shape(out)[3]  # number of channels
        M = tf.multiply(tf.shape(out)[1], tf.shape(out)[2])  # pixels per channel (H * W)

        out = tf.transpose(tf.reshape(out, (M, N)))  # flatten each channel into a row: shape (N, M)
        if layer_type == 'style':
            out = get_gram_matrix(out)
        print(out)
        outputs.append(out)
    return outputs

# In[12]:

def get_gram_matrix(F):
    G = tf.matmul(F, tf.transpose(F))
    return G


# In[13]:


def style_loss(Gs, As):

    total = tf.Variable(tf.constant(0.0, tf.float64), name='style_loss', trainable=False)
    style_reps = list(zip(Gs, As))

    for layer in style_reps:
        loss = tf.reduce_sum(tf.cast(tf.squared_difference(layer[0], layer[1]), tf.float64), [0, 1])
        N_layer = tf.shape(layer[0])[0]
        M_layer = tf.shape(layer[0])[1]
        den = tf.square(tf.cast(tf.multiply(N_layer, M_layer), tf.float64))
        loss = loss/den
        loss = loss*0.2/4.0  # layer weight of 0.2 and the 1/4 factor from the style loss definition
        total = total + loss

    return total


# In[14]:

def content_loss(P, F):
#     loss = tf.Variable(tf.constant(0.0, tf.float64), name=content_loss, trainable=False)
    loss = tf.reduce_sum(tf.cast(tf.squared_difference(P, F), tf.float64), [0, 1])
    loss = loss/2.0
    return loss

# In[15]:

content_layer_names = ['block4_conv2']
style_layer_names = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1']

# In[32]:

P = tf.squeeze(get_feature_rep('content', content_layer_names, content_model))

# In[34]:

F = tf.squeeze(get_feature_rep('content', content_layer_names, train_model))

# In[18]:

# Each member of As is the Gram matrix of one style layer (dim. channels x channels)
As = get_feature_rep('style', style_layer_names, style_model)

# In[19]:

Gs = get_feature_rep('style', style_layer_names, train_model)

# In[20]:

styleloss = style_loss(Gs, As)

# In[21]:

contentloss = content_loss(P, F)

# In[22]:

total_loss = tf.add(styleloss, tf.multiply(tf.constant(0.01, tf.float64), contentloss))


# In[23]:

optimizer = tf.train.AdamOptimizer(5).minimize(total_loss, var_list=[o_input_var])

# In[26]:

def reprocess(x):
    # ImageNet channel means in BGR order (matching the output of preprocess_input)
    VGG_MEAN = [103.939, 116.779, 123.68]
    means = tf.reshape(tf.constant(VGG_MEAN, tf.float64), [1, 1, 3])
    # undo the mean subtraction done during preprocessing
    x = tf.add(x, means)
    x = tf.clip_by_value(x, 0, 255)
    # bgr to rgb
    x = x[..., ::-1]
    return x

# In[27]:

saver = tf.train.Saver(tf.global_variables())

# In[28]:

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

#     saver.restore(sess, './model/nst_model.ckpt')

    for epoch in range(100):
        _, styleloss_curr, contentloss_curr, loss_curr, new_arr = sess.run([optimizer, styleloss, contentloss, total_loss, o_input_var])

        print('Epoch: %i    Content Loss: %.2f    Style Loss: %.2f    Total Loss: %.2f' % (epoch, contentloss_curr, styleloss_curr, loss_curr))

        if epoch % 15 == 0:
            saver.save(sess, './model/nst_model.ckpt')

# In[30]:

with tf.Session() as sess:
    new_arr = reprocess(new_arr)
    new_im = sess.run(tf.cast(tf.round(tf.squeeze(new_arr)), tf.uint8))
#     new_im = new_im[...,::-1]
#     print(sess.run(new_arr[0]/255))
    print(sess.run(tf.shape(new_im)))
    plt.imshow(new_im)

Here are plots of the style (blue) and content (red) losses after 150 iterations (6-7 minutes):

Typical implementations are known to converge within 15-20 minutes, with the loss dropping drastically at first. In my case the generated image is still basically colored noise even after 500 iterations.
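For reference, my understanding of the per-layer style loss from the paper, written out as a standalone NumPy check (shapes and values are illustrative only, not taken from my runs):

import numpy as np

N, M = 64, 224 * 224      # channels, pixels per channel (illustrative)
F = np.random.rand(N, M)  # flattened features of the generated image
A = np.random.rand(N, M)  # flattened features of the style image

G_f = F @ F.T             # Gram matrix of the generated image, shape (N, N)
G_a = A @ A.T             # Gram matrix of the style image, shape (N, N)

# per-layer style loss: sum of squared differences / (4 * N^2 * M^2), layer weight 0.2
layer_loss = 0.2 * np.sum((G_f - G_a) ** 2) / (4.0 * N**2 * M**2)
print(layer_loss)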

Tags: neural-style-transfer, keras, tensorflow, implementation, deep-learning
