Hi there,
I'm trying to run the training of a model on the GPU, but I'm encountering a few problems.
Here is my code:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.optimizers import Adam
class HandCNN:
    """Image classifier: a frozen ImageNet-pretrained MobileNetV2 base plus a dense head.

    The trained model is saved to and loaded from ``CONST_MODELS_PATH`` and
    kept in ``self.model`` (``None`` until a model is loaded or trained).
    """

    # Path handed to ``keras.models.load_model`` and ``model.save``.
    CONST_MODELS_PATH = "models"

    def __init__(self, load=False):
        """Create the wrapper, optionally loading a previously saved model.

        Args:
            load: When true, load the model stored at ``CONST_MODELS_PATH``.
        """
        # Always define the attribute so that reading it before train() gives
        # None instead of raising AttributeError.
        self.model = None
        if load:
            self.model = keras.models.load_model(self.CONST_MODELS_PATH)

    def train(self, data_path: str, *, epochs=1, batch_size=32, data_augment=True):
        """Train a new model on the images under ``data_path`` and save it.

        The folder ``data_path`` should contain one folder per class, each one
        containing images of that class. 20% of the images are held out for
        validation.

        Args:
            data_path: Root folder of the dataset (one sub-folder per class).
            epochs: Number of training epochs.
            batch_size: Number of images per batch.
            data_augment: When true, apply random shifts, rotations,
                brightness and zoom changes to the training images.
        """
        img_height = 224
        img_width = 224

        # Arguments shared by the training and validation generators. Both must
        # use the same validation_split so the two subsets stay disjoint.
        base_kwargs = dict(
            preprocessing_function=keras.applications.mobilenet_v2.preprocess_input,
            validation_split=0.2,
        )
        augment_kwargs = {}
        if data_augment:
            augment_kwargs = dict(
                height_shift_range=0.2,
                width_shift_range=0.2,
                rotation_range=20,
                brightness_range=[0.2, 1.0],
                zoom_range=[0.5, 1.0],
            )

        # Only the training images are augmented; validation images are just
        # preprocessed, so validation metrics are measured on undistorted data.
        train_data_gen = ImageDataGenerator(**base_kwargs, **augment_kwargs)
        validation_data_gen = ImageDataGenerator(**base_kwargs)

        # Classes inferred by the sub-folders
        flow_kwargs = dict(
            target_size=(img_height, img_width),
            batch_size=batch_size,
            color_mode='rgb',
            class_mode='categorical',
        )
        train_generator = train_data_gen.flow_from_directory(
            data_path,
            subset='training',
            shuffle=True,
            **flow_kwargs)
        validation_generator = validation_data_gen.flow_from_directory(
            data_path,
            subset='validation',
            shuffle=False,  # Order is irrelevant for evaluation
            **flow_kwargs)

        model = self.get_model(train_generator.num_classes)

        # End epoch when all full batches have been used; never pass 0 steps,
        # which would happen when a subset holds fewer images than one batch.
        steps_per_epoch = max(1, train_generator.n // train_generator.batch_size)
        validation_steps = max(1, validation_generator.n // validation_generator.batch_size)

        model.fit(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            validation_data=validation_generator,
            validation_steps=validation_steps,
            epochs=epochs)

        model.save(self.CONST_MODELS_PATH)
        self.model = model

    @staticmethod
    def get_model(num_classes, learning_rate=0.01):
        """Build and compile the classifier.

        Args:
            num_classes: Number of output classes (size of the softmax layer).
            learning_rate: Learning rate given to the Adam optimizer.

        Returns:
            The compiled Keras model.
        """
        # Note: input is 224x224x3
        base_model = keras.applications.MobileNetV2(weights="imagenet", include_top=False)

        # TODO - try to freeze/not freeze the pretrained part
        for layer in base_model.layers:
            layer.trainable = False

        # Classification head on top of the pooled base features.
        last = base_model.output
        last = GlobalAveragePooling2D()(last)
        last = Dense(1024, activation='relu')(last)
        last = Dense(1024, activation='relu')(last)
        last = Dense(512, activation='relu')(last)
        predictions = Dense(num_classes, activation='softmax')(last)

        model = Model(inputs=base_model.inputs, outputs=predictions)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                      metrics=['accuracy'])
        return model
# Script entry point: check that a GPU is visible, then train on it.
if __name__ == "__main__":
    print("Devices list: " + str(tf.config.experimental.list_physical_devices('GPU')))
    # Log the device every op is placed on, to check GPU usage.
    tf.debugging.set_log_device_placement(True)

    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found, device name: ' + device_name)
    print('Found GPU at: {}'.format(device_name))

    with tf.device('/device:GPU:0'):
        handCNN = HandCNN(load=False)
        handCNN.train("/floyd/input/tinyhands/carlos_r/")
I'm running the job with the following command:
floyd run --gpu --data lucamoro/datasets/tinyhands/1:tinyhands --env keras --follow "python hand_classifier/hand_cnn.py"
I believe the training is not running on the GPU because:
- Training takes more time than it takes on my CPU (i7 9850HQ)
- These logs suggest that some operations could not be placed on the GPU:
2020-11-16 07:44:45,079 INFO - 2020-11-16 15:44:45.077931: I tensorflow/core/common_runtime/eager/execute.cc:573] Executing op _inferencedistributed_function_11849 in device /job:localhost/replica:0/task:0/device:GPU:0
2020-11-16 07:44:45,104 INFO - 2020-11-16 15:44:45.104300: I tensorflow/core/common_runtime/colocation_graph.cc:254] Ignoring device specification /job:localhost/replica:0/task:0/device:GPU:0 for node 'IteratorGetNext' because the input edge from 'input_iterator' is a reference connection and already has a device field set to /job:localhost/replica:0/task:0/device:CPU:0
2020-11-16 07:44:45,111 INFO - 1/76 [..............................] - ETA: 12:24 - loss: 2.0618 - accuracy: 0.0938
2/76 [..............................] - ETA: 6:10 - loss: 73.3607 - accuracy: 0.0938
3/76 [>.............................] - ETA: 4:05 - loss: 54.4771 - accuracy: 0.1146
4/76 [>.............................] - ETA: 3:03 - loss: 42.1837 - accuracy: 0.1250
5/76 [>.............................] - ETA: 2:25 - loss: 34.3630 - accuracy: 0.1125
6/76 [=>............................] - ETA: 2:00 - loss: 28.9858 - accuracy: 0.1250
7/76 [=>............................] - ETA: 1:45 - loss: 25.1179 - accuracy: 0.1339
8/76 [==>...........................] - ETA: 1:36 - loss: 22.2581 - accuracy: 0.1445
9/76 [==>...........................] - ETA: 1:29 - loss: 20.0070 - accuracy: 0.1458
10/76 [==>...........................] - ETA: 1:23 - loss: 18.2018 - accuracy: 0.1469
11/76 [===>..........................] - ETA: 1:18 - loss: 16.7198 - accuracy: 0.1449
12/76 [===>..........................] - ETA: 1:14 - loss: 15.4853 - accuracy: 0.1432
13/76 [====>.........................] - ETA: 1:10 - loss: 14.4430 - accuracy: 0.1442
14/76 [====>.........................] - ETA: 1:07 - loss: 13.5544 - accuracy: 0.1429
15/76 [====>.........................] - ETA: 1:04 - loss: 12.7885 - accuracy: 0.1396
16/76 [=====>........................] - ETA: 1:02 - loss: 12.1145 - accuracy: 0.1406
17/76 [=====>........................] - ETA: 59s - loss: 11.5307 - accuracy: 0.1507
18/76 [======>.......................] - ETA: 57s - loss: 10.9973 - accuracy: 0.1528
19/76 [======>.......................] - ETA: 55s - loss: 10.5185 - accuracy: 0.1546
20/76 [======>.......................] - ETA: 53s - loss: 10.0993 - accuracy: 0.1500
21/76 [=======>......................] - ETA: 52s - loss: 9.7132 - accuracy: 0.1562
22/76 [=======>......................] - ETA: 50s - loss: 9.3594 - accuracy: 0.1619
23/76 [========>.....................] - ETA: 48s - loss: 9.0358 - accuracy: 0.1562
24/76 [========>.....................] - ETA: 47s - loss: 8.7385 - accuracy: 0.1562
25/76 [========>.....................] - ETA: 45s - loss: 8.4641 - accuracy: 0.1575
26/76 [=========>....................] - ETA: 44s - loss: 8.2082 - accuracy: 0.1671
27/76 [=========>....................] - ETA: 43s - loss: 7.9761 - accuracy: 0.1620
28/76 [==========>...................] - ETA: 41s - loss: 7.7562 - accuracy: 0.1663
29/76 [==========>...................] - ETA: 40s - loss: 7.5495 - accuracy: 0.1724
30/76 [==========>...................] - ETA: 39s - loss: 7.3599 - accuracy: 0.1708
31/76 [===========>..................] - ETA: 38s - loss: 7.1766 - accuracy: 0.1734
32/76 [===========>..................] - ETA: 37s - loss: 7.0100 - accuracy: 0.1768
33/76 [============>.................] - ETA: 35s - loss: 6.8457 - accuracy: 0.1818
34/76 [============>.................] - ETA: 34s - loss: 6.6937 - accuracy: 0.1838
35/76 [============>.................] - ETA: 33s - loss: 6.5543 - accuracy: 0.1830
36/76 [=============>................] - ETA: 32s - loss: 6.4155 - accuracy: 0.1918
37/76 [=============>................] - ETA: 31s - loss: 6.2826 - accuracy: 0.1951
38/76 [==============>...............] - ETA: 30s - loss: 6.1636 - accuracy: 0.1963
39/76 [==============>...............] - ETA: 29s - loss: 6.0448 - accuracy: 0.2010
40/76 [==============>...............] - ETA: 28s - loss: 5.9389 - accuracy: 0.2006
41/76 [===============>..............] - ETA: 27s - loss: 5.8374 - accuracy: 0.2026
42/76 [===============>..............] - ETA: 26s - loss: 5.7390 - accuracy: 0.2060
43/76 [===============>..............] - ETA: 25s - loss: 5.6431 - accuracy: 0.2122
44/76 [================>.............] - ETA: 25s - loss: 5.5546 - accuracy: 0.2109
45/76 [================>.............] - ETA: 24s - loss: 5.4715 - accuracy: 0.2111
46/76 [=================>............] - ETA: 23s - loss: 5.3889 - accuracy: 0.2126
47/76 [=================>............] - ETA: 22s - loss: 5.3054 - accuracy: 0.2181
48/76 [=================>............] - ETA: 21s - loss: 5.2276 - accuracy: 0.2188
49/76 [==================>...........] - ETA: 20s - loss: 5.1561 - accuracy: 0.2201
50/76 [==================>...........] - ETA: 19s - loss: 5.0870 - accuracy: 0.2207
51/76 [===================>..........] - ETA: 19s - loss: 5.0156 - accuracy: 0.2244
52/76 [===================>..........] - ETA: 18s - loss: 4.9509 - accuracy: 0.2261
53/76 [===================>..........] - ETA: 17s - loss: 4.8865 - accuracy: 0.2295
54/76 [====================>.........] - ETA: 16s - loss: 4.8234 - accuracy: 0.2345
55/76 [====================>.........] - ETA: 15s - loss: 4.7622 - accuracy: 0.2411
56/76 [=====================>........] - ETA: 15s - loss: 4.7056 - accuracy: 0.2407
57/76 [=====================>........] - ETA: 14s - loss: 4.6500 - accuracy: 0.2431
58/76 [=====================>........] - ETA: 13s - loss: 4.5957 - accuracy: 0.2459
59/76 [======================>.......] - ETA: 12s - loss: 4.5399 - accuracy: 0.2508
60/76 [======================>.......] - ETA: 11s - loss: 4.4897 - accuracy: 0.2545
61/76 [=======================>......] - ETA: 11s - loss: 4.4351 - accuracy: 0.2595
62/76 [=======================>......] - ETA: 10s - loss: 4.3825 - accuracy: 0.2650
63/76 [=======================>......] - ETA: 9s - loss: 4.3335 - accuracy: 0.2687
64/76 [========================>.....] - ETA: 8s - loss: 4.2865 - accuracy: 0.2709
65/76 [========================>.....] - ETA: 8s - loss: 4.2406 - accuracy: 0.2749
66/76 [=========================>....] - ETA: 7s - loss: 4.1987 - accuracy: 0.2765
67/76 [=========================>....] - ETA: 6s - loss: 4.1548 - accuracy: 0.2779
68/76 [=========================>....] - ETA: 5s - loss: 4.1100 - accuracy: 0.2821
69/76 [==========================>...] - ETA: 5s - loss: 4.0851 - accuracy: 0.2812
70/76 [==========================>...] - ETA: 4s - loss: 4.0493 - accuracy: 0.2830
71/76 [===========================>..] - ETA: 3s - loss: 4.0151 - accuracy: 0.2826
72/76 [===========================>..] - ETA: 2s - loss: 3.9849 - accuracy: 0.2830
73/76 [===========================>..] - ETA: 2s - loss: 3.9497 - accuracy: 0.2855
74/76 [============================>.] - ETA: 1s - loss: 3.9188 - accuracy: 0.2867
75/76 [============================>.] - ETA: 0s - loss: 3.8845 - accuracy: 0.2896input_iterator: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2020-11-16 07:44:45,111 INFO - input_iterator_1: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2020-11-16 07:44:45,112 INFO - model_conv1_conv2d_readvariableop_resource: (_Arg): /job:localhost/replica:0/task:0/device:GPU:0
Can anyone help me?