ResourceExhaustedError: 2 root error(s) found.
2020/03/07 10:21 PM
Computer Vision Deep Learning Discussion Board
陳主德
Views: 25
Answers: 3
Favorites: 0
Tags: cvdl-1, cvdl-1-d49, ResourceExhaustedError
Hello experts,
I am working on the raccoon-and-kangaroo detection project on Colab, but training keeps getting interrupted. I already reduced batch_size from 16 to 8 and it still crashes. Is there any way to fix this? Thanks! My full Colab code is below:
# make sure Colab uses TensorFlow 1.x rather than TensorFlow 2
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)
!pip install keras==2.2.4  # keras 2.2.4 is required
from google.colab import drive
drive.mount('/content/gdrive')  # mount Google Drive in Colab
%cd '/content/gdrive/My Drive'
# Download the Keras-based YOLOv3 code
import os
if not os.path.exists("keras-yolo3"):
    !git clone https://github.com/qqwweee/keras-yolo3
else:
    print("keras-yolo3 exists")
%cd keras-yolo3
# model_data/yolo.h5: model & weights
# Download the YOLOv3 network weights and convert them into a format Keras can read
if not os.path.exists("model_data/yolo.h5"):
    # download the YOLOv3 network weights
    print("Model doesn't exist, downloading...")
    os.system("wget https://pjreddie.com/media/files/yolov3.weights")
    # convert the weights into a Keras-readable format (.h5)
    print("Converting yolov3.weights to yolo.h5...")
    os.system("python convert.py yolov3.cfg yolov3.weights model_data/yolo.h5")
else:
    print("Model exists")
# Download the raccoon and kangaroo datasets
if not os.path.exists("raccoon_dataset"):
    !git clone https://github.com/experiencor/raccoon_dataset.git  # raccoon dataset
else:
    print("raccoon_dataset exists")
if not os.path.exists("kangaroo"):
    !git clone https://github.com/experiencor/kangaroo.git  # kangaroo dataset
else:
    print("kangaroo exists")
import numpy as np
# Build the annotation files needed for training; if the conversion has already
# been done, skip this block instead of rerunning it every time
if not os.path.exists("train_labels.txt"):
    import xml.etree.ElementTree as ET  # library for parsing xml files
    sets = ['train', 'val']
    # the object classes: "raccoon" and "kangaroo"
    classes = ["raccoon", "kangaroo"]

    # convert one annotation (.xml) into the format needed for training
    def convert_annotation(image_id, list_file):
        in_file = open('annotation_xml/%s.xml' % (image_id))
        tree = ET.parse(in_file)
        root = tree.getroot()
        for obj in root.iter('object'):
            difficult = obj.find('difficult').text
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)  # class index
            xmlbox = obj.find('bndbox')
            b = (int(xmlbox.find('xmin').text), int(xmlbox.find('ymin').text),
                 int(xmlbox.find('xmax').text), int(xmlbox.find('ymax').text))
            list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
    # merge the files under raccoon_dataset/images and kangaroo/images
    # into one pool for the training & validation sets
    for root, dirs, files in os.walk('raccoon_dataset/images'):
        print('number of raccoon jpg files:', len(files))
    for root_2, dirs_2, files_2 in os.walk('kangaroo/images'):
        print('number of kangaroo jpg files:', len(files_2))
    # append files_2 to the files list
    files.extend(files_2)
    print('total number of jpg files:', len(files))
    # concatenate the filenames and split on '.jpg' to recover the ids without extensions
    jpg_ids = ''.join(files).strip().split('.jpg')[:-1]
    # 80% of the files become the training set
    train_index = np.random.choice(jpg_ids, size=int(len(jpg_ids) * 0.8), replace=False)
    val_index = np.setdiff1d(jpg_ids, train_index)
    !mkdir train val
    # write the training-set file ids into the train folder
    train_txt = open('train/train.txt', 'w')
    print("save train index at train/train.txt")
    for train_id in train_index:
        train_txt.write('%s' % (train_id))
        train_txt.write('\n')
    train_txt.close()
    # write the validation-set file ids into the val folder
    val_txt = open('val/val.txt', 'w')
    print("save val index at val/val.txt")
    for val_id in val_index:
        val_txt.write('%s' % (val_id))
        val_txt.write('\n')
    val_txt.close()
    # copy the annotations (.xml) into the annotation_xml folder
    !mkdir annotation_xml
    !cp raccoon_dataset/annotations/*.xml ./annotation_xml
    !cp kangaroo/annots/*.xml ./annotation_xml
    # write the class names into class.txt
    class_txt = open('class.txt', 'w')
    print("save class at class.txt")
    for class_id in classes:
        class_txt.write('%s' % (class_id))
        class_txt.write('\n')
    class_txt.close()
    for image_set in sets:
        image_ids = open('%s/%s.txt' % (image_set, image_set)).read().strip().split()
        annotation_path = '%s_labels.txt' % (image_set)
        list_file = open(annotation_path, 'w')
        print("save annotation at %s" % annotation_path)
        # process the training-set & validation-set files
        for image_id in image_ids:
            if 'raccoon' in image_id:
                list_file.write('./raccoon_dataset/images/%s.jpg' % (image_id))
            else:
                list_file.write('./kangaroo/images/%s.jpg' % (image_id))
            convert_annotation(image_id, list_file)
            list_file.write('\n')
        list_file.close()
# import the packages needed by train.py
import numpy as np
import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
from train import get_classes, get_anchors, create_model, create_tiny_model, data_generator, data_generator_wrapper

# added because an error (truncated image files) occurred during training:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# convert.py '-w': convert only the weights, writing them to model_data/yolo_weights.h5
if not os.path.exists("model_data/yolo_weights.h5"):
    print("Converting pretrained YOLOv3 weights for training")
    os.system("python convert.py -w yolov3.cfg yolov3.weights model_data/yolo_weights.h5")
else:
    print("Pretrained weights exist")
annotation_path_train = 'train_labels.txt'  # converted train annotation file
annotation_path_val = 'val_labels.txt'  # converted val annotation file
log_dir = 'logs/000/'  # path where the trained models are saved
classes_path = 'class.txt'
anchors_path = 'model_data/yolo_anchors.txt'
class_names = get_classes(classes_path)
num_classes = len(class_names)
anchors = get_anchors(anchors_path)
input_shape = (416, 416)  # multiple of 32, hw
is_tiny_version = len(anchors) == 6  # default setting
if is_tiny_version:
    model = create_tiny_model(input_shape, anchors, num_classes,
        freeze_body=2, weights_path='model_data/tiny_yolo_weights.h5')
else:
    model = create_model(input_shape, anchors, num_classes,
        freeze_body=2, weights_path='model_data/yolo_weights.h5')  # make sure you know what you freeze
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss', save_weights_only=True, save_best_only=True, period=30)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
with open(annotation_path_train) as f:
    lines_train = f.readlines()
with open(annotation_path_val) as f:
    lines_val = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines_train)
np.random.shuffle(lines_val)
np.random.seed(None)
num_train = len(lines_train)  # training data (80%)
num_val = len(lines_val)  # validation data (20%)
# First stage: freeze the darknet53 backbone (everything except the output layers) and train
if True:
    model.compile(optimizer=Adam(lr=1e-3), loss={
        # use custom yolo_loss Lambda layer.
        'yolo_loss': lambda y_true, y_pred: y_pred})
    batch_size = 8
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    model_1 = model.fit_generator(data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=50,
        initial_epoch=0,
        callbacks=[logging, checkpoint, reduce_lr])
    model.save_weights(log_dir + 'trained_weights_stage_1.h5')
# Unfreeze and continue training, to fine-tune.
if True:
    # make every layer trainable
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred})  # recompile to apply the change
    print('Unfreeze all of the layers.')
    batch_size = 8  # note that more GPU memory is required after unfreezing the body
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    hist_model = model.fit_generator(data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=100,
        initial_epoch=50,
        callbacks=[logging, checkpoint, reduce_lr, early_stopping])
    model.save_weights(log_dir + 'trained_weights_final.h5')
Answers
-
2020/03/08 12:36 AM · Jeffrey · Upvotes: 0 · Downvotes: 0 · Comments: 3
Is all of this code running on Colab? If so, could you try turning the callbacks off and running it again?
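A minimal sketch of this suggestion, reusing the variables already defined in the notebook above (model, lines_train, lines_val, input_shape, anchors, num_classes, num_train, num_val are all assumed to exist); the only change from the original first stage is the empty callbacks list:

# hypothetical re-run of the first training stage with callbacks disabled,
# as suggested above; all variables come from the asker's notebook
model.compile(optimizer=Adam(lr=1e-3),
              loss={'yolo_loss': lambda y_true, y_pred: y_pred})
batch_size = 8
model.fit_generator(
    data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
    steps_per_epoch=max(1, num_train // batch_size),
    validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
    validation_steps=max(1, num_val // batch_size),
    epochs=50,
    initial_epoch=0,
    callbacks=[])  # no TensorBoard/ModelCheckpoint/ReduceLROnPlateau while debugging

If training then gets past the point where it previously crashed, one of the callbacks (for example, ModelCheckpoint writing to Drive) would be the likely culprit.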
-
2020/03/08 08:40 PM · 胡連福 · Upvotes: 2 · Downvotes: 0 · Comments: 3
It looks like resources start being exhausted at epoch 51, i.e. right after all layers are unfrozen, which consumes much more memory. I suggest you check:
1. Have your earlier training runs on Colab already tied up too many resources? Shut the runtime down, reconnect to Colab, and train again. (See the sketch after this list.)
2. Confirm that yolo_weights.h5 was downloaded and converted correctly.
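A minimal sketch of both checks, to run in a fresh cell after reconnecting; the size threshold below is only a rough assumption (the full YOLOv3 weights convert to roughly 240 MB), not an official figure:

import os

!nvidia-smi  # show current GPU memory usage on the fresh Colab runtime

# sanity-check that the converted weights file exists and is not truncated
weights_path = "model_data/yolo_weights.h5"
if os.path.exists(weights_path):
    size_mb = os.path.getsize(weights_path) / 1e6
    print("yolo_weights.h5 size: %.1f MB" % size_mb)
    if size_mb < 200:  # assumed threshold; a complete file should be far larger
        print("File looks too small -- the download/conversion may have failed.")
else:
    print("yolo_weights.h5 is missing -- rerun convert.py with the -w flag")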
-
2020/03/08 11:58 PM · KennyKang · Upvotes: 1 · Downvotes: 0 · Comments: 1
I ran into this too. After I set batch_size = 4 for the second stage, it ran to completion on Colab. One more reminder: Google Drive only gives you 15 GB. To avoid running out of space, keep an eye on the model checkpoint files saved under log_dir = 'logs/000/' (the path where the trained models are stored) and delete the older epoch checkpoints as you go, so that a full Drive doesn't interfere with the current training run. A sketch of that cleanup follows.
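A minimal sketch of the cleanup, assuming the checkpoint filename pattern from the ModelCheckpoint call in the notebook above (ep{epoch:03d}-loss...-val_loss....h5, whose zero-padded epoch numbers make a plain lexical sort order files by epoch); it removes everything except the newest checkpoint:

import glob, os

# list checkpoints saved by ModelCheckpoint under logs/000/,
# ordered oldest-first thanks to the zero-padded epoch numbers
ckpts = sorted(glob.glob('logs/000/ep*.h5'))

# delete all but the most recent checkpoint to free Google Drive space
for old in ckpts[:-1]:
    print("removing", old)
    os.remove(old)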