ResourceExhaustedError: 2 root error(s) found.
2020/03/07 10:21 PM
Computer Vision Deep Learning Discussion Board
陳主德
Views: 25
Answers: 3
Favorites: 0
Tags: cvdl-1, cvdl-1-d49, ResourceExhaustedError
Hello experts,
I am working on the raccoon-and-kangaroo detection project on Colab, but training keeps getting interrupted. I already reduced batch_size from 16 to 8 and it still crashes. Is there any way to fix this? Thanks! My full Colab code is below:
# make sure Colab uses TensorFlow 1.x rather than TensorFlow 2
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)
!pip install keras==2.2.4  # keras 2.2.4 is required
from google.colab import drive
drive.mount('/content/gdrive')  # mount Google Drive in Colab
%cd '/content/gdrive/My Drive'
# Download the Keras-based YOLOv3 code
import os
if not os.path.exists("keras-yolo3"):
    !git clone https://github.com/qqwweee/keras-yolo3
else:
    print("keras-yolo3 exists")
%cd keras-yolo3
# model_data/yolo.h5: model & weights
# Download the YOLOv3 network weights and convert them into a format Keras can read
if not os.path.exists("model_data/yolo.h5"):
    # download the YOLOv3 network weights
    print("Model doesn't exist, downloading...")
    os.system("wget https://pjreddie.com/media/files/yolov3.weights")
    # convert the weights into a Keras-readable format (.h5)
    print("Converting yolov3.weights to yolo.h5...")
    os.system("python convert.py yolov3.cfg yolov3.weights model_data/yolo.h5")
else:
    print("Model exists")
# Download the raccoon and kangaroo datasets
if not os.path.exists("raccoon_dataset"):
    !git clone https://github.com/experiencor/raccoon_dataset.git  # raccoon dataset
else:
    print("raccoon_dataset exists")
if not os.path.exists("kangaroo"):
    !git clone https://github.com/experiencor/kangaroo.git  # kangaroo dataset
else:
    print("kangaroo exists")
import numpy as np
# Build the annotation files needed for training; if the conversion has already
# been done, skip this block instead of rerunning it every time
if not os.path.exists("train_labels.txt"):
    import xml.etree.ElementTree as ET  # library for parsing xml files
    sets = ['train', 'val']
    # the object classes: "raccoon" and "kangaroo"
    classes = ["raccoon", "kangaroo"]

    # convert one annotation (.xml) into the format needed for training
    def convert_annotation(image_id, list_file):
        in_file = open('annotation_xml/%s.xml' % (image_id))
        tree = ET.parse(in_file)
        root = tree.getroot()
        for obj in root.iter('object'):
            difficult = obj.find('difficult').text
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)  # class index
            xmlbox = obj.find('bndbox')
            b = (int(xmlbox.find('xmin').text), int(xmlbox.find('ymin').text),
                 int(xmlbox.find('xmax').text), int(xmlbox.find('ymax').text))
            list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
    # merge the files under raccoon_dataset/images and kangaroo/images
    # into one pool for the training & validation sets
    for root, dirs, files in os.walk('raccoon_dataset/images'):
        print('number of raccoon jpg files:', len(files))
    for root_2, dirs_2, files_2 in os.walk('kangaroo/images'):
        print('number of kangaroo jpg files:', len(files_2))
    # append files_2 to the files list
    files.extend(files_2)
    print('total number of jpg files:', len(files))
    # concatenate the filenames and split on '.jpg' to recover the ids without extensions
    jpg_ids = ''.join(files).strip().split('.jpg')[:-1]
    # 80% of the files become the training set
    train_index = np.random.choice(jpg_ids, size=int(len(jpg_ids) * 0.8), replace=False)
    val_index = np.setdiff1d(jpg_ids, train_index)
    !mkdir train val
    # write the training-set file ids into the train folder
    train_txt = open('train/train.txt', 'w')
    print("save train index at train/train.txt")
    for train_id in train_index:
        train_txt.write('%s' % (train_id))
        train_txt.write('\n')
    train_txt.close()
    # write the validation-set file ids into the val folder
    val_txt = open('val/val.txt', 'w')
    print("save val index at val/val.txt")
    for val_id in val_index:
        val_txt.write('%s' % (val_id))
        val_txt.write('\n')
    val_txt.close()
    # copy the annotations (.xml) into the annotation_xml folder
    !mkdir annotation_xml
    !cp raccoon_dataset/annotations/*.xml ./annotation_xml
    !cp kangaroo/annots/*.xml ./annotation_xml
    # write the class names into class.txt
    class_txt = open('class.txt', 'w')
    print("save class at class.txt")
    for class_id in classes:
        class_txt.write('%s' % (class_id))
        class_txt.write('\n')
    class_txt.close()
    for image_set in sets:
        image_ids = open('%s/%s.txt' % (image_set, image_set)).read().strip().split()
        annotation_path = '%s_labels.txt' % (image_set)
        list_file = open(annotation_path, 'w')
        print("save annotation at %s" % annotation_path)
        # process the training-set & validation-set files
        for image_id in image_ids:
            if 'raccoon' in image_id:
                list_file.write('./raccoon_dataset/images/%s.jpg' % (image_id))
            else:
                list_file.write('./kangaroo/images/%s.jpg' % (image_id))
            convert_annotation(image_id, list_file)
            list_file.write('\n')
        list_file.close()
# import the packages needed by train.py
import numpy as np
import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss
from yolo3.utils import get_random_data
from train import get_classes, get_anchors, create_model, create_tiny_model, data_generator, data_generator_wrapper

# added because an error (truncated image files) occurred during training:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# convert.py '-w': convert only the weights, writing them to model_data/yolo_weights.h5
if not os.path.exists("model_data/yolo_weights.h5"):
    print("Converting pretrained YOLOv3 weights for training")
    os.system("python convert.py -w yolov3.cfg yolov3.weights model_data/yolo_weights.h5")
else:
    print("Pretrained weights exist")
annotation_path_train = 'train_labels.txt'  # converted train annotation file
annotation_path_val = 'val_labels.txt'  # converted val annotation file
log_dir = 'logs/000/'  # path where the trained models are saved
classes_path = 'class.txt'
anchors_path = 'model_data/yolo_anchors.txt'
class_names = get_classes(classes_path)
num_classes = len(class_names)
anchors = get_anchors(anchors_path)
input_shape = (416, 416)  # multiple of 32, hw
is_tiny_version = len(anchors) == 6  # default setting
if is_tiny_version:
    model = create_tiny_model(input_shape, anchors, num_classes,
        freeze_body=2, weights_path='model_data/tiny_yolo_weights.h5')
else:
    model = create_model(input_shape, anchors, num_classes,
        freeze_body=2, weights_path='model_data/yolo_weights.h5')  # make sure you know what you freeze
logging = TensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
    monitor='val_loss', save_weights_only=True, save_best_only=True, period=30)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
with open(annotation_path_train) as f:
    lines_train = f.readlines()
with open(annotation_path_val) as f:
    lines_val = f.readlines()
np.random.seed(10101)
np.random.shuffle(lines_train)
np.random.shuffle(lines_val)
np.random.seed(None)
num_train = len(lines_train)  # training data (80%)
num_val = len(lines_val)  # validation data (20%)
# First stage: freeze the darknet53 backbone (everything except the output layers) and train
if True:
    model.compile(optimizer=Adam(lr=1e-3), loss={
        # use custom yolo_loss Lambda layer.
        'yolo_loss': lambda y_true, y_pred: y_pred})
    batch_size = 8
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    model_1 = model.fit_generator(data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=50,
        initial_epoch=0,
        callbacks=[logging, checkpoint, reduce_lr])
    model.save_weights(log_dir + 'trained_weights_stage_1.h5')
# Unfreeze and continue training, to fine-tune.
if True:
    # make every layer trainable
    for i in range(len(model.layers)):
        model.layers[i].trainable = True
    model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred})  # recompile to apply the change
    print('Unfreeze all of the layers.')
    batch_size = 8  # note that more GPU memory is required after unfreezing the body
    print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
    hist_model = model.fit_generator(data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
        steps_per_epoch=max(1, num_train//batch_size),
        validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
        validation_steps=max(1, num_val//batch_size),
        epochs=100,
        initial_epoch=50,
        callbacks=[logging, checkpoint, reduce_lr, early_stopping])
    model.save_weights(log_dir + 'trained_weights_final.h5')
Answers
-
2020/03/08 12:36 AM · Jeffrey · Upvotes: 0 · Downvotes: 0 · Comments: 3
Is all of this code running on Colab? If so, could you try turning the callbacks off and running it again?
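A minimal sketch of this suggestion, reusing the variables already defined in the notebook above (model, lines_train, lines_val, input_shape, anchors, num_classes, num_train, num_val are all assumed to exist); the only change from the original first stage is the empty callbacks list:

# hypothetical re-run of the first training stage with callbacks disabled,
# as suggested above; all variables come from the asker's notebook
model.compile(optimizer=Adam(lr=1e-3),
              loss={'yolo_loss': lambda y_true, y_pred: y_pred})
batch_size = 8
model.fit_generator(
    data_generator_wrapper(lines_train, batch_size, input_shape, anchors, num_classes),
    steps_per_epoch=max(1, num_train // batch_size),
    validation_data=data_generator_wrapper(lines_val, batch_size, input_shape, anchors, num_classes),
    validation_steps=max(1, num_val // batch_size),
    epochs=50,
    initial_epoch=0,
    callbacks=[])  # no TensorBoard/ModelCheckpoint/ReduceLROnPlateau while debugging

If training then gets past the point where it previously crashed, one of the callbacks (for example, ModelCheckpoint writing to Drive) would be the likely culprit.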
-
2020/03/08 08:40 PM · 胡連福 · Upvotes: 2 · Downvotes: 0 · Comments: 3
It looks like resources start being exhausted at epoch 51, i.e. right after all layers are unfrozen, which consumes much more memory. I suggest you check:
1. Have your earlier training runs on Colab already tied up too many resources? Shut the runtime down, reconnect to Colab, and train again. (See the sketch after this list.)
2. Confirm that yolo_weights.h5 was downloaded and converted correctly.
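A minimal sketch of both checks, to run in a fresh cell after reconnecting; the size threshold below is only a rough assumption (the full YOLOv3 weights convert to roughly 240 MB), not an official figure:

import os

!nvidia-smi  # show current GPU memory usage on the fresh Colab runtime

# sanity-check that the converted weights file exists and is not truncated
weights_path = "model_data/yolo_weights.h5"
if os.path.exists(weights_path):
    size_mb = os.path.getsize(weights_path) / 1e6
    print("yolo_weights.h5 size: %.1f MB" % size_mb)
    if size_mb < 200:  # assumed threshold; a complete file should be far larger
        print("File looks too small -- the download/conversion may have failed.")
else:
    print("yolo_weights.h5 is missing -- rerun convert.py with the -w flag")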
-
2020/03/08 11:58 PM · KennyKang · Upvotes: 1 · Downvotes: 0 · Comments: 1
I ran into this too. After I set batch_size = 4 for the second stage, it ran to completion on Colab. One more reminder: Google Drive only gives you 15 GB. To avoid running out of space, keep an eye on the model checkpoint files saved under log_dir = 'logs/000/' (the path where the trained models are stored) and delete the older epoch checkpoints as you go, so that a full Drive doesn't interfere with the current training run. A sketch of that cleanup follows.
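A minimal sketch of the cleanup, assuming the checkpoint filename pattern from the ModelCheckpoint call in the notebook above (ep{epoch:03d}-loss...-val_loss....h5, whose zero-padded epoch numbers make a plain lexical sort order files by epoch); it removes everything except the newest checkpoint:

import glob, os

# list checkpoints saved by ModelCheckpoint under logs/000/,
# ordered oldest-first thanks to the zero-padded epoch numbers
ckpts = sorted(glob.glob('logs/000/ep*.h5'))

# delete all but the most recent checkpoint to free Google Drive space
for old in ckpts[:-1]:
    print("removing", old)
    os.remove(old)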