[备注:我的另一篇博文对这个Imagenet的数据处理过程有更新,请参考:
<https://blog.csdn.net/gzroy/article/details/85954329>]

最近想以Imagenet 2012图像分类大赛的数据来进行训练和测试,看看如何能利用这么大量的图像数据来完善卷积神经网络模型。之前做的基于Cifar10的数据量还是太小了,类别也不够多。Imagenet的数据总共有146G,共包含了1000个类别的图像,总共120万张图片。Tensorflow的官方模型库中的很多模型也是以Imagenet来训练的。


首先要去Imagenet的官网下载数据集,在下载前要用后缀为edu的邮箱注册并审批通过之后才能获取下载地址。不过也可以在网上搜索一下直接查下载地址即可,不用注册。

数据下载之后,是一个大的Tar包,里面包含了1000个图像类别的Tar包。把它们全部解压之后,就可以编写程序来处理了。


我的思路是,先把这1000个文件夹的所有文件的文件名都读取出来,并构建一个字典来表示文件夹名称和图像类别的对应关系,把这些文件名和类别保存在一个CSV文件中。程序代码如下:
# Build "<image path>,<label>" rows for every ImageNet image and split them
# into train / validation / test CSV files.
import os
import random


def build_labels_dict(data_dir='data/'):
    """Map every class sub-directory of *data_dir* to an integer label.

    The directory names are sorted before numbering so that the mapping is
    the same on every run and every machine (``os.listdir`` returns entries
    in arbitrary order). Entries that are not directories are ignored.

    Args:
        data_dir: directory that holds one sub-directory per class.

    Returns:
        dict mapping directory name -> label in ``0 .. number_of_classes - 1``.
    """
    classes = sorted(
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    )
    return {name: index for index, name in enumerate(classes)}


def build_images_labels_list(data_dir='data/'):
    """Return one ``"<image path>,<label>\\n"`` row per image under *data_dir*.

    Paths are built with ``/`` separators, e.g. ``data/<class dir>/<file>``,
    and rows are ordered by class directory and then by file name.

    Args:
        data_dir: directory that holds one sub-directory per class.

    Returns:
        list of CSV rows (strings ending in a newline).
    """
    labels_dict = build_labels_dict(data_dir)
    root = data_dir.rstrip('/')
    images_labels_list = []
    for class_name in sorted(labels_dict):
        path = root + '/' + class_name + '/'
        label = str(labels_dict[class_name])
        for image_file in sorted(os.listdir(path)):
            images_labels_list.append(path + image_file + ',' + label + '\n')
    return images_labels_list


def split_images_labels(images_labels_list, train_end=0.8, valid_end=0.9):
    """Cut *images_labels_list* into (train, valid, test) slices.

    Args:
        images_labels_list: rows to split, already in their final order.
        train_end: fraction of the rows at which the training slice ends.
        valid_end: fraction of the rows at which the validation slice ends;
            everything after it is the test slice.

    Returns:
        tuple ``(train_rows, valid_rows, test_rows)``.
    """
    num = len(images_labels_list)
    first_cut = int(num * train_end)
    second_cut = int(num * valid_end)
    return (
        images_labels_list[:first_cut],
        images_labels_list[first_cut:second_cut],
        images_labels_list[second_cut:],
    )


if __name__ == '__main__':
    # The ImageNet images live under data/, one sub-directory per class.
    images_labels_list = build_images_labels_list('data/')

    # Shuffle, then use 80% for training, 10% for validation, 10% for testing.
    random.shuffle(images_labels_list)
    train_rows, valid_rows, test_rows = split_images_labels(images_labels_list)

    with open('imagenet_train.csv', 'w') as file:
        file.writelines(train_rows)
    with open('imagenet_valid.csv', 'w') as file:
        file.writelines(valid_rows)
    with open('imagenet_test.csv', 'w') as file:
        file.writelines(test_rows)
程序运行后,我们有3个CSV文件,分别对应训练集,验证集和测试集。CSV文件有两列,一列是图片的文件名,另一列是类名。


之后我们就可以在Tensorflow里面构建输入数据了。有两种方法,一种是直接读取CSV的数据来构建Dataset,然后对每一条数据用Map函数来进行图像的解码。这种方法最简单,但是在实际训练中,因为涉及到大量小文件的读取和解码,效率并不太高。程序代码如下:
# Map function applied to every (filename, label) element of the Dataset.
def _parse_function(filename, label):
    """Read one JPEG file and turn it into an augmented training image.

    Args:
        filename: string tensor holding the path of the JPEG file to read.
        label: label of the image; returned unchanged.

    Returns:
        Tuple ``(image, label)`` where ``image`` is the randomly resized,
        flipped, cropped and standardized picture, transposed with
        ``perm=[2, 0, 1]``.
    """
    raw_bytes = tf.read_file(filename)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    height = tf.shape(decoded)[0]
    width = tf.shape(decoded)[1]

    # Following the training-image recipe of the ResNet paper: rescale the
    # shorter side to a random size drawn with minval=256 / maxval=481, then
    # take a random crop (224x224 in the paper; here the crop size comes from
    # imageCropHeight / imageCropWidth / imageDepth defined elsewhere).
    short_side = tf.random_uniform(
        [1], minval=256, maxval=481, dtype=tf.int32)[0]

    def _height_is_short_side():
        # Height becomes short_side; width keeps the aspect ratio.
        scaled_width = tf.multiply(
            tf.cast(width, tf.float64), tf.divide(short_side, height))
        return short_side, tf.cast(scaled_width, tf.int32)

    def _width_is_short_side():
        # Width becomes short_side; height keeps the aspect ratio.
        scaled_height = tf.multiply(
            tf.cast(height, tf.float64), tf.divide(short_side, width))
        return tf.cast(scaled_height, tf.int32), short_side

    new_height, new_width = tf.cond(
        height < width, _height_is_short_side, _width_is_short_side)

    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize_images(as_float, [new_height, new_width])
    flipped = tf.image.random_flip_left_right(resized)
    cropped = tf.random_crop(
        flipped, [imageCropHeight, imageCropWidth, imageDepth])

    # NOTE(review): max_delta=63 looks like it was chosen for 0-255 pixel
    # values, while the image was already converted to float32 above -
    # confirm this is the intended strength.
    augmented = tf.image.random_brightness(cropped, max_delta=63)
    augmented = tf.image.random_contrast(augmented, lower=0.2, upper=1.8)
    augmented = tf.image.per_image_standardization(augmented)

    # Move the last axis of the crop to the front.
    augmented = tf.transpose(augmented, perm=[2, 0, 1])
    return augmented, label
# Build the CSV-backed Datasets; the whole input pipeline is placed on the CPU.
with tf.device('/cpu:0'):
    filename_train = ["imagenet_train.csv"]
    filename_valid = ["imagenet_valid.csv"]
    #filename_test = ["imagenet_test.csv"]

    # Column types of the CSV files: image path, then integer label.
    record_defaults = [tf.string, tf.int32]

    # Training pipeline: decode/augment in parallel, 10 passes over the data.
    dataset_train = (
        tf.contrib.data.CsvDataset(filename_train, record_defaults)
        .map(_parse_function, num_parallel_calls=4)
        .repeat(10)
        .batch(batch_size)
        .prefetch(batch_size)
    )

    # Validation pipeline: a single pass, parsed by _parse_test_function
    # (defined elsewhere).
    dataset_valid = (
        tf.contrib.data.CsvDataset(filename_valid, record_defaults)
        .map(_parse_test_function, num_parallel_calls=2)
        .batch(batch_size)
    )

    # Test pipeline, currently disabled:
    #dataset_test = (
    #    tf.contrib.data.CsvDataset(filename_test, record_defaults)
    #    .map(_parse_function, num_parallel_calls=2)
    #    .batch(batch_size)
    #)

    # One re-initializable iterator shared by the datasets; run the matching
    # *_init_op to point it at the training or the validation data.
    iterator = tf.data.Iterator.from_structure(
        dataset_train.output_types, dataset_train.output_shapes)
    next_images, next_labels = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)
    valid_init_op = iterator.make_initializer(dataset_valid)
    #test_init_op = iterator.make_initializer(dataset_test)

第二种方法是把这些小图片转换为TFRECORD格式,大概每1000张小图片合并为一个TFRECORD文件,这样可以减少频繁读取小文件带来的开销,提高之后训练的速度。另外,为了提高效率,可以开启多个进程来同时进行TFRECORD文件的生成。我的CPU有4个核心,因此起了4个进程来同时处理。代码如下:
import os
import sys
import time
from multiprocessing import Process, Queue

try:  # Python 3
    from queue import Empty
except ImportError:  # Python 2
    from Queue import Empty

import tensorflow as tf
# Convert one image and its label into the TFRecord example format.
def make_example(image, label):
    """Wrap an image and its label in a ``tf.train.Example``.

    Args:
        image: value stored as the single entry of the 'image' bytes feature.
        label: value stored as the single entry of the 'label' int64 feature.

    Returns:
        A ``tf.train.Example`` with the two features 'image' and 'label'.
    """
    image_feature = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[image]))
    label_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=[label]))
    features = tf.train.Features(
        feature={'image': image_feature, 'label': label_feature})
    return tf.train.Example(features=features)
# Maximum number of records written to one TFRecord file.
max_num = 1000


def _tfrecord_path(pid, index):
    """Return the path of the index-th TFRecord file written by process pid."""
    return "tfdata/" + str(pid) + "_" + str(index) + ".tfrecord"


def gen_tfrecord(trainrecords, queue):
    """Write the images listed in *trainrecords* into TFRecord files.

    Runs in a worker process. A new output file is started after every
    ``max_num`` records, and after every 100 records the tuple
    ``(pid, records_done)`` is put on *queue* to report progress.

    Args:
        trainrecords: list of ``"<image path>,<label>"`` lines.
        queue: queue used to send progress messages to the parent process.
    """
    pid = os.getpid()
    total_num = len(trainrecords)
    tfrecords_file_num = 1
    writer = tf.python_io.TFRecordWriter(
        _tfrecord_path(pid, tfrecords_file_num))
    try:
        for file_num, record in enumerate(trainrecords, 1):
            # Split on the last comma only, so a comma inside the path
            # cannot be mistaken for the field separator.
            image_path, label_text = record.strip('\n').rsplit(',', 1)
            with open(image_path, 'rb') as jpgfile:
                img = jpgfile.read()
            ex = make_example(img, int(label_text))
            writer.write(ex.SerializeToString())
            # Report progress to the parent every 100 records.
            if file_num % 100 == 0:
                queue.put((pid, file_num))
            # Roll over to a new file, unless this was the last record.
            if file_num % max_num == 0 and file_num < total_num:
                writer.close()
                tfrecords_file_num += 1
                writer = tf.python_io.TFRecordWriter(
                    _tfrecord_path(pid, tfrecords_file_num))
    finally:
        # Close the current file even when reading an image fails.
        writer.close()


def _split_records(records, parts):
    """Split *records* into *parts* consecutive chunks.

    The first ``parts - 1`` chunks have equal size; the last chunk also
    receives the remainder.
    """
    each = int(len(records) / float(parts))
    chunks = [records[i * each:(i + 1) * each] for i in range(parts - 1)]
    chunks.append(records[(parts - 1) * each:])
    return chunks


def _drain_latest(queue, last):
    """Empty *queue* without blocking and return its newest message.

    Returns *last* unchanged when no message is waiting.
    """
    try:
        while True:
            last = queue.get_nowait()
    except Empty:
        return last


def _format_progress(latest, chunks):
    """Build the one-line progress report shown by the parent process."""
    parts = [
        'PID:%i Processing:%i/%i' % (pid, done, len(chunk))
        for (pid, done), chunk in zip(latest, chunks)
    ]
    return ' | '.join(parts) + ' \r'


def generate_tfrecords(trainrecords, num_processes=4, poll_seconds=10):
    """Convert *trainrecords* to TFRecord files using several processes.

    Args:
        trainrecords: list of ``"<image path>,<label>"`` lines.
        num_processes: number of worker processes; each handles one
            consecutive chunk of the records.
        poll_seconds: seconds between two progress updates.

    Raises:
        RuntimeError: if a worker process exits with a non-zero exit code.
    """
    chunks = _split_records(trainrecords, num_processes)
    queues = [Queue() for _ in chunks]
    workers = [
        Process(target=gen_tfrecord, args=(chunk, progress_queue))
        for chunk, progress_queue in zip(chunks, queues)
    ]
    # An explicit loop is required: on Python 3 ``map`` is lazy, so
    # ``map(Process.start, workers)`` would never start anything.
    for worker in workers:
        worker.start()

    latest = [(worker.pid, 0) for worker in workers]
    while True:
        # Check liveness before draining so the final messages of a worker
        # that has just finished are still picked up.
        still_running = any(worker.is_alive() for worker in workers)
        latest = [
            _drain_latest(progress_queue, last)
            for progress_queue, last in zip(queues, latest)
        ]
        sys.stdout.write(_format_progress(latest, chunks))
        sys.stdout.flush()
        if not still_running:
            break
        time.sleep(poll_seconds)
    sys.stdout.write('\n')

    for worker in workers:
        worker.join()
    failed = [worker.pid for worker in workers if worker.exitcode != 0]
    if failed:
        raise RuntimeError(
            'TFRecord worker process(es) failed: %s' % failed)


# The guard keeps worker processes from re-running the driver when the
# platform starts them by re-importing this module.
if __name__ == '__main__':
    # Read the CSV file with the training image paths and labels.
    with open('imagenet_train.csv', 'r') as trainfile:
        trainrecords = trainfile.readlines()

    if not os.path.isdir('tfdata'):
        os.makedirs('tfdata')

    # The CPU has 4 cores, so the records are split over 4 processes.
    generate_tfrecords(trainrecords, num_processes=4)

    # Build the Dataset from the generated TFRecord files.
    with tf.device('/cpu:0'):
        train_files_names = os.listdir('tfdata/')
        train_files = ['tfdata/' + item for item in train_files_names]
        #train_files = ['testrecords.tfrecord']
        dataset_train = tf.data.TFRecordDataset(train_files)
        # NOTE(review): the elements here are serialized records, so the
        # _parse_function used at this point presumably has to decode the
        # 'image' / 'label' features written by make_example; that version
        # is not shown here - confirm.
        dataset_train = dataset_train.map(_parse_function, num_parallel_calls=4)
        dataset_train = dataset_train.repeat(10)
        dataset_train = dataset_train.batch(batch_size)
        # NOTE(review): prefetch is applied after batch, so this buffers up
        # to batch_size whole batches - confirm that is intended.
        dataset_train = dataset_train.prefetch(batch_size)
        iterator = tf.data.Iterator.from_structure(
            dataset_train.output_types, dataset_train.output_shapes)
        next_images, next_labels = iterator.get_next()
        train_init_op = iterator.make_initializer(dataset_train)
2种方法的性能差异还是很大的。在我的Intel I3 CPU+1070Ti 8G+16G RAM的配置下,Batch Size是64,
采用方法一,每个Batch的训练时间是80秒,方法二的训练时间是40秒,因此方法二对性能的提高还是很大的。