5-1数据管道Dataset——eat_tensorflow2_in_30_days

5-1数据管道Dataset

如果需要训练的数据大小不大，例如不到1G，那么可以直接全部读入内存中进行训练，这样一般效率最高。但如果需要训练的数据很大，例如超过10G，无法一次载入内存，那么通常需要在训练的过程中分批逐渐读入。

使用 tf.data API 可以构建数据输入管道，轻松处理大量的数据，不同的数据格式，以及不同的数据转换。

构建数据管道#

可以从 Numpy array, Pandas DataFrame, Python generator, csv文件, 文本文件, 文件路径, tfrecords文件等方式构建数据管道。

其中通过Numpy array, Pandas DataFrame, 文件路径构建数据管道是最常用的方法。

通过tfrecords文件方式构建数据管道较为复杂，需要对样本构建tf.Example后压缩成字符串写到tfrecords文件，读取后再解析成tf.Example。

但tfrecords文件的优点是压缩后文件较小，便于网络传播，加载速度较快。

从Numpy array构建数据管道

# 从Numpy ndarray构建数据管道
import tensorflow as tf
import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
ds1 = tf.data.Dataset.from_tensor_slices((iris['data'], iris['target']))
for features, label in ds1.take(5):
    print(features, label)
    
"""
tf.Tensor([5.1 3.5 1.4 0.2], shape=(4,), dtype=float64) tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor([4.9 3.  1.4 0.2], shape=(4,), dtype=float64) tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor([4.7 3.2 1.3 0.2], shape=(4,), dtype=float64) tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor([4.6 3.1 1.5 0.2], shape=(4,), dtype=float64) tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor([5.  3.6 1.4 0.2], shape=(4,), dtype=float64) tf.Tensor(0, shape=(), dtype=int64)
"""

从 Pandas DataFrame构建数据管道

# 从Pandas DataFrame构建数据管道
import tensorflow as tf
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
dfiris = pd.DataFrame(iris['data'], columns=iris.feature_names)
ds2 = tf.data.Dataset.from_tensor_slices((dfiris.to_dict('list'), iris['target']))

for features, label in ds2.take(3):
    tf.print(features, label)
    
"""
{'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'sepal length (cm)': 5.1,
 'sepal width (cm)': 3.5} 0
{'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'sepal length (cm)': 4.9,
 'sepal width (cm)': 3} 0
{'petal length (cm)': 1.3,
 'petal width (cm)': 0.2,
 'sepal length (cm)': 4.7,
 'sepal width (cm)': 3.2} 0
"""

%pprint on
for features, label in ds2.take(3):
    print(features, label)
    
"""
Pretty printing has been turned ON
{'sepal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=5.1>, 'sepal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=3.5>, 'petal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=1.4>, 'petal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>} tf.Tensor(0, shape=(), dtype=int64)
{'sepal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=4.9>, 'sepal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=3.0>, 'petal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=1.4>, 'petal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>} tf.Tensor(0, shape=(), dtype=int64)
{'sepal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=4.7>, 'sepal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=3.2>, 'petal length (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=1.3>, 'petal width (cm)': <tf.Tensor: shape=(), dtype=float32, numpy=0.2>} tf.Tensor(0, shape=(), dtype=int64)
"""

从Python generator构建数据管道

# 从Python generator构建数据管道
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 定义一个从文件中读取图片的generator
image_generator = ImageDataGenerator(rescale=1.0/255).flow_from_directory(
    './data/cifar2/test/',
    target_size=(32, 32),
    batch_size=20,
    class_mode='binary'
)
classdict = image_generator.class_indices
print(classdict)

def generator():
    """Re-yield every (features, label) batch produced by `image_generator`.

    Wrapping the iterator in a plain generator function gives
    `tf.data.Dataset.from_generator` the zero-argument callable it expects.
    """
    for batch in image_generator:
        # Each batch is unpacked into exactly two parts and re-packed as a tuple.
        images, labels = batch
        yield images, labels
        
ds3 = tf.data.Dataset.from_generator(generator, output_types=(tf.float32, tf.int32))

"""
Found 2000 images belonging to 2 classes.
{'airplane': 0, 'automobile': 1}
"""

%matplotlib inline
%config InlineBackend.figure_format='svg'

plt.figure(figsize=(6, 6))
for i, (img, label) in enumerate(ds3.unbatch().take(9)):
    ax = plt.subplot(3, 3, i+1)
    ax.imshow(img.numpy())
    ax.set_title('label = %d' % label)
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()

从csv文件构建数据管道

# 从csv文件构建数据管道
ds4 = tf.data.experimental.make_csv_dataset(
    file_pattern=['./data/titanic/train.csv', './data/titanic/test.csv'],
    batch_size=3,
    label_name='Survived',
    na_value='',
    num_epochs=1,
    ignore_errors=True
)

for data, label in ds4.take(2):
    tf.print(data, label)
    
"""
OrderedDict([('PassengerId', [32 2 41]),
             ('Pclass', ["" "" ""]),
             ('Name', ["S" "Q" "S"]),
             ('Sex', [8.3625 29.125 14.1083]),
             ('Age', ["Gronnestad, Mr. Daniel Danielsen" "Rice, Master. Eugene" "Hansen, Mr. Claus Peter"]),
             ('SibSp', [0 1 0]),
             ('Parch', [770 17 861]),
             ('Ticket', [3 3 3]),
             ('Fare', ["male" "male" "male"]),
             ('Cabin', [0 4 2]),
             ('Embarked', ["8471" "382652" "350026"])]) [0 0 0]
OrderedDict([('PassengerId', [0 14 42]),
             ('Pclass', ["" "" ""]),
             ('Name', ["Q" "S" "S"]),
             ('Sex', [15.5 7.8542 52]),
             ('Age', ["O\'Brien, Mrs. Thomas (Johanna \"Hannah\" Godfrey)" "Vestrom, Miss. Hulda Amanda Adolfina" "Holverson, Mr. Alexander Oskar"]),
             ('SibSp', [0 0 0]),
             ('Parch', [187 15 36]),
             ('Ticket', [3 3 1]),
             ('Fare', ["female" "female" "male"]),
             ('Cabin', [1 0 1]),
             ('Embarked', ["370365" "350406" "113789"])]) [1 0 0]
"""

从文本文件构建数据管道

# 从文本文件构建数据管道
ds5 = tf.data.TextLineDataset(
    filenames=['./data/titanic/train.csv', './data/titanic/test.csv'],
).skip(1)  # 略去第一行header

for line in ds5.take(5):
    tf.print(line)

"""
493,0,1,"Molson, Mr. Harry Markland",male,55.0,0,0,113787,30.5,C30,S
53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
388,1,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S
192,0,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0,,S
687,0,3,"Panula, Mr. Jaako Arnold",male,14.0,4,1,3101295,39.6875,,S
"""

从文件路径构建数据管道

# 从文件路径构建数据管道
ds6 = tf.data.Dataset.list_files('./data/cifar2/train/*/*.jpg')
for file in ds6.take(5):
    tf.print(file)
    
"""
./data/cifar2/train/automobile/964.jpg
./data/cifar2/train/airplane/2174.jpg
./data/cifar2/train/automobile/3573.jpg
./data/cifar2/train/automobile/781.jpg
./data/cifar2/train/automobile/902.jpg
"""

def load_image(img_path, size=(32, 32)):
    """Read one JPEG file and return it resized, together with its class label.

    Args:
        img_path: Path of the image file; the label is 1 when the path contains
            an ``/automobile/`` directory component and 0 otherwise.
        size: Target (height, width) passed to ``tf.image.resize``.

    Returns:
        A tuple ``(img, label)``.
    """
    is_automobile = tf.strings.regex_full_match(img_path, '.*/automobile/.*')
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes)  # the files are JPEG-encoded
    resized = tf.image.resize(decoded, size)
    label = 1 if is_automobile else 0
    return (resized, label)

%matplotlib inline
%config InlineBackend.figure_format='svg'
for i, (img, label) in enumerate(ds6.map(load_image).take(2)):
    plt.figure(i, figsize=(1, 1))
    plt.imshow((img/255.0).numpy())
    plt.title('label = %d' % label)
    plt.xticks([])
    plt.yticks([])

从tfrecords文件构建数据管道

# 从tfrecords文件构建数据管道
import os
import numpy as np

def create_tfrecords(inpath, outpath):
    """Serialize a directory of per-class image folders into one TFRecord file.

    Args:
        inpath: Directory of the raw data. Each sub-directory is treated as one
            class and every file inside it as one sample of that class.
        outpath: Path of the TFRecord file to write.

    Every record is a ``tf.train.Example`` with two features:
    ``'label'`` (int64, index of the class directory in sorted order) and
    ``'img_raw'`` (bytes, the undecoded content of the image file).
    """
    # os.listdir returns entries in arbitrary order, so sort to make the
    # label index of each class reproducible. Non-directory entries (e.g.
    # hidden files) are skipped so they neither break the inner listdir nor
    # consume a label index.
    class_names = sorted(
        name for name in os.listdir(inpath)
        if os.path.isdir(os.path.join(inpath, name))
    )
    # The context manager closes the writer even if a read or write fails.
    with tf.io.TFRecordWriter(outpath) as writer:
        for index, name in enumerate(class_names):
            class_path = os.path.join(inpath, name)
            for img_name in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_name)
                img = tf.io.read_file(img_path)
                example = tf.train.Example(features=tf.train.Features(feature={
                    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
                    'img_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img.numpy()]))
                }))
                writer.write(example.SerializeToString())


# The output is a file, so the path carries no trailing slash; this is the same
# path that the TFRecordDataset reader opens.
create_tfrecords('./data/cifar2/test/', './data/cifar2_test.tfrecords')

def parse_example(proto):
    """Decode one serialized ``tf.train.Example`` into an ``(img, label)`` pair.

    Args:
        proto: A scalar string holding a serialized Example with a string
            feature ``'img_raw'`` and an int64 feature ``'label'``.

    Returns:
        A tuple of the JPEG-decoded image resized to 32x32 and the label.
    """
    feature_spec = {
        'label': tf.io.FixedLenFeature([], tf.int64),
        'img_raw': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(proto, feature_spec)
    decoded = tf.image.decode_jpeg(parsed['img_raw'])  # payload is JPEG-encoded
    resized = tf.image.resize(decoded, (32, 32))
    return (resized, parsed['label'])

ds7 = tf.data.TFRecordDataset('./data/cifar2_test.tfrecords').map(parse_example).shuffle(3000)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.figure(figsize=(6, 6))
for i, (img, label) in enumerate(ds7.take(9)):
    ax=plt.subplot(3,3,i+1)
    ax.imshow((img/255.0).numpy())
    ax.set_title("label = %d"%label)
    ax.set_xticks([])
    ax.set_yticks([]) 
plt.show()

应用数据转换#

Dataset数据结构应用非常灵活，因为它本质上是一个Sequence序列，其每个元素可以是各种类型，例如可以是张量，列表，字典，也可以是Dataset。

Dataset包含了非常丰富的数据转换功能。

map: 将转换函数映射到数据集每一个元素。

flat_map: 将转换函数映射到数据集的每一个元素，并将嵌套的Dataset压平。

interleave: 效果类似flat_map,但可以将不同来源的数据夹在一起。

filter: 过滤掉某些元素。

zip: 将两个长度相同的Dataset横向铰合。

concatenate: 将两个Dataset纵向连接。

reduce: 执行归并操作。

batch : 构建批次，每次放一个批次。比原始数据增加一个维度。其逆操作为unbatch。

padded_batch: 构建批次，类似batch, 但可以填充到相同的形状。

window :构建滑动窗口，返回Dataset of Dataset.

shuffle: 数据顺序洗牌。

repeat: 重复数据若干次，不带参数时，重复无数次。

shard: 采样，从某个位置开始隔固定距离采样一个元素。

take: 采样，从开始位置取前几个元素。

# map:将转换函数映射到数据集每一个元素
ds = tf.data.Dataset.from_tensor_slices(['hello world', 'hello China', 'hello ShenZhen'])
ds_map = ds.map(lambda x: tf.strings.split(x, ' '))
for x in ds_map:
    tf.print(x)
    
tf.print(' ')
for x in ds_map:
    print(x)
    
"""
["hello" "world"]
["hello" "China"]
["hello" "ShenZhen"]
 
tf.Tensor([b'hello' b'world'], shape=(2,), dtype=string)
tf.Tensor([b'hello' b'China'], shape=(2,), dtype=string)
tf.Tensor([b'hello' b'ShenZhen'], shape=(2,), dtype=string)
"""

# filter: drop the elements for which the predicate is False
ds = tf.data.Dataset.from_tensor_slices(['hello world', 'hello China', 'hello ShenZhen'])
# Keep only the elements that contain the letter "a" or "S".
# Inside a character class "|" is an ordinary character, so the class is
# written [aS]; writing [a|S] would also match a literal "|".
ds_filter = ds.filter(lambda x: tf.strings.regex_full_match(x, '.*[aS].*'))
for x in ds_filter:
    tf.print(x)
    
"""
hello China
hello ShenZhen
"""

# zip:将两个长度相同的Dataset横向铰合
ds1 = tf.data.Dataset.range(0, 3)
ds2 = tf.data.Dataset.range(3, 6)
ds3 = tf.data.Dataset.range(6, 9)
ds_zip = tf.data.Dataset.zip((ds1, ds2, ds3))
tf.print(ds_zip)
tf.print()
for x, y, z in ds_zip:
    print('x=', x.numpy(), ',y=', y.numpy(), ',z=', z.numpy())
    
"""
<ZipDataset shapes: ((), (), ()), types: (tf.int64, tf.int64, tf.int64)>

x= 0 ,y= 3 ,z= 6
x= 1 ,y= 4 ,z= 7
x= 2 ,y= 5 ,z= 8
"""

# concatenate: join two Datasets end to end
ds1 = tf.data.Dataset.range(0, 3)
ds2 = tf.data.Dataset.range(3, 6)

# Elements of ds1 come first, followed by the elements of ds2.
ds_concat = ds1.concatenate(ds2)
for x in ds_concat:
    tf.print(x)
    
"""
0
1
2
3
4
5
"""

# reduce:执行归并操作
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5.0])
result = ds.reduce(0.0, lambda x, y: tf.add(x, y))
tf.print(result)

"""
15
"""

# batch:构建批次，每次放一个批次。比原始数据增加一个维度。其逆操作为unbatch
ds = tf.data.Dataset.range(12)
ds_batch = ds.batch(4)
for x in ds_batch:
    print(x)
    
"""
tf.Tensor([0 1 2 3], shape=(4,), dtype=int64)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int64)
tf.Tensor([ 8  9 10 11], shape=(4,), dtype=int64)
"""

# padded_batch:构建批次，类似batch，但可以填充到相同的形状
elements = [[1, 2], [3, 4, 5], [6, 7], [8]]
ds = tf.data.Dataset.from_generator(lambda: iter(elements), tf.int32)
ds_padding_batch = ds.padded_batch(2, padded_shapes=[4,])
for x in ds_padding_batch:
    print(x)
    
"""
tf.Tensor(
[[1 2 0 0]
 [3 4 5 0]], shape=(2, 4), dtype=int32)
tf.Tensor(
[[6 7 0 0]
 [8 0 0 0]], shape=(2, 4), dtype=int32)
"""

# window:构建滑动窗口，返回Dataset of Dataset
ds = tf.data.Dataset.range(12)
# window返回的是Dataset of Dataset，可以用flat_map压平
ds_window = ds.window(3, shift=1).flat_map(lambda x: x.batch(3, drop_remainder=True))
for x in ds_window:
    print(x)
    
"""
tf.Tensor([0 1 2], shape=(3,), dtype=int64)
tf.Tensor([1 2 3], shape=(3,), dtype=int64)
tf.Tensor([2 3 4], shape=(3,), dtype=int64)
tf.Tensor([3 4 5], shape=(3,), dtype=int64)
tf.Tensor([4 5 6], shape=(3,), dtype=int64)
tf.Tensor([5 6 7], shape=(3,), dtype=int64)
tf.Tensor([6 7 8], shape=(3,), dtype=int64)
tf.Tensor([7 8 9], shape=(3,), dtype=int64)
tf.Tensor([ 8  9 10], shape=(3,), dtype=int64)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int64)
"""

# shuffle：数据顺序洗牌
ds = tf.data.Dataset.range(12)
ds_shuffle = ds.shuffle(buffer_size=5)
for x in ds_shuffle:
    print(x)
    
"""
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(6, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(10, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(11, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)
"""

# repeat:重复数据若干次，不带参数时，重复无数次
ds = tf.data.Dataset.range(3)
ds_repeat = ds.repeat(3)
for x in ds_repeat:
    print(x)
    
"""
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
"""

# shard:采样，从某个位置开始隔固定距离采样一个元素
ds = tf.data.Dataset.range(12)
ds_shard = ds.shard(3, index=1)

for x in ds_shard:
    print(x)
    
"""
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(4, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(10, shape=(), dtype=int64)
"""

# take:采样，从开始位置取前几个元素
ds = tf.data.Dataset.range(12)
ds_take = ds.take(3)
list(ds_take.as_numpy_iterator())

"""
[0, 1, 2]
"""

提升管道性能#

训练深度学习模型常常会非常耗时。

模型训练的耗时主要来自于两个部分，一部分来自数据准备，另一部分来自参数迭代。

参数迭代过程的耗时通常依赖于GPU来提升。

而数据准备过程的耗时则可以通过构建高效的数据管道进行提升。

以下是一些构建高效数据管道的建议。

1、使用 prefetch 方法让数据准备和参数迭代两个过程相互并行。

2、使用 interleave 方法可以让数据读取过程多进程执行，并将不同来源数据夹在一起。

3、使用 map 时设置num_parallel_calls 让数据转换过程多进程执行。

4、使用 cache 方法让数据在第一个epoch后缓存到内存中，仅限于数据集不大情形。

5、使用 map 转换时，先batch, 然后采用向量化的转换方法对每个batch进行转换。

使用 prefetch 方法让数据准备和参数迭代两个过程相互并行

import tensorflow as tf

# Print a separator line that ends with the current time.
@tf.function
def printbar():
    """Print a rule of 80 '=' characters followed by the time as HH:MM:SS.

    The time of day is derived from ``tf.timestamp()``; 8 is added to the hour
    before wrapping at 24 (presumably to display UTC+8 - confirm for your zone).
    """
    seconds_today = tf.timestamp() % (24 * 60 * 60)

    hour = tf.cast(seconds_today // 3600 + 8, tf.int32) % tf.constant(24)
    minute = tf.cast((seconds_today % 3600) // 60, tf.int32)
    second = tf.cast(tf.floor(seconds_today % 60), tf.int32)

    def two_digits(value):
        # Left-pad single-character values with "0" so each field has width 2.
        rendered = tf.strings.format("{}", value)
        if tf.strings.length(rendered) == 1:
            return tf.strings.format("0{}", value)
        else:
            return rendered

    fields = [two_digits(hour), two_digits(minute), two_digits(second)]
    clock = tf.strings.join(fields, separator=":")
    tf.print("==========" * 8, end="")
    tf.print(clock)

import time

# 数据准备和参数迭代两个过程默认情况下是串行的
def generator(num_items=10, delay=2):
    """Yield 0 .. num_items-1, sleeping before each item to mimic slow data prep.

    Args:
        num_items: How many integers to yield (default 10, as in this demo).
        delay: Seconds to sleep before yielding each item (default 2).

    Yields:
        The integers 0, 1, ..., num_items - 1 in order.
    """
    for i in range(num_items):
        # Pretend that preparing each item takes `delay` seconds.
        time.sleep(delay)
        yield i
ds = tf.data.Dataset.from_generator(generator, output_types=(tf.int32))

# 模拟参数迭代

def train_step(step_time=1):
    """Simulate one parameter update by sleeping.

    Args:
        step_time: Seconds the simulated training step takes (default 1).
    """
    time.sleep(step_time)

# 训练过程预计耗时：10*2 + 10*1 = 30s
printbar()
tf.print(tf.constant('start traing...'))
for x in ds:
    train_step()
printbar()
tf.print(tf.constant('end training...'))

"""
================================================================================11:30:19
start traing...
================================================================================11:30:49
end training...
"""

# 使用prefetch方法让数据准备和参数迭代两个过程相互并行
# 训练过程预计耗时 max(10*2, 10*1) = 20s
printbar()
tf.print(tf.constant('start training with prefetch...'))
# tf.data.experimental.AUTOTUNE 可以让程序自动选择合适的参数
for x in ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE):
    train_step()
printbar()
tf.print(tf.constant("end training..."))

"""
================================================================================11:33:31
start training with prefetch...
================================================================================11:33:52
end training...
"""

使用 interleave 方法可以让数据读取过程多进程执行,并将不同来源数据夹在一起

# 使用interleave方法可以让数据读取过程多进程执行，并将不同来源数据夹在一起
ds_files = tf.data.Dataset.list_files('./data/titanic/*.csv')
ds = ds_files.interleave(lambda x: tf.data.TextLineDataset(x).skip(1))
for line in ds.take(4):
    print(line)
    
"""
tf.Tensor(b'493,0,1,"Molson, Mr. Harry Markland",male,55.0,0,0,113787,30.5,C30,S', shape=(), dtype=string)
tf.Tensor(b'181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S', shape=(), dtype=string)
tf.Tensor(b'53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C', shape=(), dtype=string)
tf.Tensor(b'405,0,3,"Oreskovic, Miss. Marija",female,20.0,0,0,315096,8.6625,,S', shape=(), dtype=string)
"""

ds_files = tf.data.Dataset.list_files('./data/titanic/*.csv')
ds = ds_files.flat_map(lambda x: tf.data.TextLineDataset(x).skip(1))
for line in ds.take(4):
    print(line)
    
"""
tf.Tensor(b'181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S', shape=(), dtype=string)
tf.Tensor(b'405,0,3,"Oreskovic, Miss. Marija",female,20.0,0,0,315096,8.6625,,S', shape=(), dtype=string)
tf.Tensor(b'635,0,3,"Skoog, Miss. Mabel",female,9.0,3,2,347088,27.9,,S', shape=(), dtype=string)
tf.Tensor(b'701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.525,C62 C64,C', shape=(), dtype=string)
"""

使用 map 时设置num_parallel_calls 让数据转换过程多进程执行

ds = tf.data.Dataset.list_files('./data/cifar2/train/*/*.jpg')

def load_imagee(img_path, size=(32, 32)):
    """Read one JPEG file and return it resized, together with its class label.

    Args:
        img_path: Path of the image file; the label is 1 when the path contains
            an ``/automobile/`` directory component and 0 otherwise.
        size: Target (height, width) passed to ``tf.image.resize``.

    Returns:
        A tuple ``(img, label)``.
    """
    # NOTE(review): the ds.map(...) calls that follow use `load_image`, not this
    # function - confirm which name is intended.
    label = 1 if tf.strings.regex_full_match(img_path, '.*/automobile/.*') else 0
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img)  # the files are JPEG-encoded
    # resize takes the image tensor and the target size, not two bare integers.
    img = tf.image.resize(img, size)
    return (img, label)

# 单进程转换
printbar()
tf.print(tf.constant('start transformation...'))
ds_map = ds.map(load_image)
for _ in ds_map:
    pass

printbar()
tf.print(tf.constant('end transformation...'))

"""
================================================================================11:50:11
start transformation...
================================================================================11:50:13
end transformation...
"""

# 多进程转换
printbar()
tf.print(tf.constant('start transformation...'))

ds_map_parallel = ds.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

for _ in ds_map_parallel:
    pass

printbar()
tf.print(tf.constant("end parallel transformation..."))

"""
================================================================================11:51:29
start transformation...
================================================================================11:51:29
end parallel transformation...
"""

使用 cache 方法让数据在第一个epoch后缓存到内存中，仅限于数据集不大情形

# 使用cache方法让数据在第一个epoch后缓存到内存中，仅限于数据集不大情形
import time

# 模拟数据准备
def generator(num_items=5, delay=2):
    """Yield 0 .. num_items-1, sleeping before each item to mimic slow data prep.

    Args:
        num_items: How many integers to yield (default 5, as in this demo).
        delay: Seconds to sleep before yielding each item (default 2).

    Yields:
        The integers 0, 1, ..., num_items - 1 in order.
    """
    for i in range(num_items):
        # Pretend that preparing each item takes `delay` seconds.
        time.sleep(delay)
        yield i
ds = tf.data.Dataset.from_generator(generator, output_types=tf.int32)

# 模拟参数迭代

def train_step():
    """Simulate one training step that takes no time (a no-op)."""
    return None

# 训练过程预计耗时 (5*2 + 5*0) * 3 = 30s
printbar()
tf.print(tf.constant('start training...'))
for epoch in tf.range(3):
    for x in ds:
        train_step()
    printbar()
    tf.print('epoch=', epoch, 'ended')
printbar()
tf.print(tf.constant("end training..."))

"""
================================================================================12:59:01
start training...
================================================================================12:59:11
epoch= 0 ended
================================================================================12:59:21
epoch= 1 ended
================================================================================12:59:31
epoch= 2 ended
================================================================================12:59:31
end training...
"""

import time

# 模拟数据准备
def generator(num_items=5, delay=2):
    """Yield 0 .. num_items-1, sleeping before each item to mimic slow data prep.

    Args:
        num_items: How many integers to yield (default 5, as in this demo).
        delay: Seconds to sleep before yielding each item (default 2).

    Yields:
        The integers 0, 1, ..., num_items - 1 in order.
    """
    for i in range(num_items):
        # Pretend that preparing each item takes `delay` seconds.
        time.sleep(delay)
        yield i
        
# 使用cache方法让数据在第一个epoch后缓存到内存中，仅限于数据集不大情形
ds = tf.data.Dataset.from_generator(generator, output_types=tf.int32).cache()

# 模拟参数迭代
def train_step():
    """Simulate one parameter update that costs zero seconds."""
    step_seconds = 0  # assumed duration of a single training step
    time.sleep(step_seconds)
    
# 训练过程预计耗时(5*2 + 5*0) + (5*0+5*0)*2 = 10s
printbar()
tf.print(tf.constant("start training..."))
for epoch in tf.range(3):
    for x in ds:
        train_step()  
    printbar()
    tf.print("epoch =",epoch," ended")
printbar()
tf.print(tf.constant("end training..."))

"""
================================================================================13:05:12
start training...
================================================================================13:05:22
epoch = 0  ended
================================================================================13:05:22
epoch = 1  ended
================================================================================13:05:22
epoch = 2  ended
================================================================================13:05:22
end training...
"""

使用 map转换时，先batch, 然后采用向量化的转换方法对每个batch进行转换

# 使用map转换时，先batch，然后采用向量化的转换方法对每个batch进行转换
ds = tf.data.Dataset.range(100000)
ds_map_batch = ds.map(lambda x: x**2).batch(20)

printbar()
tf.print(tf.constant('start scaler transformation...'))
for x in ds_map_batch:
    pass
printbar()
tf.print(tf.constant('end scaler transformation'))

"""
================================================================================13:09:18
start scaler transformation...
================================================================================13:09:19
end scaler transformation
"""

# 先batch后map
ds = tf.data.Dataset.range(100000)
ds_batch_map = ds.batch(20).map(lambda x: x**2)

printbar()
tf.print(tf.constant('start vector transformation...'))
for x in ds_batch_map:
    pass
printbar()
tf.print(tf.constant("end vector transformation..."))

"""
================================================================================13:12:05
start vector transformation...
================================================================================13:12:05
end vector transformation...
"""