In [1]:
import sys
sys.path.append("../loader")
# 载入 CASIA 的 MPF 
from casia.feature import CASIAFeature

载入一些必备包

In [2]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import Model

载入 HDF5 数据并将其转换为训练集与测试集：

In [3]:
hdf_path = 'data/features.h5'
mpf_dataset = CASIAFeature(hdf_path)
# 划分数据集为训练集与测试集
train_features = np.concatenate([features for features, _ in mpf_dataset.train_iter()])
train_labels = np.concatenate([labels for _, labels in mpf_dataset.train_iter()])
test_features = np.concatenate([features for features, _ in mpf_dataset.test_iter()])
test_labels = np.concatenate([labels for _, labels in mpf_dataset.test_iter()])

分别获取测试集与训练集的样本数：

In [4]:
len(test_labels), len(train_labels)

(1072276, 4299331)

### 获取数据的标签及其类别个数

In [5]:
# 获取训练集的类别名称集
train_class_names = set(train_labels)
# 获取训练测试集的类别名称集
test_class_names = set(test_labels)
is_same_class = "相同" if train_class_names == test_class_names else "不相同"
print(f'训练集与测试集的类别标签是{is_same_class}的')

训练集与测试集的类别标签是相同的


由于训练集与测试集的类别标签是相同的，所以下面可以将类别名称写作：

In [6]:
class_names = test_class_names
# 获取类别个数
CLASS_NUM = len(class_names)
print('类别个数：', CLASS_NUM )

类别个数： 3755


将类别名称转换为 编号 -> 类别名称 的映射关系

In [7]:
cat_dict = dict(enumerate(class_names))
cat_dict[7]

'号'

## 选择指定类别名称列表的数据集迭代器：

In [8]:
from casia.feature import SubCASIA

class_names = ['登', '印', '枕', '孤', '美', '好', '琴', '驱', '吞', '山']
mpf_dataset = SubCASIA(class_names, hdf_path)

统计有多少个 MPF 文件（即有多少人）：

In [9]:
n_train = 0
for features, labels in mpf_dataset.sub_train_iter():
    n_train += 1
    
n_train

1152

统计有多少个样本：

In [10]:
n_train = 0
for features, labels in mpf_dataset.sub_train_iter():
    n_train += len(labels) 
n_test = 0
for features, labels in mpf_dataset.sub_test_iter():
    n_test += len(labels)
print(f"训练集的样本数 {n_train}，测试集的样本数 {n_test}")

训练集的样本数 11500，测试集的样本数 2869


重置 cat_dict：

In [11]:
cat_dict = dict(enumerate(class_names))
# 重新划分数据集为训练集与测试集
train_features = np.concatenate([features for features, _ in mpf_dataset.sub_train_iter()])
train_labels = np.concatenate([labels for _, labels in mpf_dataset.sub_train_iter()])
test_features = np.concatenate([features for features, _ in mpf_dataset.sub_test_iter()])
test_labels = np.concatenate([labels for _, labels in mpf_dataset.sub_test_iter()])

In [12]:
# 查看类别字典
cat_dict

{0: '登',
 1: '印',
 2: '枕',
 3: '孤',
 4: '美',
 5: '好',
 6: '琴',
 7: '驱',
 8: '吞',
 9: '山'}

将标签数组转换为数值型数组：

In [13]:
# 获取 index -> name 的 dict
name2index = {cat:cat_id for cat_id, cat in cat_dict.items()}
# 转换训练集的标签
train_labels = np.array([name2index[cat_name] for cat_name in train_labels])
# 转换测试集的标签
test_labels = np.array([name2index[cat_name] for cat_name in test_labels])

将 NumPy 数组转换为 TensorFlow 的迭代器：

In [14]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

将数据分成批量，且打乱训练集数据：

In [15]:
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

## 定义损失函数，评估函数，优化方法

In [16]:
# 定义损失函数
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
# 定义模型训练的优化方法
optimizer = tf.keras.optimizers.Adam()
# 选择衡量指标来度量模型的损失值（loss）和准确率（accuracy）。这些指标在 epoch 上累积值，然后打印出整体结果
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

## 使用 `tf.GradientTape` 定义如何训练模型以及评估模型性能

In [17]:
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)


@tf.function
def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

## 定义模型

In [18]:
class MyModel(Model):
    def __init__(self, CLASS_NUM):
        super().__init__()
        self.d1 = Dense(300, activation='relu')
        self.d2 = Dense(CLASS_NUM, activation='softmax')

    def call(self, x):
        x = self.d1(x)
        return self.d2(x)


# Create an instance of the model
CLASS_NUM = len(cat_dict)
model = MyModel(CLASS_NUM)

## 训练并评估模型

In [19]:
EPOCHS = 20

for epoch in range(EPOCHS):
    for images, labels in train_dataset:
        train_step(images, labels)

    for test_images, test_labels in test_dataset:
        test_step(test_images, test_labels)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch+1,
                          train_loss.result(),
                          train_accuracy.result()*100,
                          test_loss.result(),
                          test_accuracy.result()*100))

    # Reset the metrics for the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

Epoch 1, Loss: 1.7724343538284302, Accuracy: 92.5478286743164, Test Loss: 3.6418545246124268, Test Accuracy: 72.15057373046875
Epoch 2, Loss: 0.17651645839214325, Accuracy: 97.65217590332031, Test Loss: 0.15243582427501678, Test Accuracy: 97.69954681396484
Epoch 3, Loss: 0.019473550841212273, Accuracy: 99.5130386352539, Test Loss: 0.18716318905353546, Test Accuracy: 98.22237396240234
Epoch 4, Loss: 0.009412109851837158, Accuracy: 99.73912811279297, Test Loss: 0.10833640396595001, Test Accuracy: 98.84977722167969
Epoch 5, Loss: 0.010610833764076233, Accuracy: 99.83478546142578, Test Loss: 0.3134274482727051, Test Accuracy: 96.09619903564453
Epoch 6, Loss: 0.00728351715952158, Accuracy: 99.81739044189453, Test Loss: 0.14625050127506256, Test Accuracy: 98.43151092529297
Epoch 7, Loss: 0.0035412530414760113, Accuracy: 99.88695526123047, Test Loss: 0.16272664070129395, Test Accuracy: 98.08295440673828
Epoch 8, Loss: 0.00792350061237812, Accuracy: 99.86956787109375, Test Loss: 0.123289965093