Classifying movie reviews with Keras

Loading the IMDB dataset

from tensorflow.keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
train_data.shape
(25000,)
train_data[0]
[1,
 14,
 22,
 16,
 43,
 ...
 ...
 ...
 178,
 32]

Each item in train_data is a list like this one: the integers are word indices, and each number corresponds to one word in the vocabulary.
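As a quick sanity check (a minimal sketch; it relies only on the num_words=10000 cap used above), no review should contain an index above 9999:

# Because of num_words=10000, the largest word index is 9999
max([max(sequence) for sequence in train_data])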

The following code decodes a review back into English words.

# word_index is a dictionary mapping words to integer indices
word_index = imdb.get_word_index()
# reverse_word_index: integer -> word
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()]
)
# Note that the indices are offset by 3, because 0, 1, and 2
# are reserved for "padding", "start of sequence",
# and "unknown" respectively
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
decoded_review
"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all"

Encoding the integer sequences as a binary matrix (one-hot encoding)

import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    # Create an all-zero matrix `results` of shape
    # (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # Fancy indexing: set row i to 1 at every index in `sequence`
        results[i, sequence] = 1
    return results
# PS: the rows and columns may look swapped at first glance, but they are
# fine: row i is the i-th sample, and each column is a word index
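One practical note: np.zeros defaults to float64, so the vectorized training set alone takes 25000 × 10000 × 8 bytes ≈ 2 GB; passing dtype='float32' to np.zeros would halve that if memory is tight.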
#have a try
vectorize_sequences(train_data[0])
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Recall that the first element of train_data[0] is 1, so in the first row the entry at index 1 is 1 and all others are 0.
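We can confirm that with np.nonzero (a quick sketch):

v = vectorize_sequences(train_data[0])
np.nonzero(v[0])  # -> (array([1]),) : only index 1 of row 0 is set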

# Now vectorize the training and test data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
x_train.shape
(25000, 10000)
# Vectorize the labels as well
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
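A quick check (sketch) that the labels are now floats, matching the sigmoid output the network will produce:

print(y_train.dtype)       # float32
print(np.unique(y_train))  # [0. 1.]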

Building the network

# Model definition
from tensorflow.keras import models, layers
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
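To see what this stack looks like, model.summary() lists each layer's output shape and parameter count (the counts below are simple arithmetic: inputs × units + biases):

model.summary()
# Dense(16): 10000*16 + 16 = 160,016 params
# Dense(16): 16*16 + 16    = 272 params
# Dense(1):  16*1 + 1      = 17 params  -> 160,305 total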
# Compile the model
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Configuring the optimizer explicitly
# (learning_rate replaces the deprecated lr argument)
from tensorflow.keras import optimizers
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Using custom losses and metrics (passed as function objects)
from tensorflow.keras import losses, metrics
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])
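Both arguments above are still Keras built-ins, just passed as function objects. Going one step further (a sketch of a genuinely custom metric; my_binary_accuracy is my own name, not a Keras API), any function taking (y_true, y_pred) and returning a tensor will work:

import tensorflow as tf

def my_binary_accuracy(y_true, y_pred):
    # Fraction of predictions on the correct side of the 0.5 threshold
    return tf.reduce_mean(
        tf.cast(tf.equal(y_true, tf.round(y_pred)), tf.float32))

model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[my_binary_accuracy])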

Training and validating the model

# Set aside a validation set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]

y_val = y_train[:10000]
partial_y_train = y_train[10000:]
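Equivalently (an aside, not what this walkthrough does), fit has a validation_split argument that carves out the split itself; note it holds out the last fraction of the data rather than the first 10000 samples:

# history = model.fit(x_train, y_train, epochs=20, batch_size=512,
#                     validation_split=0.4)  # last 40% held out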
# Train the model, validating on the held-out set as we go
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
Epoch 1/20
30/30 [==============================] - 1s 29ms/step - loss: 0.0046 - acc: 0.9989 - val_loss: 1.3084 - val_acc: 0.8560
Epoch 2/20
30/30 [==============================] - 1s 24ms/step - loss: 8.3249e-05 - acc: 1.0000 - val_loss: 1.3129 - val_acc: 0.8571
Epoch 3/20
30/30 [==============================] - 1s 25ms/step - loss: 6.5058e-05 - acc: 1.0000 - val_loss: 1.3250 - val_acc: 0.8568
Epoch 4/20
30/30 [==============================] - 1s 24ms/step - loss: 5.2597e-05 - acc: 1.0000 - val_loss: 1.3834 - val_acc: 0.8561
Epoch 5/20
30/30 [==============================] - 1s 26ms/step - loss: 7.6493e-04 - acc: 0.9998 - val_loss: 1.4176 - val_acc: 0.8562
Epoch 6/20
30/30 [==============================] - 1s 26ms/step - loss: 3.2046e-05 - acc: 1.0000 - val_loss: 1.4258 - val_acc: 0.8561
Epoch 7/20
30/30 [==============================] - 1s 31ms/step - loss: 2.9245e-05 - acc: 1.0000 - val_loss: 1.4446 - val_acc: 0.8557
Epoch 8/20
30/30 [==============================] - 1s 26ms/step - loss: 2.5217e-05 - acc: 1.0000 - val_loss: 1.4725 - val_acc: 0.8563
Epoch 9/20
30/30 [==============================] - 1s 26ms/step - loss: 0.0011 - acc: 0.9996 - val_loss: 1.5182 - val_acc: 0.8561
Epoch 10/20
30/30 [==============================] - 1s 26ms/step - loss: 1.5805e-05 - acc: 1.0000 - val_loss: 1.5262 - val_acc: 0.8555
Epoch 11/20
30/30 [==============================] - 1s 24ms/step - loss: 1.4758e-05 - acc: 1.0000 - val_loss: 1.5354 - val_acc: 0.8553
Epoch 12/20
30/30 [==============================] - 1s 24ms/step - loss: 1.3594e-05 - acc: 1.0000 - val_loss: 1.5507 - val_acc: 0.8546
Epoch 13/20
30/30 [==============================] - 1s 25ms/step - loss: 1.1385e-05 - acc: 1.0000 - val_loss: 1.5909 - val_acc: 0.8551
Epoch 14/20
30/30 [==============================] - 1s 25ms/step - loss: 8.7676e-04 - acc: 0.9997 - val_loss: 1.6677 - val_acc: 0.8538
Epoch 15/20
30/30 [==============================] - 1s 24ms/step - loss: 1.1577e-05 - acc: 1.0000 - val_loss: 1.6620 - val_acc: 0.8560
Epoch 16/20
30/30 [==============================] - 1s 36ms/step - loss: 8.7293e-06 - acc: 1.0000 - val_loss: 1.6632 - val_acc: 0.8555
Epoch 17/20
30/30 [==============================] - 1s 27ms/step - loss: 7.2850e-06 - acc: 1.0000 - val_loss: 1.6734 - val_acc: 0.8555
Epoch 18/20
30/30 [==============================] - 1s 26ms/step - loss: 6.0855e-06 - acc: 1.0000 - val_loss: 1.7012 - val_acc: 0.8552
Epoch 19/20
30/30 [==============================] - 1s 24ms/step - loss: 1.4016e-04 - acc: 0.9999 - val_loss: 1.7631 - val_acc: 0.8541
Epoch 20/20
30/30 [==============================] - 1s 24ms/step - loss: 4.7701e-06 - acc: 1.0000 - val_loss: 1.7625 - val_acc: 0.8558
# Plot the training and validation loss
import matplotlib.pyplot as plt
history_dict = history.history
# print(history_dict.keys())  # dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

epochs = range(1, len(loss_values) + 1)  # the training epochs

plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

[Figure 1.png: training and validation loss]

# Plot the training and validation accuracy
plt.clf()
acc = history_dict['acc']
val_acc = history_dict['val_acc']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

[Figure 2.png: training and validation accuracy]

The model has overfit!

Looking at the loss curves, the validation loss is lowest around epoch 4, so next we rebuild the model and retrain it with epochs=4. (A note on the log above: the training loss is already near zero at epoch 1, most likely because the fit cell was re-run on an already-trained model; on a freshly initialized model the validation loss bottoms out around epoch 4.)
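As an aside (a sketch; this walkthrough simply retrains for 4 epochs), Keras can also pick the stopping point automatically with an EarlyStopping callback:

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2,
                           restore_best_weights=True)
# model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
#           validation_data=(x_val, y_val), callbacks=[early_stop])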

# Retrain a fresh model from scratch
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)
Epoch 1/4
49/49 [==============================] - 1s 13ms/step - loss: 0.4649 - accuracy: 0.8296
Epoch 2/4
49/49 [==============================] - 1s 14ms/step - loss: 0.2706 - accuracy: 0.9058
Epoch 3/4
49/49 [==============================] - 1s 13ms/step - loss: 0.2064 - accuracy: 0.9260
Epoch 4/4
49/49 [==============================] - 1s 13ms/step - loss: 0.1738 - accuracy: 0.9366
782/782 [==============================] - 1s 2ms/step - loss: 0.2923 - accuracy: 0.8841
results
[0.29230669140815735, 0.8840799927711487]
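Here results is [test loss, test accuracy] (the order follows model.metrics_names), so the final model reaches about 88.4% accuracy on the test set.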
# Predict the probability that each test review is positive
model.predict(x_test)
array([[0.16965643],
       [0.99986595],
       [0.86651677],
       ...,
       [0.101327  ],
       [0.05761746],
       [0.545027  ]], dtype=float32)
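Each row is the predicted probability that the corresponding review is positive. To turn these into hard 0/1 labels (a sketch), threshold at 0.5:

preds = (model.predict(x_test) > 0.5).astype('int32')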