Uploaded by Jorge Rios

Embedding RecurrentNN

advertisement
In [1]: import keras
from keras import preprocessing, models, layers, datasets, callbacks
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, Dropout, LSTM
from keras.datasets import reuters
Using TensorFlow backend.
In [2]: caracters_max= 8982
long_max=100
batch_size =32
In [3]: print('Loading data...')
(features_train,target_train),(features_test,target_test)=reuters.load_data(nu
m_words=caracters_max)
print("Datos de entrenamiento: {0}, identificadores: {1}".format(len(features_
train),len(target_train)))
print("Datos de test: {0}, identificadores: {1}".format(len(features_test),len
(target_test)))
Loading data...
Datos de entrenamiento: 8982, identificadores: 8982
Datos de test: 2246, identificadores: 2246
In [4]: indice_palabra= reuters.get_word_index()
indice_palabra= {i:(j+3) for i,j in indice_palabra.items()}
indice_palabra["<PAD>"] = 0
indice_palabra["<START>"]=1
indice_palabra["<UNK>"]=2 #palabra desconocida
indice_palabra["<UNUSED>"] = 3
indice_palabra_retornada= dict([(valor, clave) for (clave,valor) in indice_pal
abra.items()])
def decodificar_revision(texto):
return ' '.join([indice_palabra_retornada.get(x, '?')for x in texto])
In [5]: z=20
print('Cantidad de palabras en la revision:', len(features_train[z]),'palabra
s')
print(decodificar_revision(features_train[z]))
Cantidad de palabras en la revision: 231 palabras
<START> leading u s farm state senators are seeking to <UNK> into the <UNK>
<UNK> trade bill a provision that would broaden <UNK> requirements under the
u s agriculture department's export enhancement program eep to include tradi
tional buyers of u s farm products including the soviet union senate staff s
aid under existing criteria usda can offer eep subsidies to <UNK> export mar
kets lost to competing nations' unfair trading practices senate agriculture
committee chairman patrick leahy d <UNK> is leading a group of farm state se
nators in an effort to broaden the criteria in such a way as to enable mosco
w to be eligible for the subsidies sources said the senators including senat
e finance committee chairman lloyd bentsen d tex <UNK> <UNK> d <UNK> david <
UNK> d <UNK> john <UNK> d <UNK> and <UNK> <UNK> r miss also may <UNK> into t
he trade bill a measure to shield pork producers and processors from canadia
n imports the measure sponsored by sen charles <UNK> r iowa would clarify th
e definition of industry in determining whether or not imports were causing
injury to u s producers <UNK> bill stems from a 1985 decision by the interna
tional trade commission that imports from canada of live <UNK> but not fresh
chilled and frozen pork were <UNK> u s producers the bill's proponents have
argued canada has simply replaced shipments of live hogs with fresh pork reu
ter 3
In [6]: print('añadiendo Padding a las secuencias')
padding_train=keras.preprocessing.sequence.pad_sequences(features_train,value=
indice_palabra['<PAD>'],padding='post',maxlen=long_max)
padding_test=keras.preprocessing.sequence.pad_sequences(features_test,value=in
dice_palabra['<PAD>'],padding='post',maxlen=long_max)
añadiendo Padding a las secuencias
In [7]: X=8981
print('Padding añadido a las revisiones')
print('Cantidad de palabras en la revision:', len(padding_train[X]),'palabras'
)
print(decodificar_revision(padding_train[X]))
Padding añadido a las revisiones
Cantidad de palabras en la revision: 100 palabras
their influence on the bullion market in the near future bullion bankers sam
uel montagu and co ltd said in a market report but the firm said silver may
lag behind gold in any <UNK> to movements on foreign exchanges opec's failur
e to address the recent decline in oil prices remains a worrying factor howe
ver and on balance it appears that the market should be approached cautiousl
y montagu said the bank said the us economy has shown no <UNK> long term imp
rovement and that both latin american debt and the iranian arms affair could
undermine confidence in the dollar reuter 3
In [8]: print('dimensiones de los datos de entrenamiento:', padding_train.shape)
print('dimensiones de los datos de prueba:',padding_test.shape)
dimensiones de los datos de entrenamiento: (8982, 100)
dimensiones de los datos de prueba: (2246, 100)
In [9]: #transformacion de etiquetas a categorias
labels_train = keras.utils.to_categorical(target_train)
labels_test = keras.utils.to_categorical(target_test)
In [10]: print('Build model...')
earlystopping=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, m
ode='auto')]
LSTM_model = Sequential()
LSTM_model.add(Embedding(input_dim=caracters_max,output_dim=128,input_length=l
ong_max))
LSTM_model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
LSTM_model.add(Dense(units=46, activation="sigmoid"))
Build model...
WARNING:tensorflow:From C:\Users\riosm\Anaconda3\lib\site-packages\tensorflo
w\python\framework\op_def_library.py:263: colocate_with (from tensorflow.pyt
hon.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From C:\Users\riosm\Anaconda3\lib\site-packages\keras\bac
kend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.op
s.nn_ops) with keep_prob is deprecated and will be removed in a future versi
on.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 keep_prob`.
In [11]: LSTM_model.summary()
_________________________________________________________________
Layer (type)
Output Shape
Param #
=================================================================
embedding_1 (Embedding)
(None, 100, 128)
1149696
_________________________________________________________________
lstm_1 (LSTM)
(None, 128)
131584
_________________________________________________________________
dense_1 (Dense)
(None, 46)
5934
=================================================================
Total params: 1,287,214
Trainable params: 1,287,214
Non-trainable params: 0
_________________________________________________________________
In [12]: LSTM_model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
In [13]: savemodel=LSTM_model.fit(padding_train,labels_train, batch_size=batch_size,
epochs=8, validation_data=(padding_test,labels_test
),callbacks=earlystopping)
WARNING:tensorflow:From C:\Users\riosm\Anaconda3\lib\site-packages\tensorflo
w\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_op
s) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Train on 8982 samples, validate on 2246 samples
Epoch 1/8
8982/8982 [==============================] - 49s 5ms/step - loss: 0.1016 - a
cc: 0.9731 - val_loss: 0.0709 - val_acc: 0.9783
Epoch 2/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0669 - a
cc: 0.9816 - val_loss: 0.0604 - val_acc: 0.9846
Epoch 3/8
8982/8982 [==============================] - 48s 5ms/step - loss: 0.0590 - a
cc: 0.9845 - val_loss: 0.0582 - val_acc: 0.9848
Epoch 4/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0567 - a
cc: 0.9849 - val_loss: 0.0574 - val_acc: 0.9848
Epoch 5/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0547 - a
cc: 0.9865 - val_loss: 0.0554 - val_acc: 0.9871
Epoch 6/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0542 - a
cc: 0.9872 - val_loss: 0.0544 - val_acc: 0.9873
Epoch 7/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0504 - a
cc: 0.9883 - val_loss: 0.0520 - val_acc: 0.9876
Epoch 8/8
8982/8982 [==============================] - 47s 5ms/step - loss: 0.0470 - a
cc: 0.9890 - val_loss: 0.0507 - val_acc: 0.9876
In [14]: test_loss, test_acc = LSTM_model.evaluate(x=padding_test, y=labels_test, batch
_size=batch_size)
print('accuracy en el conjunto de datos de test:', test_acc, 'Perdida de datos
en el dataset de test:', test_loss)
2246/2246 [==============================] - 3s 1ms/step
accuracy en el conjunto de datos de test: 0.9875527866058655 Perdida de dato
s en el dataset de test: 0.05070327855281711
In [15]: import matplotlib
from matplotlib import pyplot
%matplotlib inline
acc=savemodel.history['acc']
val_acc=savemodel.history['acc']
loss=savemodel.history['loss']
val_loss=savemodel.history['val_loss']
epochs= range(1,len(acc)+1)
pyplot.plot(epochs,loss, 'bo',label='Training_loss')
pyplot.plot(epochs,val_loss,'b',label='Validation loss')
pyplot.title('Training and validation loss')
pyplot.xlabel('Epochs')
pyplot.ylabel('Loss')
pyplot.legend()
pyplot.show()
In [16]: pyplot.clf()
acc_values=savemodel.history['acc']
val_acc_values=savemodel.history['val_acc']
pyplot.plot(epochs,acc_values, 'bo',label='Training_accuracy')
pyplot.plot(epochs,val_acc_values,'b',label='Validation accuracy')
pyplot.title('Training and validation accuracy')
pyplot.xlabel('Epochs')
pyplot.ylabel('Accuracy')
pyplot.show()
Download