NAME:- JYOTI MONDAL
DATE:- 28/07/2024
keyboard_arrow_down Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import LabelEncoder
!pip install unidecode
!pip install openpyxl
import pandas as pd
from pandas import read_excel
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode
import os
from time import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Embedding, Flatten, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import SGD,Adam
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
import h5py
import csv
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
Collecting unidecode
Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 235.5/235.5 kB 8.3 MB/s eta 0:00:00
Installing collected packages: unidecode
Successfully installed unidecode-1.3.8
Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (3.1.5)
Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl) (1.1.0)
dataset_cricket = pd.read_excel(io="Cricket.xlsx", sheet_name="Sheet1", engine='openpyxl')
dataset_cricket.head()
Source
Date
Text
Category
Polarity
0
BBC Bangla
2018-06-03 00:00:00
জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...
other
positive
1
BBC Bangla
2018-06-04 00:00:00
জয় বাংলা কাপ! তাও আবার স্বাধীনতার মাস মার্চে। ...
team
positive
2
BBC Bangla
15/6/2017
বাংলাদেশের পরে ভারতের সাপর্ট ই করি ?
team
positive
3
BBC Bangla
15/6/2020
সৌম্যকে বাদ দেওয়া হোক
batting
negative
4
BBC Bangla
27/1/2018
প্রথমটি হচ্ছে, কোচ অত:পর সাকিব,সাকিব আর সাকিবর...
team
positive
Next steps:
Generate code with dataset_cricket
sns.countplot(dataset_cricket['Polarity'])
<Axes: xlabel='count', ylabel='Polarity'>
toggle_off
View recommended plots
New interactive sheet
def text_to_word_list(text):
text = text.split()
return text
def replace_strings(text):
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F"
u"\U0001F300-\U0001F5FF"
u"\U0001F680-\U0001F6FF"
u"\U0001F1E0-\U0001F1FF"
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
u"\u00C0-\u017F"
u"\u2000-\u206F"
"]+", flags=re.UNICODE)
english_pattern=re.compile('[a-zA-Z0-9]+', flags=re.I)
text=emoji_pattern.sub(r'', text)
text=english_pattern.sub(r'', text)
return text
def remove_punctuations(my_str):
punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌�
৷
'''
no_punct = ""
for char in my_str:
if char not in punctuations:
no_punct = no_punct + char
return no_punct
def joining(text):
out=' '.join(text)
return out
def preprocessing(text):
out=remove_punctuations(replace_strings(text))
return out
dataset_cricket['Text'] = dataset_cricket.Text.apply(lambda x: preprocessing(str(x)))
dataset_cricket.reset_index(drop=True, inplace=True)
enc = LabelEncoder()
dataset_cricket['Polarity'] = enc.fit_transform(dataset_cricket['Polarity'])
train1, test1 = train_test_split(dataset_cricket,random_state=69, test_size=0.2)
training_sentences_cricket = []
testing_sentences_cricket = []
train_sentences_cricket=train1['Text'].values
train_labels_cricket=train1['Polarity'].values
for i in range(train_sentences_cricket.shape[0]):
x=str(train_sentences_cricket[i])
training_sentences_cricket.append(x)
training_sentences_cricket=np.array(training_sentences_cricket)
test_sentences_cricket=test1['Text'].values
test_labels_cricket=test1['Polarity'].values
for i in range(test_sentences_cricket.shape[0]):
x=str(test_sentences_cricket[i])
testing_sentences_cricket.append(x)
testing_sentences_cricket=np.array(testing_sentences_cricket)
train_labels_cricket=tf.keras.utils.to_categorical(train_labels_cricket)
test_labels_cricket=tf.keras.utils.to_categorical(test_labels_cricket)
print("Training Set Length: "+str(len(train1)))
print("Testing Set Length: "+str(len(test1)))
print("training_sentences shape: "+str(training_sentences_cricket.shape))
print("testing_sentences shape: "+str(testing_sentences_cricket.shape))
print("train_labels shape: "+str(train_labels_cricket.shape))
print("test_labels shape: "+str(test_labels_cricket.shape))
Training Set Length: 2383
Testing Set Length: 596
training_sentences shape: (2383,)
testing_sentences shape: (596,)
train_labels shape: (2383, 3)
test_labels shape: (596, 3)
print(training_sentences_cricket[1])
print(train_labels_cricket[0])
নবাগত প্লেয়ার হয়েও চাপের মুখে দলের বিপদে চমৎকার একটি
[1. 0. 0.]
রানের ইনিংস খেলেছেন লিটন
vocab_size = len(training_sentences_cricket)+1
embedding_dim = 100
max_length = 100
trunc_type='post'
oov_tok = "<OOV>"
print(training_sentences_cricket.shape)
print(train_labels_cricket.shape)
(2383,)
(2383, 3)
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences_cricket)
word_index = tokenizer.word_index
print(len(word_index))
print("Word index length:"+str(len(tokenizer.word_index)))
sequences = tokenizer.texts_to_sequences(training_sentences_cricket)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(testing_sentences_cricket)
testing_padded = pad_sequences(test_sequences,maxlen=max_length)
6414
Word index length:6414
print("Sentence :--> \n")
print(training_sentences_cricket[2]+"\n")
print("Sentence Tokenized and Converted into Sequence :--> \n")
print(str(sequences[2])+"\n")
print("After Padding the Sequence with padding length 100 :--> \n")
print(padded[2])
Sentence :-->
তারা কোথায় যারা বলেছিল হাতু রাসিংহে বনাম বাংলাদেশ এর খেলা
Sentence Tokenized and Converted into Sequence :-->
[70, 1514, 141, 859, 1, 460, 4, 19, 23]
After Padding the Sequence with padding length 100 :-->
[
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
19
0
0
0
0
23]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
70 1514
0
0
0
141
0
0
0
859
0
0
0
1
0
0
0
460
0
0
0
4
print("Padded shape(training): "+str(padded.shape))
print("Padded shape(testing): "+str(testing_padded.shape))
Padded shape(training): (2383, 100)
Padded shape(testing): (596, 100)
def precision(y_true, y_pred):
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
def recall(y_true, y_pred):
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
def f1_score(y_true, y_pred):
pr = precision(y_true, y_pred)
rec = recall(y_true, y_pred)
f1_score = 2 * (pr * rec) / (pr + rec)
return f1_score
# padded
train_labels_cricket
array([[1., 0., 0.],
[0., 0., 1.],
[1., 0., 0.],
...,
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.]], dtype=float32)
detection=Sequential()
detection.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
detection.add(Conv1D(64 ,kernel_size=3))
detection.add(BatchNormalization())
detection.add(Activation('relu'))
detection.add(Dropout(0.1))
detection.add(Conv1D(96,kernel_size=3))
detection.add(BatchNormalization())
detection.add(Activation('relu'))
detection.add(Conv1D(128,kernel_size=3))
detection.add(BatchNormalization())
detection.add(Activation('relu'))
detection.add(Dropout(0.15))
detection.add(Flatten())
detection.add(Dense(128))
detection.add(BatchNormalization())
detection.add(Activation('relu'))
detection.add(Flatten())
detection.add(Dense(64))
detection.add(BatchNormalization())
detection.add(Activation('relu'))
detection.add(Dense(3,activation='sigmoid'))
optimum=Adam(lr=0.00001)
detection.summary()
detection.compile(optimizer=optimum,loss='binary_crossentropy',metrics=['accuracy', precision, recall])
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.Adam.
Model: "sequential"
_________________________________________________________________
Layer (type)
Output Shape
Param #
=================================================================
embedding (Embedding)
(None, 100, 100)
238400
conv1d (Conv1D)
(None, 98, 64)
19264
batch_normalization (Batch
Normalization)
(None, 98, 64)
256
activation (Activation)
(None, 98, 64)
0
dropout (Dropout)
(None, 98, 64)
0
conv1d_1 (Conv1D)
(None, 96, 96)
18528
batch_normalization_1 (Bat
chNormalization)
(None, 96, 96)
384
activation_1 (Activation)
(None, 96, 96)
0
conv1d_2 (Conv1D)
(None, 94, 128)
36992
batch_normalization_2 (Bat
chNormalization)
(None, 94, 128)
512
activation_2 (Activation)
(None, 94, 128)
0
dropout_1 (Dropout)
(None, 94, 128)
0
flatten (Flatten)
(None, 12032)
0
dense (Dense)
(None, 128)
1540224
batch_normalization_3 (Bat
chNormalization)
(None, 128)
512
activation_3 (Activation)
(None, 128)
0
flatten_1 (Flatten)
(None, 128)
0
dense_1 (Dense)
(None, 64)
8256
batch_normalization_4 (Bat
chNormalization)
(None, 64)
256
activation_4 (Activation)
(None, 64)
0
dense_2 (Dense)
(None, 3)
195
=================================================================
Total params: 1863779 (7.11 MB)
Trainable params: 1862819 (7.11 MB)
Non-trainable params: 960 (3.75 KB)
_________________________________________________________________
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(padded, train_labels_cricket, test_size=0.15, shuffle= True)
model_cricket = detection.fit(x_train, y_train,shuffle=True,epochs=100,batch_size=5,validation_data=(x_valid,y_valid))
405/405 [==============================] - 19s 46ms/step - loss: 0.0365 - accuracy: 0.9847 - precision: 0.9862 - recall: 0.9842 - val_loss: 1.3391 - val_accuracy: 0.6872 - val_precision: 0.6740 - val_recall: 0.6907
Epoch 74/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0262 - accuracy: 0.9837 - precision: 0.9847 - recall: 0.9812 - val_loss: 1.1010 - val_accuracy: 0.6620 - val_precision: 0.6657 - val_recall: 0.6435
Epoch 75/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0517 - accuracy: 0.9783 - precision: 0.9800 - recall: 0.9763 - val_loss: 0.9544 - val_accuracy: 0.7011 - val_precision: 0.7021 - val_recall: 0.7046
Epoch 76/100
405/405 [==============================] - 20s 48ms/step - loss: 0.0519 - accuracy: 0.9758 - precision: 0.9764 - recall: 0.9723 - val_loss: 1.0381 - val_accuracy: 0.6704 - val_precision: 0.6694 - val_recall: 0.6574
Epoch 77/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0301 - accuracy: 0.9812 - precision: 0.9787 - recall: 0.9778 - val_loss: 0.9615 - val_accuracy: 0.6899 - val_precision: 0.7000 - val_recall: 0.6796
Epoch 78/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0214 - accuracy: 0.9852 - precision: 0.9867 - recall: 0.9832 - val_loss: 1.0184 - val_accuracy: 0.6955 - val_precision: 0.6836 - val_recall: 0.6954
Epoch 79/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0218 - accuracy: 0.9817 - precision: 0.9842 - recall: 0.9807 - val_loss: 1.1837 - val_accuracy: 0.6983 - val_precision: 0.6921 - val_recall: 0.6907
Epoch 80/100
405/405 [==============================] - 19s 46ms/step - loss: 0.0302 - accuracy: 0.9822 - precision: 0.9871 - recall: 0.9798 - val_loss: 0.9007 - val_accuracy: 0.7095 - val_precision: 0.7025 - val_recall: 0.7037
Epoch 81/100
405/405 [==============================] - 18s 45ms/step - loss: 0.0249 - accuracy: 0.9822 - precision: 0.9843 - recall: 0.9827 - val_loss: 1.1817 - val_accuracy: 0.6844 - val_precision: 0.6852 - val_recall: 0.6769
Epoch 82/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0187 - accuracy: 0.9852 - precision: 0.9869 - recall: 0.9827 - val_loss: 0.9894 - val_accuracy: 0.7123 - val_precision: 0.6988 - val_recall: 0.7130
Epoch 83/100
405/405 [==============================] - 19s 47ms/step - loss: 0.0200 - accuracy: 0.9832 - precision: 0.9845 - recall: 0.9827 - val_loss: 1.8195 - val_accuracy: 0.7151 - val_precision: 0.7219 - val_recall: 0.7472
Epoch 84/100
405/405 [==============================] - 19s 46ms/step - loss: 0.0424 - accuracy: 0.9798 - precision: 0.9801 - recall: 0.9783 - val_loss: 1.0674 - val_accuracy: 0.6844 - val_precision: 0.6850 - val_recall: 0.6741
Epoch 85/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0281 - accuracy: 0.9822 - precision: 0.9830 - recall: 0.9802 - val_loss: 1.0539 - val_accuracy: 0.6955 - val_precision: 0.6850 - val_recall: 0.6796
Epoch 86/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0274 - accuracy: 0.9837 - precision: 0.9862 - recall: 0.9807 - val_loss: 0.9816 - val_accuracy: 0.7123 - val_precision: 0.7072 - val_recall: 0.7102
Epoch 87/100
405/405 [==============================] - 18s 45ms/step - loss: 0.0219 - accuracy: 0.9832 - precision: 0.9835 - recall: 0.9807 - val_loss: 1.6062 - val_accuracy: 0.6034 - val_precision: 0.6125 - val_recall: 0.5926
Epoch 88/100
405/405 [==============================] - 19s 46ms/step - loss: 0.0256 - accuracy: 0.9807 - precision: 0.9835 - recall: 0.9817 - val_loss: 0.9959 - val_accuracy: 0.6872 - val_precision: 0.6810 - val_recall: 0.6787
Epoch 89/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0247 - accuracy: 0.9827 - precision: 0.9827 - recall: 0.9842 - val_loss: 1.0585 - val_accuracy: 0.6983 - val_precision: 0.7109 - val_recall: 0.7148
Epoch 90/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0192 - accuracy: 0.9862 - precision: 0.9859 - recall: 0.9837 - val_loss: 1.0858 - val_accuracy: 0.6899 - val_precision: 0.6919 - val_recall: 0.6898
Epoch 91/100
405/405 [==============================] - 19s 46ms/step - loss: 0.0278 - accuracy: 0.9847 - precision: 0.9872 - recall: 0.9827 - val_loss: 1.2611 - val_accuracy: 0.6536 - val_precision: 0.6481 - val_recall: 0.6926
Epoch 92/100
405/405 [==============================] - 19s 47ms/step - loss: 0.0201 - accuracy: 0.9857 - precision: 0.9872 - recall: 0.9847 - val_loss: 1.0520 - val_accuracy: 0.6955 - val_precision: 0.6935 - val_recall: 0.6954
Epoch 93/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0274 - accuracy: 0.9822 - precision: 0.9834 - recall: 0.9807 - val_loss: 1.0859 - val_accuracy: 0.6788 - val_precision: 0.6887 - val_recall: 0.6907
Epoch 94/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0413 - accuracy: 0.9832 - precision: 0.9828 - recall: 0.9817 - val_loss: 1.1549 - val_accuracy: 0.5838 - val_precision: 0.5856 - val_recall: 0.5056
Epoch 95/100
405/405 [==============================] - 18s 45ms/step - loss: 0.0233 - accuracy: 0.9832 - precision: 0.9830 - recall: 0.9832 - val_loss: 1.5674 - val_accuracy: 0.7207 - val_precision: 0.6996 - val_recall: 0.7426
Epoch 96/100
405/405 [==============================] - 18s 46ms/step - loss: 0.0189 - accuracy: 0.9842 - precision: 0.9843 - recall: 0.9832 - val_loss: 1.0338 - val_accuracy: 0.6899 - val_precision: 0.7065 - val_recall: 0.6741
Epoch 97/100
405/405 [==============================] - 18s 45ms/step - loss: 0.0319 - accuracy: 0.9822 - precision: 0.9834 - recall: 0.9807 - val_loss: 1.1044 - val_accuracy: 0.6872 - val_precision: 0.6967 - val_recall: 0.6824
Epoch 98/100
405/405 [==============================] - 18s 44ms/step - loss: 0.0254 - accuracy: 0.9802 - precision: 0.9816 - recall: 0.9812 - val_loss: 1.0166 - val_accuracy: 0.7179 - val_precision: 0.7160 - val_recall: 0.7176
Epoch 99/100
405/405 [==============================] - 19s 46ms/step - loss: 0.0325 - accuracy: 0.9837 - precision: 0.9834 - recall: 0.9822 - val_loss: 1.1085 - val_accuracy: 0.6872 - val_precision: 0.6921 - val_recall: 0.6907
Epoch 100/100
405/405 [==============================] - 18s 45ms/step - loss: 0.0235 - accuracy: 0.9827 - precision: 0.9882 - recall: 0.9827 - val loss: 1.0105 - val accuracy: 0.6816 - val precision: 0.6840 - val recall: 0.6657
detection.evaluate(x=testing_padded,y=test_labels_cricket,verbose=1)
19/19 [==============================] - 0s 15ms/step - loss: 1.0186 - accuracy: 0.6762 - precision: 0.6783 - recall: 0.6586
[1.018591046333313, 0.676174521446228, 0.6782708168029785, 0.6585526466369629]
plt.plot(model_cricket.history['loss'], label='loss_train')
plt.plot(model_cricket.history['val_loss'], label='loss_val')
plt.legend()
plt.title('Train Val_Loss in Proposed Neural Network')
plt.show()
plt.savefig('LossVal_Loss')
<Figure size 640x480 with 0 Axes>
plt.plot(model_cricket.history['accuracy'], label='train acc')
plt.plot(model_cricket.history['val_accuracy'], label='val acc')
plt.legend()
plt.title('TrainVal_Acc in Proposed Neural Network')
plt.show()
plt.savefig('AccVal_Acc')
0
You can add this document to your study collection(s)
Sign in Available only to authorized usersYou can add this document to your saved list
Sign in Available only to authorized users(For complaints, use another form )