In this project, we work with a text dataset: Plato's "The Republic." We first preprocess the data into a more usable format, then train and develop a model on the resulting sequence data. We use a deep learning Long Short-Term Memory (LSTM) model, a recurrent neural network architecture that performs very well on sequence-based datasets. Its feedback structure helps the model remember the sequence of inputs seen so far, and how the output changes with that context, in order to predict the next word in a sentence.
import string
import re
import json
import pickle
import pkg_resources
#Provides a set of diverse algorithms for NLP and for Computational Linguistics.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
from numpy import array
from pickle import dump, load
from random import randint
#Keras is a high-level library embedded in the TensorFlow library.
#TensorFlow provides both high-level and low-level APIs for working with deep learning models.
#Using the packages required to handle string data.
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
# load doc into memory
def load_doc(filename):
    # open the file as read only, read all text, and close it automatically
    with open(filename, 'r') as file:
        text = file.read()
    return text
# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    with open(filename, 'w') as file:
        file.write(data)
We develop the tokenize_word function. First we need to understand tokenization: the process of breaking the text dataset down and turning each word in it into an individual token. In machine learning and deep learning architectures, we cannot feed string data types into the models, so we must convert the string data into a numeric data type; tokenization is the first step of that conversion. Once the text is numbers, we can perform computation on them. Mapping words to these numeric vectors is called word vectorization, or word embedding.
Example: tokenization converts "I am going today" into the list ["I", "am", "going", "today"], and that list is then encoded as vectors representing our words. In short, we are creating numerical data from the text data in our dataset.
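The idea can be shown with a minimal pure-Python sketch (a simple whitespace split and a hand-built vocabulary stand in for the Keras Tokenizer used later in this notebook):

```python
# Minimal sketch of tokenization and integer encoding.
sentence = "I am going today"
tokens = sentence.lower().split()
# Build a word-to-index vocabulary; index 0 is conventionally reserved for padding.
word_index = {word: i + 1 for i, word in enumerate(dict.fromkeys(tokens))}
encoded = [word_index[w] for w in tokens]
print(tokens)   # ['i', 'am', 'going', 'today']
print(encoded)  # [1, 2, 3, 4]
```

The Keras Tokenizer does the same thing at corpus scale, assigning lower indices to more frequent words.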
def tokenize_word(sentences):
    """
    Tokenize sentences into tokens (words)
    Args:
        sentences: List of strings
    Returns:
        List of lists of tokens
    """
    print("Starting Cleaning Process")
    tokenized_sentences = []
    for sentence in tqdm(sentences):
        sentence = cleanhtml(sentence)
        sentence = _replace_urls(sentence)
        sentence = remove_email(sentence)
        # Replace anything that is not an upper- or lower-case letter with a space
        sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
        # Convert to lowercase letters
        sentence = sentence.lower()
        sentence = misc(sentence)
        # tokenized = nltk.word_tokenize(sentence)
        # append the cleaned sentence to the list
        # tokenized_sentences.append(tokenized)
        tokenized_sentences.append(sentence)
    return tokenized_sentences
#We use the particular functions below to clean the string list.
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

#Function replaces and removes URLs from the dataset.
def _replace_urls(data):
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)
    return data

#Function removes emails
def remove_email(data):
    # Remove Emails
    data = re.sub(r'\S*@\S*\s?', '', data)
    return data

#Function removes other random data not needed
def misc(data):
    # Collapse new line characters and extra whitespace
    data = re.sub(r'\s+', ' ', data)
    # Remove distracting single quotes
    data = re.sub(r"\'", "", data)
    # Remove leftover runs of 'w' characters (e.g. from stripped URLs)
    data = re.sub(r"ww+", "", data)
    # Removing roman numerals:
    MAYBE_ROMAN = re.compile(r'(\b[MDCLXVI]+\b)(\.)?', re.I)
    data = re.sub(MAYBE_ROMAN, "", data)
    return data
#This function removes sentences with 5 or fewer tokens.
#Sentences that short are too little context for our LSTM model to be useful.
def littleCleaning(sentences):
    print("Starting cleaning Process")
    ret_list = []
    for sentence in sentences:
        words = sentence.split(" ")
        if len(words) > 5:
            ret_list.append(sentence)
    return ret_list
#Download the Natural Language Processing with Python package that is needed
nltk.download('wordnet')
nltk.download('punkt')
[nltk_data] Downloading package wordnet to
[nltk_data] C:\Users\PC-8783213\AppData\Roaming\nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data] C:\Users\PC-8783213\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
True
path = 'Dataset/republic.txt'
text = open(path).read().lower()
print('length of the corpus is:', len(text))
length of the corpus is: 1174387
# Converting the data into lists.
# Splitting the text into sentence strings using the split function
data_list = text.split(".")
data_list[:20]
['the project gutenberg ebook of the republic, by plato\n\nthis ebook is for the use of anyone anywhere in the united states and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever',
' you may copy it, give it away or re-use it under the terms\nof the project gutenberg license included with this ebook or online at\nwww',
'gutenberg',
'org',
' if you are not located in the united states, you\nwill have to check the laws of the country where you are located before\nusing this ebook',
'\n\ntitle: the republic\n\nauthor: plato\n\ntranslator: b',
' jowett\n\nrelease date: october, 1998 [ebook #1497]\n[most recently updated: september 11, 2021]\n\nlanguage: english\n\n\nproduced by: sue asscher and david widger\n\n*** start of the project gutenberg ebook the republic ***\n\n\n\n\nthe republic\n\nby plato\n\ntranslated by benjamin jowett\n\nnote: see also “the republic” by plato, jowett, ebook #150\n\n\ncontents\n\n introduction and analysis',
'\n the republic',
'\n persons of the dialogue',
'\n book i',
'\n book ii',
'\n book iii',
'\n book iv',
'\n book v',
'\n book vi',
'\n book vii',
'\n book viii',
'\n book ix',
'\n book x',
'\n\n\n\n\n introduction and analysis']
pro_sentences = []
# Function to clean up data
def normalization_pipeline(sentences):
    print("Starting Normalization Process")
    sentences = tokenize_word(sentences)
    sentences = littleCleaning(sentences)
    print("Normalization Process Finished")
    return sentences
pro_sentences = normalization_pipeline(data_list)
pro_sentences[: 5]
Starting Normalization Process
Starting Cleaning Process
Starting cleaning Process
Normalization Process Finished
['the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever',
' you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at ',
' if you are not located in the united states you will have to check the laws of the country where you are located before using this ebook',
' title the republic author plato translator b',
' jowett release date october ebook most recently updated september language english produced by sue asscher and david widger start of the project gutenberg ebook the republic the republic by plato translated by benjamin jowett note see also the republic by plato jowett ebook contents introduction and analysis']
#After the cleanup process we have 6,309 sentences, compared to 7,012 sentences before cleaning
len(pro_sentences)
6309
We only use 700 of the 6,309 sentences that remain after the cleaning process. Analyzing the full set did not improve the model and took up too many computing resources, and 700 sentences of data are enough to train the model.
# Structuring the text into a paragraph:
# We are only using 700 sentences out of 6,309 sentences.
dataText = "".join(pro_sentences[: 700])
dataText[: 200]
# clean document
tokens = clean_doc(dataText)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'republic', 'by', 'plato', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', 'title', 'the', 'republic', 'author', 'plato', 'translator', 'b', 'jowett', 'release', 'date', 'october', 'ebook', 'most', 'recently', 'updated', 'september', 'language', 'english', 'produced', 'by', 'sue', 'asscher', 'and', 'david', 'widger', 'start', 'of', 'the', 'project', 'gutenberg', 'ebook', 'the', 'republic', 'the', 'republic', 'by', 'plato', 'translated', 'by', 'benjamin', 'jowett', 'note', 'see', 'also', 'the', 'republic', 'by', 'plato', 'jowett', 'ebook', 'contents', 'introduction', 'and', 'analysis', 'the', 'republic', 'of', 'plato', 'is', 'the', 'longest', 'of', 'his', 'works', 'with', 'the', 'exception', 'of', 'the', 'laws', 'and', 'is', 'certainly', 'the', 'greatest', 'of', 'them', 'there', 'are', 'nearer', 'approaches', 'to', 'modern', 'metaphysics', 'in', 'the', 'philebus', 'and', 'in', 'the', 'sophist', 'the', 'politicus', 'or', 'statesman', 'is', 'more', 'ideal', 'the', 'form', 'and', 'institutions', 'of', 'the', 'state', 'are', 'more', 'clearly', 'drawn']
Total Tokens: 21735
Unique Tokens: 3548
Here we use a for loop to organize our tokens into sequences that form the model's input data and target variable. Each sequence is 51 words long: the first 50 words are the input (the sample) for the model, and the 51st word is the target the model learns to predict. In other words, X = the 50 input words and y = the 51st word.
# organize into sequences of tokens to create our input data
# Creating lists of 51 words each:
# the first 50 words will be the input and the 51st word will be the target
# The 51st word is the word we try to predict
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))
Total Sequences: 21684
# save sequences to file
out_filename = 'Dataset/republic_sequences.txt'
save_doc(sequences, out_filename)
# load
in_filename = 'Dataset/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# Transforming the regular texts into numerical sequences, i.e. lists of integers
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size, meaning the number of unique words
vocab_size = len(tokenizer.word_index) + 1
# Separate into input and output
# Convert the sequences into an array to build our X and y variables
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
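As a sanity check, the input/output slicing above can be illustrated on a toy array (the numbers below are hypothetical encoded word indices, not values from the actual corpus):

```python
import numpy as np

# Three hypothetical encoded sequences of length 4;
# the last token of each row is the prediction target.
toy = np.array([[5, 6, 7, 8],
                [6, 7, 8, 9],
                [7, 8, 9, 10]])
X_toy, y_toy = toy[:, :-1], toy[:, -1]
print(X_toy.shape)  # (3, 3) -- inputs are all but the last token
print(y_toy)        # [ 8  9 10] -- the word each sequence should predict
```

In the real data the rows are length 51, so X has 50 columns and y holds the 51st word of each sequence before being one-hot encoded by to_categorical.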
Here is the code section where we define and train our model. First, we define the model and see how it does on our dataset. We use an LSTM because its feedback connections help it retain the entire input sequence, which improves the model's accuracy and predictive power. LSTM models are good at processing datasets with many sequences, and the recurrent structure lets information from the earliest inputs carry through to the output layer.

# define model
model = Sequential()
#The Embedding layer uses vocab_size to learn a 50-dimensional vector for each word
model.add(Embedding(vocab_size, 50, input_length=seq_length))
#Stacking two LSTM layers with 50 units each over the 50-token input
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
#The model classifies the input by choosing among the unique words in the vocabulary;
#the chosen word is the one used as our model's prediction.
#Dense layer to condense the LSTM output for the 50-token input
model.add(Dense(50, activation='relu'))
#Dense output layer over vocab_size with the softmax activation, which is used for classification
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
batch_size=128
epochs=50
model.fit(X, y, batch_size=batch_size, epochs=epochs)
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 50, 50) 177450
lstm_2 (LSTM) (None, 50, 50) 20200
lstm_3 (LSTM) (None, 50) 20200
dense_2 (Dense) (None, 50) 2550
dense_3 (Dense) (None, 3549) 180999
=================================================================
Total params: 401,399
Trainable params: 401,399
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
170/170 [==============================] - 25s 120ms/step - loss: 6.6283 - accuracy: 0.0831
Epoch 2/50
170/170 [==============================] - 20s 119ms/step - loss: 6.1222 - accuracy: 0.0843
Epoch 3/50
170/170 [==============================] - 20s 119ms/step - loss: 5.9787 - accuracy: 0.0904
Epoch 4/50
170/170 [==============================] - 20s 118ms/step - loss: 5.8110 - accuracy: 0.1136
Epoch 5/50
170/170 [==============================] - 20s 119ms/step - loss: 5.6685 - accuracy: 0.1232
Epoch 6/50
170/170 [==============================] - 22s 129ms/step - loss: 5.5670 - accuracy: 0.1287
Epoch 7/50
170/170 [==============================] - 21s 123ms/step - loss: 5.4900 - accuracy: 0.1319
Epoch 8/50
170/170 [==============================] - 21s 121ms/step - loss: 5.4197 - accuracy: 0.1360
Epoch 9/50
170/170 [==============================] - 21s 123ms/step - loss: 5.3558 - accuracy: 0.1394
Epoch 10/50
170/170 [==============================] - 22s 128ms/step - loss: 5.3001 - accuracy: 0.1426
Epoch 11/50
170/170 [==============================] - 22s 128ms/step - loss: 5.2517 - accuracy: 0.1461
Epoch 12/50
170/170 [==============================] - 21s 125ms/step - loss: 5.2090 - accuracy: 0.1487
Epoch 13/50
170/170 [==============================] - 22s 128ms/step - loss: 5.1653 - accuracy: 0.1537
Epoch 14/50
170/170 [==============================] - 21s 126ms/step - loss: 5.1244 - accuracy: 0.1563
Epoch 15/50
170/170 [==============================] - 21s 122ms/step - loss: 5.0810 - accuracy: 0.1600
Epoch 16/50
170/170 [==============================] - 20s 119ms/step - loss: 5.0371 - accuracy: 0.1630
Epoch 17/50
170/170 [==============================] - 20s 120ms/step - loss: 4.9889 - accuracy: 0.1669
Epoch 18/50
170/170 [==============================] - 20s 119ms/step - loss: 4.9458 - accuracy: 0.1706
Epoch 19/50
170/170 [==============================] - 21s 122ms/step - loss: 4.8946 - accuracy: 0.1740
Epoch 20/50
170/170 [==============================] - 21s 121ms/step - loss: 4.8453 - accuracy: 0.1780
Epoch 21/50
170/170 [==============================] - 20s 119ms/step - loss: 4.7958 - accuracy: 0.1814
Epoch 22/50
170/170 [==============================] - 20s 119ms/step - loss: 4.7466 - accuracy: 0.1833
Epoch 23/50
170/170 [==============================] - 20s 119ms/step - loss: 4.6949 - accuracy: 0.1863
Epoch 24/50
170/170 [==============================] - 20s 120ms/step - loss: 4.6473 - accuracy: 0.1906
Epoch 25/50
170/170 [==============================] - 21s 121ms/step - loss: 4.5978 - accuracy: 0.1916
Epoch 26/50
170/170 [==============================] - 21s 121ms/step - loss: 4.5498 - accuracy: 0.1951
Epoch 27/50
170/170 [==============================] - 20s 120ms/step - loss: 4.5037 - accuracy: 0.1981
Epoch 28/50
170/170 [==============================] - 20s 119ms/step - loss: 4.4519 - accuracy: 0.2008
Epoch 29/50
170/170 [==============================] - 20s 119ms/step - loss: 4.4069 - accuracy: 0.2031
Epoch 30/50
170/170 [==============================] - 20s 118ms/step - loss: 4.3608 - accuracy: 0.2070
Epoch 31/50
170/170 [==============================] - 20s 120ms/step - loss: 4.3129 - accuracy: 0.2091
Epoch 32/50
170/170 [==============================] - 20s 118ms/step - loss: 4.2663 - accuracy: 0.2122
Epoch 33/50
170/170 [==============================] - 20s 119ms/step - loss: 4.2238 - accuracy: 0.2161
Epoch 34/50
170/170 [==============================] - 20s 118ms/step - loss: 4.1812 - accuracy: 0.2183
Epoch 35/50
170/170 [==============================] - 20s 120ms/step - loss: 4.1387 - accuracy: 0.2209
Epoch 36/50
170/170 [==============================] - 20s 118ms/step - loss: 4.0970 - accuracy: 0.2239
Epoch 37/50
170/170 [==============================] - 20s 120ms/step - loss: 4.0553 - accuracy: 0.2276
Epoch 38/50
170/170 [==============================] - 20s 120ms/step - loss: 4.0153 - accuracy: 0.2285
Epoch 39/50
170/170 [==============================] - 20s 119ms/step - loss: 3.9730 - accuracy: 0.2336
Epoch 40/50
170/170 [==============================] - 20s 120ms/step - loss: 3.9365 - accuracy: 0.2352
Epoch 41/50
170/170 [==============================] - 20s 118ms/step - loss: 3.8928 - accuracy: 0.2390
Epoch 42/50
170/170 [==============================] - 20s 119ms/step - loss: 3.8526 - accuracy: 0.2400
Epoch 43/50
170/170 [==============================] - 21s 122ms/step - loss: 3.8142 - accuracy: 0.2449
Epoch 44/50
170/170 [==============================] - 20s 121ms/step - loss: 3.7788 - accuracy: 0.2471
Epoch 45/50
170/170 [==============================] - 21s 125ms/step - loss: 3.7364 - accuracy: 0.2492
Epoch 46/50
170/170 [==============================] - 21s 121ms/step - loss: 3.7004 - accuracy: 0.2534
Epoch 47/50
170/170 [==============================] - 20s 120ms/step - loss: 3.6622 - accuracy: 0.2575
Epoch 48/50
170/170 [==============================] - 20s 119ms/step - loss: 3.6262 - accuracy: 0.2591
Epoch 49/50
170/170 [==============================] - 20s 119ms/step - loss: 3.5933 - accuracy: 0.2647
Epoch 50/50
170/170 [==============================] - 20s 120ms/step - loss: 3.5530 - accuracy: 0.2711
<keras.callbacks.History at 0x7f342e611a50>
After training our model on the 50-word token sequences, our accuracy only reached 27.11 percent at epoch 50. That is not much, but 50 epochs is also not a long training run. As the training log shows, the model's accuracy increases and its loss decreases as the epochs go on, so more epochs and a larger training sample would likely have given better accuracy.
I took the data from the beginning of the corpus and used only 700 sentences versus the roughly 6,000 available. Using more data might also improve the model; however, there was a limitation due to the computing resources I had, and I am not sure an increase in the sample size would dramatically change the model's accuracy. Acceptable precision depends on many factors, and for this project an accuracy of about 27% is acceptable.
# save the model to file
model.save("Dataset/DataScienceModels/nexWordPredict/nextWord.h5")
# save the tokenizer
dump(tokenizer, open('Dataset/DataScienceModels/tokenizer.pkl', 'wb'))
# generate a sequence from a language model to test our model
import numpy as np
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate/pad sequences to keep the input at seq_length words
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        # yhat = model.predict_classes(encoded, verbose=0)  # removed in newer Keras
        predict_x = model.predict(encoded)
        yhat = np.argmax(predict_x, axis=1)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
# load cleaned text sequences
in_filename = 'Dataset/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
print(len(lines))
print(lines[0])
21684
the project gutenberg ebook of the republic by plato this ebook is for the use of anyone anywhere in the united states and most other parts of the world at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the
# load the model
model = load_model("Dataset/DataScienceModels/nexWordPredict/nextWord.h5")
# load the tokenizer
tokenizer = load(open('Dataset/DataScienceModels/tokenizer.pkl', 'rb'))
# select a seed text at random (randint is inclusive on both ends)
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 12)
print(generated)
after ages which few great writers have ever been able to anticipate for themselves they do not perceive the want of connexion in their own writings or the gaps in their systems which are visible enough to those who come after them in the beginnings of literature and philosophy amid the
best of good the republic is not not to be the embodiment

We load the model file and the tokenizer file that were developed in our analysis. We pick a random sentence from our file and assign it to the seed_text variable; as shown above, this gives the model a 50-word sentence to analyze.

Looking at the predicted output in the generated text sample above, we notice that the output repeats the word "not" twice. Our model predicts words based on what it learned, but we also have to consider that the current model is only about 27 percent accurate, which fits the output we got above. Keep in mind that we trained for only 50 epochs; as the training epochs increase, the model's accuracy increases and the loss decreases, so more epochs and a larger training sample would have given better accuracy.
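One common way to reduce this kind of repetition (not used in this notebook) is to sample the next word from the softmax probabilities with a temperature, instead of always taking the argmax. A minimal sketch, assuming `probs` is one row of the `model.predict` output:

```python
import numpy as np

def sample_with_temperature(probs, temperature=0.8, seed=0):
    """Sample a word index from a probability vector instead of argmax.
    temperature < 1 sharpens the distribution; > 1 flattens it."""
    rng = np.random.default_rng(seed)
    # Rescale log-probabilities by the temperature, then re-normalize.
    logits = np.log(np.asarray(probs, dtype=float) + 1e-9) / temperature
    scaled = np.exp(logits - logits.max())
    scaled /= scaled.sum()
    return int(rng.choice(len(scaled), p=scaled))
```

Swapping this in for the np.argmax call in generate_seq would make the generated text less repetitive, at the cost of occasionally odd word choices.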