#Import the bert functions
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py
!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py


import numpy as np # linear algebra
import re, os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import tensorflow as tf
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import datetime
import matplotlib 
from datetime import datetime

# BERT
import optimization
import run_classifier
import tokenization
import tensorflow_hub as hub

WARNING:tensorflow:From C:\Users\PC-8783213\Desktop\Data Science\NLP\BERTHomeWork1\optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC-8783213\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


df = pd.read_csv('C:/Users/PC-8783213/Desktop/Data Science/NLP/Use Bert NLP/BERTNLPHomeWork/Data/HomeworkExercise.csv')


df.head()


df.shape

(300, 3)


def conv_data(S):
    """Derive four 0/1 pattern flags from a transaction descriptor string.

    All checks are "contains" checks on the raw text (they are case-sensitive
    and are not anchored to the start of the string).

    Args:
        S: The descriptor text to inspect.

    Returns:
        A tuple ``(hash, f_val, st_n, ss_s)`` of ints, each 0 or 1:

        * ``hash``  -- 1 if ``S`` contains a ``#`` character.
        * ``f_val`` -- 1 if ``S`` contains ``MCDONALD'S``.
        * ``st_n``  -- 1 if ``S`` contains `` FOOTWEAR``, `` STORE`` or
          `` PETRO`` (each preceded by a single space).
        * ``ss_s``  -- 1 if at least one whitespace-separated token of ``S``
          consists only of digits.
    """
    # 1. '#' anywhere in the descriptor.
    has_hash = int('#' in S)

    # 2. The literal merchant name MCDONALD'S anywhere in the descriptor.
    f_val = int("MCDONALD'S" in S)

    # 3. Any purely numeric token. any() yields False for a descriptor with no
    #    tokens at all, so the flag is always defined (the loop-based version
    #    left it unassigned for empty / whitespace-only input).
    ss_s = int(any(token.isdigit() for token in S.split()))

    # 4. One of the store-type keywords; the leading space means the keyword
    #    must follow another word rather than open the descriptor.
    st_n = int(any(keyword in S for keyword in (" FOOTWEAR", " STORE", " PETRO")))

    return has_hash, f_val, st_n, ss_s


# Give every row exactly one category name. The flags returned by conv_data
# are checked in priority order, so a descriptor matching several patterns
# gets the first category that applies.
# NOTE(review): the digit-token flag (4th value) is mapped to 'PETRO_string',
# while the " PETRO" keyword feeds the 3rd value ('Store_string') -- confirm
# that this naming is intended.
Labels = []
dim = df.shape
for row_pos in range(dim[0]):
    flags = conv_data(df['transaction_descriptor'][row_pos])
    hash_val, ff_val, sts_n, s_s_s = flags

    if hash_val == 1:
        category = 'Hash_String'
    elif ff_val == 1:
        category = 'F_strings'
    elif sts_n == 1:
        category = 'Store_string'
    elif s_s_s == 1:
        category = 'PETRO_string'
    else:
        category = 'Unkown_string'

    Labels.append(category)


Labels=pd.Series(Labels)


Labels.shape

(300,)


for label in np.unique(Labels):
    print(label)

F_strings
Hash_String
PETRO_string
Store_string
Unkown_string


Labels.value_counts().sort_values(ascending=False).plot(kind='bar')

<AxesSubplot:>


le = LabelEncoder()
Label = le.fit_transform(Labels)
print(Label)

[2 1 2 2 1 2 2 3 2 1 1 2 1 1 1 1 2 0 2 1 0 2 3 2 2 1 2 2 1 0 1 2 1 1 2 2 3
 0 1 2 0 1 1 4 1 2 1 2 2 2 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 2 1 1 2 3 3 2 1 1
 0 2 3 2 2 1 1 2 2 1 2 1 1 1 3 1 0 3 2 2 4 1 1 1 1 2 2 3 1 1 0 1 2 2 1 2 1
 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 1 4 3 1 2 3 2 2 0 2 2 3 2 2 1 2 1 2 1
 2 1 1 1 4 2 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 1 1 1 1 2 1 1 1 2 1 1 2 2 1 1 1
 2 2 2 1 1 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 2 1 1 3 1 2 1 1 2 1 2 1 1 1 1 2
 1 4 2 1 1 1 1 1 2 2 2 2 2 1 3 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 2 0 1 1 0 1 2
 0 1 1 2 1 3 1 1 2 1 1 1 1 2 2 2 1 2 1 1 1 1 1 3 3 2 1 3 2 1 1 4 1 2 2 1 0
 1 1 1 2]


# Punctuation that gets replaced by a space during cleaning. Written as a raw
# string so the regex escapes (\[ \] \|) reach the regex engine unchanged
# instead of being parsed as invalid Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is not a digit, a lowercase a-z letter,
# a space, '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a descriptor string before tokenisation.

    The text is lower-cased, every character matched by REPLACE_BY_SPACE_RE is
    turned into a space, and words found in STOPWORDS are dropped. The
    surviving words are re-joined with single spaces.

    text: a string

    return: the cleaned string
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    kept_words = [token for token in spaced.split() if token not in STOPWORDS]
    return ' '.join(kept_words)


df['transaction_descriptor'] = df['transaction_descriptor'].apply(clean_text)
df.head()


def get_split(text1, chunk_size=200, stride=150):
    """Split a text into overlapping windows of whitespace-separated words.

    Windows start at word offsets 0, ``stride``, 2 * ``stride``, ... and each
    holds up to ``chunk_size`` words. Windows are added until one of them
    reaches the end of the text, so no trailing words are lost. (Deriving the
    window count from ``len(words) // 150`` dropped the tail of some long
    texts, e.g. the last 50 words of a 250-word text.)

    Args:
        text1: The text to split.
        chunk_size: Maximum number of words per window (default 200).
        stride: Word offset between consecutive window starts (default 150);
            must satisfy ``0 < stride <= chunk_size`` so windows leave no gaps.

    Returns:
        A list of strings, one per window, each being the window's words joined
        by single spaces. A text with no words yields ``[""]``.

    Raises:
        ValueError: If ``chunk_size`` / ``stride`` violate the constraint above.
    """
    if not 0 < stride <= chunk_size:
        raise ValueError("stride must satisfy 0 < stride <= chunk_size")

    words = text1.split()  # split once instead of once per window
    chunks = []
    start = 0
    while True:
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop as soon as the current window covers the last word.
        if start + chunk_size >= len(words):
            break
        start += stride
    return chunks


df['text_split'] = df['transaction_descriptor'].apply(get_split)
df.head()


BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Build a BERT FullTokenizer from the Hub module's tokenization info.

    Loads the module named by BERT_MODEL_HUB in a fresh graph, evaluates its
    "tokenization_info" signature to obtain the vocabulary file and the
    lower-casing flag, and passes both to ``tokenization.FullTokenizer``.
    """
    with tf.Graph().as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

        return tokenization.FullTokenizer(
            vocab_file=vocab_path,
            do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2607540136.py:8: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2607540136.py:8: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

WARNING:tensorflow:From C:\Users\PC-8783213\Desktop\Data Science\NLP\BERTHomeWork1\tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

WARNING:tensorflow:From C:\Users\PC-8783213\Desktop\Data Science\NLP\BERTHomeWork1\tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


len(tokenizer.vocab.keys())

30522


df['label']=Label
X_train=df['transaction_descriptor']


type(X_train.iloc[0])

str


#Here is what the tokenised sample of the first training set observation looks like
print(tokenizer.tokenize(X_train.iloc[0]))

['do', '##lr', '##tree', '225', '##7', '000', '##22', '##57', '##4', 'ro', '##swell']


X_train=df[['transaction_descriptor','label','text_split']]


X_train.shape

(300, 3)


# Flatten the per-row chunk lists into parallel lists: one entry per chunk.
train_l = []   # chunk text
label_l = []   # label of the row the chunk came from
index_l =[]    # index of the row the chunk came from
for idx,row in X_train.iterrows():
    # Every chunk in 'text_split' inherits the label and index of its row.
    for l in row['text_split']:
        train_l.append(l)
        label_l.append(row['label'])
        index_l.append(idx)
# The three lists are appended to together, so their lengths are equal.
len(train_l), len(label_l), len(index_l)

(300, 300, 300)


train_df = pd.DataFrame({'DATA_COLUMN':train_l, 'LABEL_COLUMN':label_l})
train_df.head()


# Wrap every training row in BERT's run_classifier.InputExample container.
def _row_to_input_example(row):
    """Build a single-sentence InputExample from one train_df row."""
    return run_classifier.InputExample(
        guid=None,  # globally unique ID for bookkeeping, unused in this example
        text_a=row['DATA_COLUMN'],
        text_b=None,
        label=row['LABEL_COLUMN'])


train_InputExamples = train_df.apply(_row_to_input_example, axis=1)


train_InputExamples

0      <run_classifier.InputExample object at 0x00000...
1      <run_classifier.InputExample object at 0x00000...
2      <run_classifier.InputExample object at 0x00000...
3      <run_classifier.InputExample object at 0x00000...
4      <run_classifier.InputExample object at 0x00000...
                             ...                        
295    <run_classifier.InputExample object at 0x00000...
296    <run_classifier.InputExample object at 0x00000...
297    <run_classifier.InputExample object at 0x00000...
298    <run_classifier.InputExample object at 0x00000...
299    <run_classifier.InputExample object at 0x00000...
Length: 300, dtype: object


print("Row 0 - guid of training set : ", train_InputExamples.iloc[0].guid)
print("\n__________\nRow 0 - text_a of training set : ", train_InputExamples.iloc[0].text_a)
print("\n__________\nRow 0 - text_b of training set : ", train_InputExamples.iloc[0].text_b)
print("\n__________\nRow 0 - label of training set : ", train_InputExamples.iloc[0].label)

Row 0 - guid of training set :  None

__________
Row 0 - text_a of training set :  dolrtree 2257 00022574 roswell

__________
Row 0 - text_b of training set :  None

__________
Row 0 - label of training set :  2


BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Return a FullTokenizer configured from the BERT Hub module.

    The module at BERT_MODEL_HUB is instantiated in its own graph; its
    "tokenization_info" signature is run in a session to fetch the vocab file
    and the do_lower_case flag, which are then handed to
    ``tokenization.FullTokenizer``.
    """
    with tf.Graph().as_default():
        hub_module = hub.Module(BERT_MODEL_HUB)
        tok_info = hub_module(signature="tokenization_info", as_dict=True)
        wanted = [tok_info["vocab_file"], tok_info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(wanted)

        return tokenization.FullTokenizer(
            vocab_file=vocab_path,
            do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


len(tokenizer.vocab.keys())

30522


print(tokenizer.tokenize(train_InputExamples.iloc[0].text_a))

['do', '##lr', '##tree', '225', '##7', '000', '##22', '##57', '##4', 'ro', '##swell']


label_list = [x for x in np.unique(df.label)]
label_list

[0, 1, 2, 3, 4]


MAX_SEQ_LENGTH = 200

train_features = run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

WARNING:tensorflow:From C:\Users\PC-8783213\Desktop\Data Science\NLP\BERTHomeWork1\run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

WARNING:tensorflow:From C:\Users\PC-8783213\Desktop\Data Science\NLP\BERTHomeWork1\run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

INFO:tensorflow:Writing example 0 of 300

INFO:tensorflow:Writing example 0 of 300

INFO:tensorflow:*** Example ***

INFO:tensorflow:*** Example ***

INFO:tensorflow:guid: None

INFO:tensorflow:guid: None

INFO:tensorflow:tokens: [CLS] do ##lr ##tree 225 ##7 000 ##22 ##57 ##4 ro ##swell [SEP]

INFO:tensorflow:tokens: [CLS] do ##lr ##tree 225 ##7 000 ##22 ##57 ##4 ro ##swell [SEP]


#Example on first observation in the training set
print("Sentence : ", train_InputExamples.iloc[0].text_a)
print("-"*30)
print("Tokens : ", tokenizer.tokenize(train_InputExamples.iloc[0].text_a))
print("-"*30)
print("Input IDs : ", train_features[0].input_ids)
print("-"*30)
print("Input Masks : ", train_features[0].input_mask)
print("-"*30)
print("Segment IDs : ", train_features[0].segment_ids)

Sentence :  dolrtree 2257 00022574 roswell
------------------------------
Tokens :  ['do', '##lr', '##tree', '225', '##7', '000', '##22', '##57', '##4', 'ro', '##swell']
------------------------------
Input IDs :  [101, 2079, 20974, 13334, 14993, 2581, 2199, 19317, 28311, 2549, 20996, 19228, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
------------------------------
Input Masks :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
------------------------------
Segment IDs :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Build the classification graph: BERT Hub module plus one softmax layer.

    Args:
        is_predicting: True when the graph is built for prediction. In that
            case dropout is skipped and no loss is computed.
        input_ids: Passed to the Hub module's "tokens" signature as `input_ids`.
        input_mask: Passed to the "tokens" signature as `input_mask`.
        segment_ids: Passed to the "tokens" signature as `segment_ids`.
        labels: Integer class ids; one-hot encoded with depth `num_labels`.
        num_labels: Number of output classes.

    Returns:
        When `is_predicting` is true:
            `(predicted_labels, log_probs, pooled_output)` where
            `pooled_output` is the module's pooled output.
        Otherwise:
            `(loss, predicted_labels, log_probs)` where `loss` is the mean
            cross-entropy over the batch.
    """
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    pooled_output = bert_outputs["pooled_output"]
    hidden_size = pooled_output.shape[-1].value

    # Classification layer trained on top of BERT.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_predicting:
            # No dropout at prediction time: randomly zeroing features would
            # make the returned labels / probabilities non-deterministic.
            output_layer = pooled_output
        else:
            # Dropout helps prevent overfitting.
            # NOTE(review): this branch also covers evaluation, because the
            # signature only distinguishes "predicting" from "not predicting".
            output_layer = tf.nn.dropout(pooled_output, keep_prob=0.8)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))

        # If we're predicting, we want predicted labels and the probabilities.
        if is_predicting:
            return (predicted_labels, log_probs, pooled_output)

        # If we're train/eval, compute loss between predicted and actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, predicted_labels, log_probs)


def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Return a `model_fn` closure suitable for `tf.estimator.Estimator`.

    Args:
        num_labels: Number of classes, forwarded to `create_model`.
        learning_rate: Forwarded to `optimization.create_optimizer`.
        num_train_steps: Forwarded to `optimization.create_optimizer`.
        num_warmup_steps: Forwarded to `optimization.create_optimizer`.

    Returns:
        A function `model_fn(features, labels, mode, params)` that returns a
        `tf.estimator.EstimatorSpec` for the given mode.
    """
    
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Build the graph for `mode` and wrap it in an EstimatorSpec.

        Inputs are read from the `features` dict (keys "input_ids",
        "input_mask", "segment_ids", "label_ids"); the `labels` and `params`
        arguments are not used.
        """

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        # Read in every mode, including PREDICT.
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        # TRAIN and EVAL
        if not is_predicting:

            (loss, predicted_labels, log_probs) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            # Built for both TRAIN and EVAL; only the TRAIN spec below uses it.
            train_op = optimization.create_optimizer(
              loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

            # Calculate evaluation metrics.
            # NOTE(review): tf.metrics.true_positives / true_negatives /
            # false_positives / false_negatives appear to be binary-oriented;
            # with more than two classes these counts may not be meaningful
            # -- confirm against the TF documentation.
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                true_pos = tf.metrics.true_positives(
                    label_ids,
                    predicted_labels)
                true_neg = tf.metrics.true_negatives(
                    label_ids,
                    predicted_labels)   
                false_pos = tf.metrics.false_positives(
                    label_ids,
                    predicted_labels)  
                false_neg = tf.metrics.false_negatives(
                    label_ids,
                    predicted_labels)
                return {
                    "eval_accuracy": accuracy,
                    "true_positives": true_pos,
                    "true_negatives": true_neg,
                    "false_positives": false_pos,
                    "false_negatives": false_neg,
                    }

            # Metric ops are created before the mode check, i.e. in TRAIN too.
            eval_metrics = metric_fn(label_ids, predicted_labels)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode,
                  loss=loss,
                  train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                    loss=loss,
                    eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs, output_layer) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            predictions = {
              # Holds the log-softmax output (log-probabilities), despite the key name.
              'probabilities': log_probs,
              'labels': predicted_labels,
              'pooled_output': output_layer
            }
        # Only reached in PREDICT mode: both TRAIN and EVAL return above.
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Return the actual model function in the closure
    return model_fn


# Set the output directory for saving model file
OUTPUT_DIR = '/bert_news_category'

#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}

if DO_DELETE:
    # Clearing is best-effort, but only TensorFlow filesystem errors are
    # tolerated; a bare `except` would also swallow KeyboardInterrupt and
    # unrelated programming errors.
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except tf.errors.NotFoundError:
        # Nothing to delete, e.g. on the very first run.
        pass
    except tf.errors.OpError as err:
        # Keep going with the existing directory, but make the failure visible.
        print('Could not clear {}: {}'.format(OUTPUT_DIR, err))

tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2725379644.py:9: The name tf.gfile.DeleteRecursively is deprecated. Please use tf.io.gfile.rmtree instead.

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2725379644.py:9: The name tf.gfile.DeleteRecursively is deprecated. Please use tf.io.gfile.rmtree instead.

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2725379644.py:14: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

WARNING:tensorflow:From C:\Users\PC-8783213\AppData\Local\Temp\ipykernel_3064\2725379644.py:14: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

***** Model output directory: /bert_news_category *****


BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
# Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 300
SAVE_SUMMARY_STEPS = 100

# Compute train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Estimator run configuration: where checkpoints / summaries are written and
# how often (in steps) each is saved. Built once; the identical second
# construction was redundant.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)


num_train_steps, len(label_list)

(18, 5)


model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': '/bert_news_category', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 300, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff9b9280e10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

INFO:tensorflow:Using config: {'_model_dir': '/bert_news_category', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 300, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff9b9280e10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)


# Training
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.

WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.

INFO:tensorflow:Calling model_fn.

INFO:tensorflow:Calling model_fn.

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

WARNING:tensorflow:From <ipython-input-47-71ef87d40b03>:36: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

WARNING:tensorflow:From <ipython-input-47-71ef87d40b03>:36: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

WARNING:tensorflow:From /content/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.

WARNING:tensorflow:From /content/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.


estimator.evaluate(input_fn=train_input_fn, steps=1)

INFO:tensorflow:Calling model_fn.

INFO:tensorflow:Calling model_fn.

INFO:tensorflow:Saver not created because there are no variables in the graph to restore

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
/usr/local/lib/python3.7/dist-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

INFO:tensorflow:Done calling model_fn.

INFO:tensorflow:Done calling model_fn.

INFO:tensorflow:Starting evaluation at 2022-05-23T05:18:49Z

INFO:tensorflow:Starting evaluation at 2022-05-23T05:18:49Z

INFO:tensorflow:Graph was finalized.

INFO:tensorflow:Graph was finalized.

	transaction_descriptor	store_number	dataset
0	DOLRTREE 2257 00022574 ROSWELL	2257	train
1	AUTOZONE #3547	3547	train
2	TGI FRIDAYS 1485 0000	1485	train
3	BUFFALO WILD WINGS 003	3	train
4	J. CREW #568 0	568	train

	transaction_descriptor	store_number	dataset
0	dolrtree 2257 00022574 roswell	2257	train
1	autozone #3547	3547	train
2	tgi fridays 1485 0000	1485	train
3	buffalo wild wings 003	3	train
4	j. crew #568 0	568	train

	transaction_descriptor	store_number	dataset	text_split
0	dolrtree 2257 00022574 roswell	2257	train	[dolrtree 2257 00022574 roswell]
1	autozone #3547	3547	train	[autozone #3547]
2	tgi fridays 1485 0000	1485	train	[tgi fridays 1485 0000]
3	buffalo wild wings 003	3	train	[buffalo wild wings 003]
4	j. crew #568 0	568	train	[j. crew #568 0]

	DATA_COLUMN	LABEL_COLUMN
0	dolrtree 2257 00022574 roswell	2
1	autozone #3547	1
2	tgi fridays 1485 0000	2
3	buffalo wild wings 003	2
4	j. crew #568 0	1

Jacinto Jimenez

What is the BERT NLP Model?