# Naive Bayes Classifier - Movie Review 

In [26]:
import pandas as pd
df_reviews=pd.read_csv("rt_reviews.csv",encoding="iso-8859-1")
df_reviews=df_reviews.dropna()
df_reviews.head(5)

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [27]:
df_reviews = df_reviews.sample(frac=1)
 
total_rows = df_reviews.shape[0]
train_size = int(total_rows*0.70)
 
# Spliting data into test and train    

train = df_reviews[0:train_size]
test_set = df_reviews[train_size:]

# train into train set and validation set

train= train.sample(frac=1)
train_size = int(0.8*len(train))

train_set = train[:train_size]
val_set = train[train_size:]

In [28]:
print('Train size -',train_set.shape)
print('Val size - ',val_set.shape)
print('Test size - ',test_set.shape)

Train size - (268800, 2)
Val size -  (67200, 2)
Test size -  (144000, 2)


In [29]:
def textProcessing(text):
    text=text.str.replace('\W', ' ') # to remove punctuations from string
    return text.str.lower()  #to convert string into lowercase format
train_set['Review']=textProcessing(train_set['Review'])
train_set.head()

  text=text.str.replace('\W', ' ') # to remove punctuations from string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['Review']=textProcessing(train_set['Review'])


Unnamed: 0,Freshness,Review
109604,fresh,with a story that ups the stakes and brings i...
272303,rotten,a typical sundance type comedy that confuses ...
44092,fresh,barbershop 2 achieves what most sequels can o...
309490,rotten,unfortunately donner has run out of ammuniti...
260348,rotten,nair s least persuasive film in part because...


In [30]:
train_set['Review'] = train_set['Review'].astype(str).str.split()

vocabulary = []
for review in train_set['Review']:
   for word in review:
    if len(word)>=3:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))
len(vocabulary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['Review'] = train_set['Review'].astype(str).str.split()


73136

In [None]:
from scipy.sparse import lil_matrix

# Initializing the sparse matrix of shape num_reviews x num_words
word_counts = lil_matrix((len(train_set['Review']), len(vocabulary)), dtype=int)

# Updating the word counts for each review from training set
for index, review in enumerate(train_set['Review']):
    for word in review:
     if len(word)>=3:
        word_index = vocabulary.index(word)
        word_counts[index, word_index] += 1

In [None]:

# Converting the sparse matrix to a DataFrame
word_counts_df = pd.DataFrame(word_counts.todense(), columns=vocabulary)

# Adding the Review and Freshness columns
word_counts_df['Review'] = train_set['Review']
word_counts_df['Freshness'] = train_set['Freshness']

cols = list(word_counts_df.columns)
cols.remove('Review')
cols.remove('Freshness')
word_counts_df = word_counts_df[['Review', 'Freshness'] + cols]
word_counts_df.head()

In [None]:
# Isolating rotten and fresh review 
rotten_review = word_counts_df[word_counts_df['Freshness'] == 'rotten']
fresh_review = word_counts_df[word_counts_df['Freshness'] == 'fresh']

# P(rotten) and P(fresh)
p_rotten = len(rotten_review) / len(word_counts_df)
p_fresh = len(fresh_review) / len(word_counts_df)

# N_rotten
n_words_per_rotten_review = rotten_review['Review'].apply(len)
n_rotten = n_words_per_rotten_review.sum()

# N_fresh
n_words_per_fresh_review = fresh_review['Review'].apply(len)
n_fresh = n_words_per_fresh_review.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [None]:
# Initiate parameters
parameters_rotten = {unique_word:0 for unique_word in vocabulary}
parameters_fresh = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_rotten = rotten_review[word].sum() # spam_messages already defined
   p_word_given_rotten = (n_word_given_rotten + alpha) / (n_rotten + alpha*n_vocabulary)
   parameters_rotten[word] = p_word_given_rotten

   n_word_given_fresh = fresh_review[word].sum() # ham_messages already defined
   p_word_given_fresh = (n_word_given_fresh + alpha) / (n_fresh + alpha*n_vocabulary)
   parameters_fresh[word] = p_word_given_fresh

In [None]:
import re

def classify_test_set(review):
   '''
   review: a string
   '''

   review = re.sub('\W', ' ', review)
   review = review.lower().split()

   p_rotten_given_review = p_rotten
   p_fresh_given_review = p_fresh

   for word in review:
      if word in parameters_rotten:
         p_rotten_given_review *= parameters_rotten[word]

      if word in parameters_fresh: 
         p_fresh_given_review *= parameters_fresh[word]

   print('P(rotten|review):', p_rotten_given_review)
   print('P(fresh|review):', p_fresh_given_review)

   if p_fresh_given_review > p_rotten_given_review:
      return 'fresh'
   elif p_fresh_given_review < p_rotten_given_review:
      return 'rotten'
   else:
      return 'Equal proabilities, have a human classify this!'


In [None]:
test_set['predicted'] = test_set['Review'].apply(classify_test_set)
test_set.head()

In [None]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['Freshness'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Reference
https://www.geeksforgeeks.org/how-to-split-data-into-training-and-testing-in-python-without-sklearn/
https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html