Skip to content
Related Articles

Related Articles

Classification of Text Documents using the approach of Naïve Bayes

View Discussion
Improve Article
Save Article
  • Difficulty Level : Hard
  • Last Updated : 02 Sep, 2022
View Discussion
Improve Article
Save Article

This article aims to implement document classification using the Naïve Bayes algorithm in Python.

Step-wise Implementation:

Step-1:

  • Input the total Number of Documents from the user.
  • Input the text and class of Each document and split it into a List.
  • Create a 2D array and append each document list into an array.
  • Using a Set data structure, store all the keywords in a list.
  • Input the text to be classified by the user.

Python3




# Step-1: read the training documents and their class labels from stdin.
print('\n *-----* Classification using Naïve bayes *-----* \n')
total_documents = int(input("Enter the Total Number of documents: "))

# doc_class[i] == [list_of_words_in_doc_i, class_label_of_doc_i]
doc_class = []
keywords = []
for i in range(total_documents):
    text = input(f"\nEnter the text of Doc-{i+1} : ").lower()
    # BUG FIX: the original assigned to the reserved word `class`
    # (a SyntaxError) and then appended the undefined name `clas`
    # (a NameError). A single ordinary variable fixes both.
    label = input(f"Enter the class of Doc-{i+1} : ")
    doc_class.append([text.split(), label])
    keywords.extend(text.split())

# De-duplicate the vocabulary and keep it in sorted order for stable output.
keywords = sorted(set(keywords))
to_find = input("\nEnter the Text to classify using Naive Bayes: ").lower().split()

Step-2:

  • Create an empty list named “probability_table”.
  • Count all the occurrences of all the keywords in Each document and store them in the list “probability_table”.

Python3




# Step-2: build the per-document term-count matrix.
# probability_table[i][k] == occurrences of keywords[k] in document i.
probability_table = [[0] * len(keywords) for _ in range(total_documents)]
doc_id = 1
for row, (words, _label) in zip(probability_table, doc_class):
    for k, word in enumerate(keywords):
        # counting a word absent from `words` simply adds 0
        row[k] += words.count(word)
print('\n')

Step-3:

  • Import the prettytable module for displaying the “probability_table” list in a neat tabular format.
  • Give the title of the Table as ‘Probability of Documents’
  • Print the table.

Python3




# Step-3: display the term counts in a table using prettytable.
import prettytable

# Temporarily widen the header row with an ID column and a Class column.
keywords.insert(0, 'Document ID')
keywords.append("Class")

Prob_Table = prettytable.PrettyTable()
Prob_Table.field_names = keywords
Prob_Table.title = 'Probability of Documents'

x = 0
for row in probability_table:
    row.insert(0, x + 1)          # document-id column (1-based)
    row.append(doc_class[x][1])   # class-label column
    Prob_Table.add_row(row)
    x += 1
print(Prob_Table)
print('\n')

# Strip the display-only Document-ID column again (class label stays last).
for row in probability_table:
    del row[0]

Step-4:

  • Count the Number of Total words which belong to ‘+’ class.
  • Count the Number of Total words which belong to ‘-’ class.
  • Count the Number of Total documents which belong to ‘+’ class.
  • Count the Number of Total documents which belong to ‘-’ class.

Python3




# Step-4: per-class totals of words and documents.
totalpluswords = 0
totalnegwords = 0
totalplus = 0
totalneg = 0
# `keywords` still holds 'Document ID' + vocabulary + 'Class', so the true
# vocabulary size excludes those two bookkeeping entries.
vocabulary = len(keywords) - 2
for row in probability_table:
    word_total = sum(row[:-1])   # last entry is the class label, not a count
    if row[-1] == "+":
        totalplus += 1
        totalpluswords += word_total
    else:
        totalneg += 1
        totalnegwords += word_total
# Drop the bookkeeping header entries again.
keywords.pop(0)
keywords.pop()

Step-5:

  • In order to overcome the Zero-frequency problem, use the below formula to find the final probability of each word present in the text to be classified.

P(Word | Class) = (No. of occurrences of the word in the class + 1) / (Total no. of words in the class + Total no. of keywords)

  • Format the Probability of each word up to the precision of 4-digits.
     

Python3




# Step-5: Laplace-smoothed likelihood of each query word given the '+' class:
#   P(word|+) = (count(word, '+') + 1) / (words_in_'+' + vocabulary)
temp = []
for word in to_find:
    col = keywords.index(word)
    occurrences = sum(row[col] for row in probability_table if row[-1] == "+")
    temp.append(occurrences)
    count = 0
# Round each probability to 4 decimal places (via string formatting).
temp = [format((c + 1) / (vocabulary + totalpluswords), ".4f") for c in temp]
print()
temp = [float(v) for v in temp]
print("Probabilities of Each word to be in '+' class are: ")
h = 0
for word, prob in zip(to_find, temp):
    print(f"P({word}/+) = {prob}")
    h += 1
print()

Step-6:

  • Find the final probability of Each class using the Naïve Bayes formula.
  • Format the Final result up to 8-digit precision.

Python3




# Step-6: prior P('+') times the per-word likelihoods, shown to 8 digits.
pplus = float(format(totalplus / (totalplus + totalneg), ".8f"))
for prob in temp:
    pplus *= prob
pplus = format(pplus, ".8f")
print("probability of Given text to be in '+' class is :", pplus)
print()

Step-7:

  • Perform Step-5 & Step-6 for Negative Classes as well.

Python3




# Step-7: repeat Steps 5 and 6 for the '-' class.
temp = []
for word in to_find:
    col = keywords.index(word)
    occurrences = sum(row[col] for row in probability_table if row[-1] == "-")
    temp.append(occurrences)
    count = 0
temp = [format((c + 1) / (vocabulary + totalnegwords), ".4f") for c in temp]
print()
temp = [float(v) for v in temp]
print("Probabilities of Each word to be in '-' class are: ")
h = 0
for word, prob in zip(to_find, temp):
    print(f"P({word}/-) = {prob}")
    h += 1
print()
pneg = float(format(totalneg / (totalplus + totalneg), ".8f"))
for prob in temp:
    pneg *= prob
pneg = format(pneg, ".8f")
print("probability of Given text to be in '-' class is :", pneg)
print('\n')

Step-8:

  • Compare the Final probabilities of both the classes.
  • Print the Final result.

Python3




# Step-8: report whichever class scored the higher posterior.
# NOTE: pplus and pneg are ".8f"-formatted strings of the form "0.xxxxxxxx",
# so lexicographic comparison agrees with numeric comparison here.
if pplus > pneg:
    print(f"Using Naive Bayes Classification, We can clearly say that the given text belongs to '+' class with probability {pplus}")
else:
    print(f"Using Naive Bayes Classification, We can clearly say that the given text belongs to '-' class with probability {pneg}")
print('\n')

Below is the Full Implementation:

Python3




# Full implementation: interactive Naive Bayes text classifier.
#
# Reads N labelled training documents from stdin (class labels '+' / '-'),
# builds a term-count table, and classifies a query text with the
# Laplace-smoothed formula:
#   P(word|class) = (count(word, class) + 1) / (words_in_class + |vocabulary|)
print('\n *-----* Classification using Naïve bayes *-----* \n')
total_documents = int(input("Enter the Total Number of documents: "))

# doc_class[i] == [list_of_words_in_doc_i, class_label_of_doc_i]
doc_class = []
keywords = []
for i in range(total_documents):
    text = input(f"\nEnter the text of Doc-{i+1} : ").lower()
    # BUG FIX: the original assigned to the reserved word `class`
    # (a SyntaxError) and then appended the undefined name `clas`.
    label = input(f"Enter the class of Doc-{i+1} : ")
    doc_class.append([text.split(), label])
    keywords.extend(text.split())
keywords = sorted(set(keywords))
to_find = input("\nEnter the Text to classify using Naive Bayes: ").lower().split()

# Term-count matrix: probability_table[i][k] == count of keywords[k] in doc i.
probability_table = [[words.count(word) for word in keywords]
                     for words, _label in doc_class]
print('\n')

# Pretty-print the counts with a document-id column and a class column.
import prettytable
keywords.insert(0, 'Document ID')
keywords.append("Class")
Prob_Table = prettytable.PrettyTable()
Prob_Table.field_names = keywords
Prob_Table.title = 'Probability of Documents'
for x, row in enumerate(probability_table):
    row.insert(0, x + 1)
    row.append(doc_class[x][1])
    Prob_Table.add_row(row)
print(Prob_Table)
print('\n')
for row in probability_table:
    row.pop(0)   # drop the display-only document-id column; label stays last

# Per-class totals of words and documents.
totalpluswords = 0
totalnegwords = 0
totalplus = 0
totalneg = 0
vocabulary = len(keywords) - 2   # exclude 'Document ID' and 'Class' entries
for row in probability_table:
    if row[-1] == "+":
        totalplus += 1
        totalpluswords += sum(row[:-1])
    else:
        totalneg += 1
        totalnegwords += sum(row[:-1])
keywords.pop(0)
keywords.pop()


def _word_likelihoods(class_label, total_words_in_class):
    """Return Laplace-smoothed P(word|class_label) for each query word,
    rounded to 4 decimal places to match the article's displayed output."""
    probs = []
    for word in to_find:
        col = keywords.index(word)
        count = sum(row[col] for row in probability_table
                    if row[-1] == class_label)
        probs.append(float(format((count + 1)
                                  / (vocabulary + total_words_in_class),
                                  ".4f")))
    return probs


# Positive class.
temp = _word_likelihoods("+", totalpluswords)
print()
print("Probabilities of Each word to be in '+' class are: ")
for word, prob in zip(to_find, temp):
    print(f"P({word}/+) = {prob}")
print()
pplus = float(format(totalplus / (totalplus + totalneg), ".8f"))
for prob in temp:
    pplus *= prob
pplus = format(pplus, ".8f")
print("probability of Given text to be in '+' class is :", pplus)
print()

# Negative class.
temp = _word_likelihoods("-", totalnegwords)
print()
print("Probabilities of Each word to be in '-' class are: ")
for word, prob in zip(to_find, temp):
    print(f"P({word}/-) = {prob}")
print()
pneg = float(format(totalneg / (totalplus + totalneg), ".8f"))
for prob in temp:
    pneg *= prob
pneg = format(pneg, ".8f")
print("probability of Given text to be in '-' class is :", pneg)
print('\n')

# Final verdict: both are "0.xxxxxxxx"-formatted strings, so lexicographic
# comparison agrees with numeric comparison.
if pplus > pneg:
    print(f"Using Naive Bayes Classification, We can clearly say that the given text belongs to '+' class with probability {pplus}")
else:
    print(f"Using Naive Bayes Classification, We can clearly say that the given text belongs to '-' class with probability {pneg}")
print('\n')

Output Screenshot:

 

 

 


My Personal Notes arrow_drop_up
Recommended Articles
Page :

Start Your Coding Journey Now!