import
pandas
from
contextlib
import
redirect_stdout
import
math
# Module-level shared state: populated by filter() / compute_Weight(),
# read by the query and similarity routines.
terms = []       # every distinct term seen across all documents
keys = []        # first-column value of each row (document identifiers)
vec_Dic = {}     # document id -> list of per-term weights
dicti = {}       # document id -> list of that document's terms
dummy_List = []  # scratch buffer reused while building per-document lists
term_Freq = {}   # term -> frequency (later normalized)
idf = {}         # term -> inverse document frequency
                 # NOTE(review): compute_Weight() rebinds a *local* name
                 # ``idf``, so this global appears to stay empty — confirm.
weight = {}      # term -> tf-idf weight
def filter(documents, rows, cols):
    """Split the document table into document keys and per-document term lists.

    Column 0 of each row is treated as the document identifier; the remaining
    columns are that document's terms.  Side effects on module state:
    appends each identifier to ``keys``, each previously unseen term to
    ``terms``, and maps identifier -> term list in ``dicti``.

    NOTE(review): this function shadows the builtin ``filter``; renaming it
    would break callers, so the name is kept.

    Args:
        documents: pandas DataFrame of documents (one row per document).
        rows: number of rows to process.
        cols: number of columns in the table.
    """
    for i in range(rows):
        row = documents.loc[i]  # hoist the loop-invariant row lookup
        doc_terms = []          # fresh local list instead of the shared
                                # dummy_List copy/clear dance
        for j in range(1, cols):  # column 0 is the document id, not a term
            cell = row.iat[j]
            doc_terms.append(cell)
            if cell not in terms:
                terms.append(cell)
        keys.append(row.iat[0])
        dicti.update({row.iat[0]: doc_terms})
def compute_Weight(doc_Count, cols):
    """Compute tf-idf weights for every term and per-document weight vectors.

    Reads the module-level ``terms`` and ``dicti`` built by ``filter`` and
    populates ``term_Freq`` (normalized term frequency), ``weight``
    (tf * idf per term) and ``vec_Dic`` (document id -> weight vector).

    Args:
        doc_Count: number of documents (rows of the table).
        cols: number of columns of the table, used as the normalizer.
    """
    # Count raw occurrences of each known term across all documents.
    for term in terms:
        term_Freq.setdefault(term, 0)
    for doc_terms in dicti.values():  # keys unused; iterate values only
        for term in doc_terms:
            if term in term_Freq:
                term_Freq[term] += 1

    # NOTE(review): the original rebinds a LOCAL name ``idf`` here, so the
    # module-level ``idf`` dict is never filled; renamed to ``idf_vals`` to
    # make the shadowing explicit — confirm whether the global was meant
    # to be populated (a missing ``global idf``).
    idf_vals = term_Freq.copy()

    # Normalize raw counts into term frequencies.
    for term in term_Freq:
        term_Freq[term] = term_Freq[term] / cols

    # Inverse document frequency: 0 when a term's raw count equals
    # doc_Count (it appears in every document), else log2(cols / count).
    # NOTE(review): using ``cols`` rather than ``doc_Count`` as the idf
    # numerator looks suspicious — confirm against the intended formula.
    for term in idf_vals:
        if idf_vals[term] != doc_Count:
            idf_vals[term] = math.log2(cols / idf_vals[term])
        else:
            idf_vals[term] = 0

    # tf-idf weight per term, then the weight vector for each document.
    for term in idf_vals:
        weight.update({term: idf_vals[term] * term_Freq[term]})
    for doc_id in dicti:
        vec_Dic.update({doc_id: [weight[t] for t in dicti[doc_id]]})
def get_Weight_For_Query(query):
    """Return normalized term frequencies of *query* over the known terms.

    Only words already present in the module-level ``terms`` list are
    counted; unknown query words are ignored.

    Args:
        query: list of query words.

    Returns:
        dict mapping each known term to its count in the query divided by
        ``len(query)`` (all zeros for an empty query).
    """
    query_Freq = dict.fromkeys(terms, 0)
    for word in query:
        if word in query_Freq:
            query_Freq[word] += 1
    if not query:
        # Guard: the original divided by len(query) and raised
        # ZeroDivisionError on an empty query.
        return query_Freq
    n = len(query)  # hoist the loop-invariant length
    for term in query_Freq:
        query_Freq[term] = query_Freq[term] / n
    return query_Freq
def similarity_Computation(query_Weight):
    """Return the cosine similarity between the query and every document.

    Uses the module-level ``dicti`` (document -> terms) and ``weight``
    (term -> tf-idf weight).  Documents whose vectors have zero norm get
    no entry in the result.

    Args:
        query_Weight: dict mapping every known term to its query weight.

    Returns:
        dict mapping document id -> cosine similarity.
    """
    similarity = {}
    for document in dicti:
        # BUG FIX: the original reset its accumulators only inside the
        # non-zero-denominator branch, so a skipped document's partial
        # sums leaked into the next document's similarity.  Resetting at
        # the start of each iteration removes that carry-over.
        numerator = 0
        doc_norm_sq = 0
        query_norm_sq = 0
        # Loop variable renamed from ``terms`` (which shadowed the
        # module-level ``terms`` list) to ``term``.
        for term in dicti[document]:
            w = weight[term]
            q = query_Weight[term]
            numerator += w * q
            doc_norm_sq += w * w
            query_norm_sq += q * q
        if doc_norm_sq != 0 and query_norm_sq != 0:
            simi = numerator / (math.sqrt(doc_norm_sq) * math.sqrt(query_norm_sq))
            similarity.update({document: simi})
    return similarity
def prediction(similarity, doc_count):
    """Write the most relevant document and a ranking to ``output.txt``.

    Side effect (preserved from the original): each ranked document is
    popped from *similarity*, so the caller's dict is emptied when every
    document is ranked.

    Args:
        similarity: dict mapping document id -> similarity score.
        doc_count: requested number of ranks.  Clamped to the number of
            scored documents (the original crashed when fewer documents
            than ``doc_count`` had scores).

    Raises:
        ValueError: if *similarity* is empty (same as the original ``max``).
    """
    # Preserve the original's ValueError on an empty dict.
    ans = max(similarity, key=similarity.get)
    # Rank once with a stable sort instead of repeated max()+pop:
    # O(n log n) rather than O(n^2), with identical tie order.
    ranking = sorted(similarity, key=similarity.get, reverse=True)
    with open('output.txt', 'w') as f, redirect_stdout(f):
        print(ans, "is the most relevant document")
        print("ranking of the documents")
        for rank, doc in enumerate(ranking[:doc_count], start=1):
            print(doc, "rank is", rank)
            similarity.pop(doc)  # preserve the per-entry mutation
def main():
    """Drive the pipeline: load documents, build weights, answer a query.

    Reads ``documents.csv`` from the working directory, prompts the user
    for a space-separated query on stdin, and writes the most relevant
    document plus a full ranking to ``output.txt``.
    """
    documents = pandas.read_csv('documents.csv')  # r-prefix on the literal was pointless
    rows = len(documents)
    cols = len(documents.columns)
    filter(documents, rows, cols)
    compute_Weight(rows, cols)
    print("Enter the query")
    query = input().split(' ')
    query_Weight = get_Weight_For_Query(query)
    similarity = similarity_Computation(query_Weight)
    prediction(similarity, rows)


# Guard the entry point so importing this module no longer runs the
# whole pipeline as a side effect (the original called main() unconditionally).
if __name__ == "__main__":
    main()