# Program to measure the similarity between
# two sentences using cosine similarity.
import math

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources the script relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus read via stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
def cosine_similarity(first_tokens, second_tokens):
    """Return the cosine similarity of two token collections.

    Each collection is treated as a binary bag-of-words vector over the
    union of both vocabularies, so the dot product equals the number of
    shared tokens and each vector's squared norm equals its number of
    distinct tokens.

    Args:
        first_tokens: Iterable of hashable tokens.
        second_tokens: Iterable of hashable tokens.

    Returns:
        A float in [0.0, 1.0]. Returns 0.0 when either collection is
        empty, instead of dividing by zero.
    """
    first = set(first_tokens)
    second = set(second_tokens)
    if not first or not second:
        return 0.0
    return len(first & second) / math.sqrt(len(first) * len(second))


def extract_keywords(values, stop_words=()):
    """Tokenize a sequence of strings and drop stop words.

    Args:
        values: Iterable of strings (a list or a NumPy string array).
        stop_words: Container of tokens to discard.

    Returns:
        The set of distinct tokens that are not in ``stop_words``.
    """
    # word_tokenize works on a single text string, not on an array, so
    # the entries are joined into one space-separated "sentence" first.
    tokens = word_tokenize(" ".join(values))
    return {token for token in tokens if token not in stop_words}


def main():
    """Print the CSV contents and the similarity of the two sample lists."""
    # The table is only displayed; it does not feed the similarity below.
    data = pd.read_csv("C:/email-password-recovery-code.csv", sep=';')
    print(data)

    # NOTE(review): x has 10 entries while y has 9 -- confirm whether an
    # entry is missing from y.
    x = np.array(['290729', '79076', '76789', '59462', '49952',
                  '33291', '21725', '20901', '20553', '16648'])
    y = np.array(['12345', '123456789', 'password', 'iloveyou', 'princess',
                  '1234567', 'rockyou', '12345678', 'abc123'])

    # A set gives constant-time stop-word membership tests.
    sw = set(stopwords.words('english'))

    # Remove stop words, then compare the remaining keyword sets.
    X_set = extract_keywords(x, sw)
    Y_set = extract_keywords(y, sw)

    cosine = cosine_similarity(X_set, Y_set)
    print("similarity: ", cosine)


if __name__ == "__main__":
    main()