1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
|
def levenshtein(chaine1, chaine2):
taille_chaine1 = len(chaine1) + 1
taille_chaine2 = len(chaine2) + 1
levenshtein_matrix = np.zeros ((taille_chaine1, taille_chaine2))
for x in range(taille_chaine1):
levenshtein_matrix [x, 0] = x
for y in range(taille_chaine2):
levenshtein_matrix [0, y] = y
for x in range(1, taille_chaine1):
for y in range(1, taille_chaine2):
if chaine1[x-1] == chaine2[y-1]:
levenshtein_matrix [x,y] = min(
levenshtein_matrix[x-1, y] + 1,
levenshtein_matrix[x-1, y-1],
levenshtein_matrix[x, y-1] + 1
)
else:
levenshtein_matrix [x,y] = min(
levenshtein_matrix[x-1,y] + 1,
levenshtein_matrix[x-1,y-1] + 1,
levenshtein_matrix[x,y-1] + 1
)
return (levenshtein_matrix[taille_chaine1 - 1, taille_chaine2 - 1])
print("distance de levenshtein = " + str(levenshtein("Lorem ipsum dolor sit amet", "Laram zpsam dilir siy amot")))
def get_cosine(vec1, vec2):
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
sum2 = sum([vec2[x]**2 for x in vec2.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
def text_to_vector(text):
word = re.compile(r'\w+')
words = word.findall(text)
return Counter(words)
def get_result(content_a, content_b):
text1 = content_a
text2 = content_b
vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)
cosine_result = get_cosine(vector1, vector2)
return cosine_result
je parcours les lignes pourtant a cet endroit
for i in df_result.index:
result = get_result(df_result["id"][i],df_result["password"][i])
Levensthein = levenshtein(df_result["id"][i],df_result["password"][i])
jaccard = jaccard_similarity(df_result["id"][i],df_result["password"][i])
print(result)
print(Levensthein)
print(jaccard)
print(result) |