#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 9 00:09:39 2021
@author: donutman
"""
from scipy import sparse
from numpy import savez
def print_entry(name, syno_list):
    """Print one thesaurus entry on a single line (debugging helper).

    The line has the form ``- <name> : <syn1> - <syn2> - ...``.

    Args:
        name: The entry word.
        syno_list: Iterable of synonym strings; they are joined with ' - '.
    """
    joined_synonyms = ' - '.join(syno_list)
    print("- %s : %s" % (name, joined_synonyms))
# Input thesaurus file, read in STEP 1.
filein = './data/step0/thes_fr.dat'
# Output path handed to sparse.save_npz in STEP 3 (the synonym matrix).
fileout1 = './data/step1/thesaurus_matrix'
# Output path handed to numpy.savez in STEP 3 (the sorted entry names).
fileout2 = './data/step1/thesaurus_entries'
# Parsed thesaurus: maps each entry name to the list of its synonyms.
d = {} # empty dictionary
# Parser state for STEP 1: the entry currently being read and the synonyms
# collected for it so far. An empty name means "no entry read yet".
name = ""
syno_list = []
# STEP 1: read the input file and parse its content into the dictionary `d`,
# which maps each entry name to the flat list of all its synonyms.
#
# Layout handled here, as implied by the parsing below:
#   - the first line of the file is a header and is skipped;
#   - a line that does not start with '(' opens a new entry and has the form
#     "<name>|<something>" (exactly one '|'); only the name is kept;
#   - a line that starts with '(' belongs to the current entry: its first
#     '|'-separated field is dropped and every other field is a synonym.
print('''
******************************
STEP 1
******************************
''')
with open(filein, mode='rt', encoding='utf-8') as f:
    # Skip the header line; the default keeps an empty file from raising
    # StopIteration.
    next(f, None)
    for line in f:
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no data and would otherwise break the
            # two-field unpacking below.
            continue
        if not line.startswith('('):
            # A new entry begins: store the previous one first, unless this
            # is the very first entry of the file.
            if name:
                d[name] = syno_list
            syno_list = []
            name, _ = line.split('|')
        else:
            # Drop the leading field, keep the rest as synonyms.
            syno_list = syno_list + line.split('|')[1:]
# The last entry has no following entry line to trigger its storage, so it is
# stored here. Skipped when no entry was read at all, so that an empty-string
# key never ends up in the dictionary.
if name:
    d[name] = syno_list
# STEP 2: self-consistency check. Go through the dictionary and drop every
# synonym that is NOT itself an entry of the dictionary. The filtered result
# is stored in `d2`; `d` is left untouched.
print('''
******************************
STEP 2
******************************
''')
d2 = {}
for name, syno_list in d.items():
    # Keep the original order (and any duplicates) of the valid synonyms.
    syno_list_2 = [syno for syno in syno_list if syno in d]
    # The kept list is a subset of the original one, so a plain set
    # difference yields the removed words. Sorting makes the report
    # reproducible from one run to the next.
    deleted_words = sorted(set(syno_list).difference(syno_list_2))
    if deleted_words:
        print('- Deleted entries for "%s" : %s' % (name, " - ".join(deleted_words)))
    d2[name] = syno_list_2
# STEP 3: convert the filtered dictionary into a square sparse boolean matrix.
# Row i / column j correspond to the i-th / j-th entry name in sorted order;
# M[i, j] is set when entry j is listed as a synonym of entry i.
print('''
******************************
STEP 3
******************************
''')
keys = tuple(sorted(d2.keys()))
Nentries = len(keys)
indices = range(Nentries)  # not used in this section; kept as a module-level name
# Name -> rank lookup table. A dict lookup replaces tuple.index(), which
# scans the whole tuple on every call and made this step quadratic.
rank_of = {entry: rank for rank, entry in enumerate(keys)}
# Construction of the I, J, V vectors in COO format:
# row indices, column indices and values.
I = []
J = []
V = []
for name, syno_list in d2.items():
    if not syno_list:
        # Entry without any remaining synonym: nothing to add to the matrix.
        continue
    rank_key = rank_of[name]
    rank_values = [rank_of[n] for n in syno_list]
    I.extend([rank_key] * len(rank_values))
    J.extend(rank_values)
    V.extend([1] * len(rank_values))
M = sparse.coo_matrix((V, (I, J)), shape=(Nentries, Nentries), dtype=bool)
# Persist the matrix and, separately, the entry names that label its
# rows and columns.
sparse.save_npz(fileout1, M)
savez(fileout2, names=keys)
# "Partager" ("Share"): web-page residue from the code listing, not part of the script.