Bonjour

On veut annoter des fichiers texte ; pour chaque terme extrait, on doit donc déterminer son offset (ses positions de début et de fin). Voilà un code qui fait cette tâche, mais il contient des erreurs. Ce code prend deux fichiers en entrée : le premier contient le texte original et le second les termes annotés.

Code : Sélectionner tout - Visualiser dans une fenêtre à part
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import optparse, sys
 
# Separates the three top-level fields of a line: label, "type begin end", text.
splitchar1 = '\t'
# Separates type, begin and end inside the middle field.
splitchar2 = ' '
 
# brat does not permit overlapping annotations (or at least it emits a warning
# for them), so sorting on the begin offset alone is enough to order records.
# Sorting that way is probably a good idea anyway.
class AnnotationRecord:
    """One annotation line: label, type, begin/end offsets and the covered text.

    The class-level values below serve as defaults until an instance
    assigns its own.
    """
    label = 'T0'
    type = ''
    begin = -1
    end = -1
    text = ''

    def __repr__(self):
        """Return the record serialised as label<TAB>type begin end<TAB>text."""
        # Middle field first: type, begin and end joined by the inner separator.
        span = splitchar2.join((self.type, str(self.begin), str(self.end)))
        return splitchar1.join((self.label, span, self.text))
 
def create_record(parts):
    """Build an AnnotationRecord from the fields of one annotation line.

    Args:
        parts: sequence whose items 0, 1 and 2 are the label, the
            "type begin end" group and the text.  The text is stored
            exactly as given (nothing is stripped from it).

    Returns:
        An AnnotationRecord; begin and end are kept as the strings found
        in the middle field, not converted to int.

    Raises:
        IndexError: parts, or the middle field once split on splitchar2,
            has fewer than three items.
    """
    span_fields = parts[1].split(splitchar2)
    rec = AnnotationRecord()
    rec.label, rec.text = parts[0], parts[2]
    rec.type, rec.begin, rec.end = span_fields[0], span_fields[1], span_fields[2]
    return rec
 
def main(filename, out_filename):
    """Sort the annotations in *filename* by begin offset, relabel, and write them.

    Every non-blank input line is split on ``splitchar1`` and converted to a
    record by ``create_record``.  Records are ordered by their integer begin
    offset, relabelled ``T1``, ``T2``, ... in that order, and written one per
    line to *out_filename*.

    Args:
        filename: path of the annotation file to read.
        out_filename: path of the file to write; opened in ``'w'`` mode, so
            existing content is replaced.

    Raises:
        IndexError: propagated from ``create_record`` when a non-blank line
            has too few fields.
        ValueError: a begin offset cannot be converted with ``int()``.
    """
    # Context managers close the files even if parsing or writing fails.
    with open(filename, 'r') as fo:
        # Remove line terminators up front so that every record is written
        # with exactly one newline -- including a final input line that had
        # none, which would otherwise be glued to the next record after sorting.
        lines = [line.rstrip('\r\n') for line in fo]

    # Blank lines (e.g. a trailing empty line) carry no annotation; skip them
    # instead of letting create_record fail on the missing fields.
    annotation_records = [
        create_record(line.split(splitchar1))
        for line in lines
        if line.strip()
    ]

    # Sort on the begin offset; sorted() is stable, so records sharing a begin
    # offset keep their input order.
    sorted_annotation_records = sorted(annotation_records, key=lambda a: int(a.begin))

    # Relabel according to the sorted order, starting at T1.
    for label_value, sorted_record in enumerate(sorted_annotation_records, 1):
        sorted_record.label = 'T' + str(label_value)

    # Write the result, one record per line.
    with open(out_filename, 'w') as fo:
        for sorted_record in sorted_annotation_records:
            fo.write(repr(sorted_record) + '\n')
 
 
# Layout of an .ann line: T#<TAB>Type Start End<TAB>Text
# Positional command-line arguments: input file, then output file.
if __name__ == '__main__':
    cli = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(),
        usage=globals()['__doc__'],
        version='$Id$',
    )
    # -v/--verbose is accepted, but nothing below reads its value.
    cli.add_option('-v', '--verbose', action='store_true', default=False, help='verbose output')
    _opts, positional = cli.parse_args()
    if len(positional) < 2:
        cli.error('missing argument')
    in_path, out_path = positional[0], positional[1]
    main(in_path, out_path)
    sys.exit(0)
merci