1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
|
import re
import os
from os import listdir
# Directory holding the raw text files to reorganize.
text_directory = 'Texte'
# Directory receiving one reorganized file per input file (same file name).
output_directory = 'Text_org'
# Cells of one printed line are separated by (at least) two spaces.
# NOTE(review): the original comment said "split lines by 2 white spaces"
# while the code split on a single space, which turns every word into its own
# column; two spaces is used here -- confirm against the real input files.
COLUMN_SEPARATOR = '  '


def reflow_paragraph(paragraph, separator=COLUMN_SEPARATOR):
    """Turn one multi-column block of text into a single running paragraph.

    Each line of *paragraph* is cut into cells on *separator*. Lines with
    fewer cells than the widest line are padded with empty cells: on the left
    when the raw line starts with a space (its first column is blank), on the
    right otherwise. The cells are then read column by column, top to bottom,
    joined with single spaces, and every ``'- '`` sequence is removed so that
    words hyphenated across a line break are glued back together.

    Returns the reflowed paragraph, or ``''`` when the block has no text.
    """
    lines = paragraph.split('\n')
    rows = []
    for line in lines:
        cells = (cell.strip() for cell in line.split(separator))
        rows.append([cell for cell in cells if cell])

    # str.split always yields at least one line, so max() never sees an
    # empty sequence.
    width = max(len(row) for row in rows)

    for line, row in zip(lines, rows):
        padding = [''] * (width - len(row))
        if line.startswith(' '):
            row[:0] = padding
        else:
            row.extend(padding)

    # Column-major read. Padding cells are skipped so they neither add double
    # spaces nor get between a trailing hyphen and the rest of its word.
    cells = [row[col] for col in range(width) for row in rows if row[col]]
    return ' '.join(cells).replace('- ', '')


def reorganize_text(data):
    """Return the non-empty reflowed paragraphs of *data*, in order.

    Blocks are delimited by a blank line (two consecutive line breaks).
    """
    reflowed = (reflow_paragraph(block) for block in data.split('\n\n') if block)
    return [paragraph for paragraph in reflowed if paragraph]


def main(source_directory=text_directory, target_directory=output_directory):
    """Reorganize every file of *source_directory* into *target_directory*.

    Each output file has the same name as its input and contains the reflowed
    paragraphs, each prefixed with its 0-based index and followed by a blank
    line.
    """
    os.makedirs(target_directory, exist_ok=True)
    for name in listdir(source_directory):
        source_path = os.path.join(source_directory, name)
        if not os.path.isfile(source_path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(source_path, encoding="utf8") as file:
            data = file.read()
        references = reorganize_text(data)
        # The context manager guarantees the file is flushed and closed (the
        # original referenced `textcle.close` without ever calling it).
        target_path = os.path.join(target_directory, name)
        with open(target_path, "w", encoding="utf8") as textcle:
            for index, reference in enumerate(references):
                textcle.write(f"{index} {reference}\n\n")


if __name__ == "__main__":
    main()
Partager