| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 
import os
import re

import requests
 
# Folder that holds the English HTML pages to process.
english_folder1 = r"c:\test\test\2\1"

# Only directory entries whose name ends with this suffix are processed.
extension_file = ".html"

# True: write results into a "parsed" sub-folder of english_folder1.
# False: write results next to the source files.
use_parse_folder = True

# Files that must never be processed.
ignored_files = {'y_key_e479323ce281e459.html', 'TS_4fg4_tr78.html'}

# Non-greedy so the match stops at the first </title>; DOTALL lets a title
# span several lines.
_TITLE_RE = re.compile(r'<title>.+?</title>', flags=re.DOTALL)

# [^"]* keeps the match inside one tag. The previous greedy ".+" combined
# with DOTALL ran on to the last '"/>' of the document and deleted
# everything in between.
_SUBJECT_META_RE = re.compile(r'<meta name="Subject" content="[^"]*"/>')


def _replace_subject_with_title(html):
    """Return *html* with each Subject meta tag replaced by the page's title.

    The first ``<title>...</title>`` element found in *html* is substituted,
    tag included, for every ``<meta name="Subject" content="..."/>`` tag.
    When the page has no non-empty ``<title>`` element, *html* is returned
    unchanged.
    """
    title = _TITLE_RE.search(html)
    if title is None:
        return html
    title_tag = title[0]
    # A callable replacement inserts the title verbatim; a plain string would
    # be treated as a template and mis-handle backslashes or "\1" in a title.
    return _SUBJECT_META_RE.sub(lambda _match: title_tag, html)


en1_directory = os.fsencode(english_folder1)

print('Going through english folder')
for entry in os.listdir(en1_directory):
    filename = os.fsdecode(entry)
    print(filename)
    if filename in ignored_files:
        continue
    if not filename.endswith(extension_file):
        continue

    try:
        with open(os.path.join(english_folder1, filename), encoding='utf-8') as source:
            en_html = source.read()
    except FileNotFoundError:
        # The entry vanished between the directory listing and the read.
        continue

    en_html = _replace_subject_with_title(en_html)
    print(f'{filename} parsed')

    if use_parse_folder:
        output_folder = os.path.join(english_folder1, 'parsed')
        # exist_ok avoids the old "try to write, mkdir on any error, retry"
        # dance, which hid unrelated write failures.
        os.makedirs(output_folder, exist_ok=True)
    else:
        # NOTE(review): outputs written here also end with extension_file, so
        # a later run will process them again - confirm this is intended.
        output_folder = english_folder1

    with open(os.path.join(output_folder, 'parsed_' + filename), 'w', encoding='utf-8') as new_html:
        new_html.write(en_html)
Partager