Bonjour à toutes et à tous.
Je vous présente le programme pdf_analyzer, qui résulte d'une demande professionnelle et répond à une expression de besoins qu'il couvre à 100 %.
Développé en Python 3.10.10 sous Windows, il est exploitable sous d'autres plateformes. S'appuyant sur la lib PyMuPDF, il permet à l'utilisateur :
- de rechercher des mots ou des phrases dans une page d'un fichier PDF,
- de déterminer un rapport entre le texte et les images présentes sur la page,
- de rechercher la présence de calques ou de marquages.
Tous les critères ci-dessus répondent à l'expression des besoins et pourront paraître bizarres, voire saugrenus pour quiconque lira le code source.
Les différents types de recherches peuvent porter sur les n premières pages et/ou les n dernières pages, ou bien la totalité du PDF.
Toutes les méthodes et variables sont typées. Là aussi, c'est une demande de l'expression de besoins.
Voici les différents codes sources :
pdf_analyzer.py :
exceptions.py :
Code : Sélectionner tout - Visualiser dans une fenêtre à part
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from dataclasses import dataclass
from pathlib import Path
from types import TracebackType

from typing_extensions import Self

import fitz

# NOTE(review): the accompanying post names the module "exceptions.py" while
# this import targets ".exception" - confirm which spelling is the real one.
from .exception import UnreadablePDF


@dataclass
class PDFAnalyzer:
    """Analyze a PDF file: term search, text/image ratio and XObject detection.

    The class is meant to be used as a context manager: the document is opened
    by ``__enter__`` and closed by ``__exit__``.

    Attributes:
        filename: Path of the PDF file to analyze.
        document: The opened ``fitz.Document``; stays ``None`` until the file
            is loaded, and when loading failed.
        error_msg: Text of the error raised while opening the file, or ``None``
            when no such error was recorded.
    """

    filename: Path
    document: fitz.Document | None = None
    error_msg: str | None = None

    def _load_document(self) -> None:
        """Open ``filename`` with fitz, recording the error text on failure."""
        try:
            self.document = fitz.Document(self.filename)
        # NOTE(review): recent PyMuPDF releases may no longer expose the
        # ``fitz.fitz`` submodule - confirm against the installed version.
        except (fitz.fitz.FileDataError, fitz.fitz.FileNotFoundError) as err:
            self.error_msg = str(err)

    def _pages_to_scan(self, first_pages: int = 0, last_pages: int = 0) -> list[int]:
        """Determine the page indexes to analyze for the requested ranges.

        Args:
            first_pages: The number of first pages to be analyzed.
            last_pages: The number of last pages to be analyzed.

        Returns:
            Sorted list of 0-based page indexes without duplicates. Every page
            is returned when both arguments are 0; an empty list is returned
            when no document is loaded.
        """
        if self.document is None:
            return []

        total_pages: int = self.document.page_count
        if not first_pages and not last_pages:
            return list(range(total_pages))

        # Never ask for more pages than the document holds.
        first_pages = min(first_pages, total_pages)
        last_pages = min(last_pages, total_pages)

        head = range(first_pages)
        # A slice starting at -0 would select everything, hence the guard.
        tail = range(total_pages)[-last_pages:] if last_pages else range(0)
        # The two ranges may overlap on short documents: deduplicate.
        return sorted({*head, *tail})

    def readability_rate(self, first_pages: int = 0, last_pages: int = 0) -> float:
        """Calculate the readability rate of the pages to analyze.

        Each page scores ``1 / (1 + number_of_images)`` when it holds text and
        0.0 otherwise; the result is the mean of the page scores.

        Args:
            first_pages: The number of first pages to be analyzed.
            last_pages: The number of last pages to be analyzed.

        Returns:
            The readability rate rounded to 2 decimals. 0.0 when no document is
            loaded, when there is no page to scan, or as soon as one scanned
            page reports XObjects.
        """
        if self.document is None:
            return 0.0

        rates: list[float] = []
        for page in self._pages_to_scan(first_pages=first_pages, last_pages=last_pages):
            content: fitz.Page = self.document.load_page(page)
            if content.get_xobjects():
                return 0.0
            has_text: bool = bool(content.get_text().strip())
            image_count: int = len(content.get_images())
            rates.append(1 / (1 + image_count) if has_text else 0.0)

        # A document without any page would otherwise divide by zero.
        if not rates:
            return 0.0
        return round(sum(rates) / len(rates), 2)

    def layer(self, first_pages: int = 0, last_pages: int = 0) -> bool:
        """Search for a layer in the pages to analyze.

        A page counts as carrying a layer when ``get_xobjects()`` returns a
        non-empty result for it.

        Args:
            first_pages: The number of first pages to be analyzed.
            last_pages: The number of last pages to be analyzed.

        Returns:
            True if a layer is found. False if none is found or if no document
            is loaded.
        """
        if self.document is None:
            return False
        return any(
            self.document.load_page(page).get_xobjects()
            for page in self._pages_to_scan(
                first_pages=first_pages, last_pages=last_pages
            )
        )

    @property
    def corrupted_file(self) -> bool:
        """Tell whether opening the file recorded an error.

        Returns:
            True if an error message was stored. False otherwise.
        """
        return bool(self.error_msg)

    def terms_found(
        self,
        first_pages: int = 0,
        last_pages: int = 0,
        words: list[str] | None = None,
        case_sensitive: bool = False,
    ) -> bool:
        """Look for one of the terms on each page to analyze.

        Args:
            first_pages: The number of first pages to be analyzed.
            last_pages: The number of last pages to be analyzed.
            words: List of words or sentences to find.
            case_sensitive: When True, a term matches if it is a substring of
                the extracted page text; otherwise the search is delegated to
                ``fitz.Page.search_for``.

        Raises:
            UnreadablePDF: If no document is loaded (unknown or corrupted file).

        Returns:
            True if a term is found. False otherwise, including when ``words``
            is None or empty.
        """
        if self.document is None:
            raise UnreadablePDF(self.error_msg)
        if not words:
            return False

        for page in self._pages_to_scan(first_pages=first_pages, last_pages=last_pages):
            content: fitz.Page = self.document.load_page(page)
            if case_sensitive:
                # Extract the text once per page rather than once per term.
                page_text: str = content.get_text()
                if any(sentence in page_text for sentence in words):
                    return True
            elif any(content.search_for(sentence) for sentence in words):
                return True
        return False

    def is_readable(
        self, first_pages: int = 0, last_pages: int = 0, acceptance: float = 0.0
    ) -> bool:
        """Determine if a PDF has sufficient readability to be processed.

        Args:
            first_pages: The number of first pages to be analyzed.
            last_pages: The number of last pages to be analyzed.
            acceptance: Minimum readability rate required to consider the PDF
                readable.

        Returns:
            True if the readability rate reaches ``acceptance``. False otherwise.
        """
        rate: float = self.readability_rate(
            first_pages=first_pages, last_pages=last_pages
        )
        return rate >= acceptance

    def __enter__(self) -> Self:
        """Load the document when entering a ``with`` block.

        Returns:
            The instance.
        """
        self._load_document()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Close the document, if one was opened, when leaving the ``with`` block."""
        if self.document is not None:
            self.document.close()
Pour les tests (qui s'appuient sur de vrais fichiers PDF non présents ici), test_pdf_analyzer.py :
Code : Sélectionner tout - Visualiser dans une fenêtre à part
1
class UnreadablePDF(Exception):
    """Exception for damaged, unreadable PDF files or other problems."""
Les tests couvrent 100 % du code (là aussi... Expression des besoins).
Code : Sélectionner tout - Visualiser dans une fenêtre à part
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from pathlib import Path

import pytest

import fitz

from pdf_analyzer import PDFAnalyzer, UnreadablePDF


class TestSearch:
    """Tests of PDFAnalyzer run against the sample files under tests/samples."""

    def test_instance_with_minimal_argv(self) -> None:
        """A valid file yields an opened document and no error message."""
        with PDFAnalyzer(filename=Path("tests/samples/match.pdf")) as search:
            assert search.filename == Path("tests/samples/match.pdf")
            assert isinstance(search.document, fitz.Document)
            assert search.error_msg is None

    @pytest.mark.parametrize(
        "filename,expected_type,expected_error",
        [
            (
                Path("tests/samples/damaged_pdf.pdf"),
                None,
                "cannot open broken document",
            ),
            (
                Path("unknown_file.pdf.pdf"),
                None,
                "no such file: 'unknown_file.pdf.pdf'",
            ),
        ],
    )
    def test_instance_with_bad_pdf_file(
        self, filename: Path, expected_type: None, expected_error: str
    ) -> None:
        """A damaged or missing file leaves no document and stores the error."""
        with PDFAnalyzer(filename=filename) as search:
            assert search.document is expected_type
            assert search.error_msg == expected_error

    @pytest.mark.parametrize(
        "filename,first_pages,last_pages,expected",
        [
            (Path("tests/samples/match.pdf"), 1, 0, [0]),
            (Path("tests/samples/match.pdf"), 10, 0, [0]),
            (Path("tests/samples/match.pdf"), 10, 10, [0]),
            (Path("tests/samples/match.pdf"), 0, 5, [0]),
            (Path("tests/samples/watermark.pdf"), 0, 5, [144, 145, 146, 147, 148]),
            (Path("tests/samples/watermark.pdf"), 3, 3, [0, 1, 2, 146, 147, 148]),
            (Path("tests/samples/watermark.pdf"), 0, 0, list(range(149))),
            (Path("tests/samples/damaged_pdf.pdf"), 5, 10, []),
        ],
    )
    def test_pages_to_scan(
        self, filename: Path, first_pages: int, last_pages: int, expected: list[int]
    ) -> None:
        """The selected page indexes follow the requested first/last ranges."""
        with PDFAnalyzer(filename=filename) as doc:
            assert (
                doc._pages_to_scan(first_pages=first_pages, last_pages=last_pages)
                == expected
            )

    @pytest.mark.parametrize(
        "filename,first_pages,expected",
        [
            (Path("tests/samples/full_pdfi.pdf"), 0, 0.0),
            (Path("tests/samples/match.pdf"), 0, 1.0),
            (Path("tests/samples/watermark.pdf"), 40, 0.0),
            (Path("tests/samples/blank.pdf"), 0, 0.0),
            (Path("unknown_file.pdf"), 0, 0.0),
        ],
    )
    def test_readability_rate(
        self, filename: Path, first_pages: int, expected: float
    ) -> None:
        """The readability rate matches the expected value for each sample."""
        with PDFAnalyzer(filename=filename) as doc:
            assert doc.readability_rate(first_pages=first_pages) == expected

    @pytest.mark.parametrize(
        "filename,expected",
        [
            (Path("tests/samples/match.pdf"), False),
            (Path("tests/samples/watermark.pdf"), True),
            (Path("tests/samples/damaged_pdf.pdf"), False),
        ],
    )
    def test_layer(self, filename: Path, expected: bool) -> None:
        """Layer detection matches the expected value for each sample."""
        with PDFAnalyzer(filename=filename) as doc:
            assert doc.layer() is expected

    @pytest.mark.parametrize(
        "filename,expected",
        [
            (Path("tests/samples/match.pdf"), False),
            (Path("tests/samples/watermark.pdf"), False),
            (Path("tests/samples/damaged_pdf.pdf"), True),
        ],
    )
    def test_corrupted_file(self, filename: Path, expected: bool) -> None:
        """Only the damaged sample is reported as corrupted."""
        with PDFAnalyzer(filename=filename) as doc:
            assert doc.corrupted_file is expected

    def test_terms_found_raises_an_exception(self) -> None:
        """Searching terms in an unreadable file raises UnreadablePDF."""
        with PDFAnalyzer(filename=Path("tests/samples/damaged_pdf.pdf")) as doc:
            with pytest.raises(UnreadablePDF):
                doc.terms_found()

    @pytest.mark.parametrize(
        "filename,first_pages,words,case_sensitive,expected",
        [
            (Path("tests/samples/match.pdf"), 1, None, True, False),
            (Path("tests/samples/match.pdf"), 1, None, False, False),
            (Path("tests/samples/match.pdf"), 1, ["Annexe Nationale"], True, True),
            (Path("tests/samples/match.pdf"), 1, ["ANNEXE NATIONALE"], False, True),
            (Path("tests/samples/match.pdf"), 1, ["ANNEXE NATIONALE"], True, False),
            (Path("tests/samples/not_match.pdf"), 1, ["Annexe Nationale"], True, False),
            (
                Path("tests/samples/not_match.pdf"),
                1,
                ["Annexe Nationale"],
                False,
                False,
            ),
            (Path("tests/samples/watermark.pdf"), 1, ["Eurocode 6"], True, True),
            (Path("tests/samples/watermark.pdf"), 1, ["EUROCODE 6"], True, False),
            (Path("tests/samples/watermark.pdf"), 1, ["EUROCODE 6"], False, True),
            (
                Path("tests/samples/watermark.pdf"),
                1,
                [
                    "La présente Norme européenne a été adoptée par le CEN le 3 janvier "
                    "2022.\n\nLes membres du CEN sont tenus de se soumettre au "
                    "Règlement Intérieur du CEN/CENELEC"
                ],
                True,
                False,
            ),
            (
                Path("tests/samples/watermark.pdf"),
                1,
                [
                    "la présente norme européenne a été adoptée par le cen le 3 janvier "
                    "2022.\n\nLes membres du cen sont tenus de se soumettre au "
                    "Règlement Intérieur du cen/cenelec"
                ],
                False,
                True,
            ),
        ],
    )
    def test_terms_found(
        self,
        filename: Path,
        first_pages: int,
        words: list[str] | None,
        case_sensitive: bool,
        expected: bool,
    ) -> None:
        """Term search honours the case-sensitivity flag for each sample."""
        with PDFAnalyzer(filename=filename) as doc:
            assert (
                doc.terms_found(
                    first_pages=first_pages,
                    words=words,
                    case_sensitive=case_sensitive,
                )
                is expected
            )

    @pytest.mark.parametrize(
        "filename,acceptance,expected",
        [
            (Path("tests/samples/match.pdf"), 1.0, True),
            (Path("tests/samples/blank.pdf"), 0.5, False),
            (Path("tests/samples/damaged_pdf.pdf"), 0.5, False),
            (Path("tests/samples/full_pdfi.pdf"), 1, False),
            (Path("tests/samples/not_match.pdf"), 1.0, True),
            (Path("tests/samples/watermark.pdf"), 0.1, False),
        ],
    )
    def test_is_readable(
        self, filename: Path, acceptance: float, expected: bool
    ) -> None:
        """Readability is compared against the acceptance threshold."""
        with PDFAnalyzer(filename=filename) as doc:
            assert doc.is_readable(acceptance=acceptance) is expected
Je poste ce code ici car je me suis énormément éclaté à le réaliser et je me dis que, peut-être, ça pourra permettre à certain(e)s de découvrir la lib PyMuPDF et ses classes Document et Page, ainsi que pytest et son puissant marqueur parametrize.
Si vous trouvez ça bidon, pas de souci. Je supprimerai ce topic.
Partager