1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
|
/// <summary>
/// Extracts the text of a PDF file using PdfBox.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
/// <returns>The text produced by <c>PDFTextStripper.getText</c> for the loaded document.</returns>
private String GetPdfContent(String pdfFile)
{
    // Text extraction is delegated to PdfBox.
    PDDocument pdfDoc = PDDocument.load(pdfFile);
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(pdfDoc);
    }
    finally {
        // The loaded document holds on to the underlying file and buffers;
        // close it even when text extraction throws.
        pdfDoc.close();
    }
}
/// <summary>
/// Extracts the text of a ".docx" file by reading the "word/document.xml"
/// entry of the archive and concatenating the inner text of every
/// "w:t" element it contains.
/// </summary>
/// <param name="docxFile">Path of the ".docx" file to read.</param>
/// <returns>
/// The concatenated text of the "w:t" elements, or an empty string when the
/// archive has no "word/document.xml" entry.
/// </returns>
private String GetDocxContent(String docxFile)
{
    StringBuilder content = new StringBuilder();
    using (ZipInputStream zipStream = new ZipInputStream(File.OpenRead(docxFile))) {
        ZipEntry theEntry;
        while ((theEntry = zipStream.GetNextEntry()) != null) {
            if (!theEntry.IsFile || !theEntry.Name.Equals("word/document.xml")) {
                continue;
            }

            XmlDocument xmlDoc = new XmlDocument();
            using (MemoryStream xmlBytes = new MemoryStream()) {
                // Collect the raw bytes first: decoding each chunk separately
                // would corrupt any multi-byte character that straddles a
                // buffer boundary.
                byte[] buffer = new byte[8192];
                int bytesRead;
                while ((bytesRead = zipStream.Read(buffer, 0, buffer.Length)) > 0) {
                    xmlBytes.Write(buffer, 0, bytesRead);
                }

                // Let the XML parser detect the encoding (and skip any BOM).
                xmlBytes.Position = 0;
                xmlDoc.Load(xmlBytes);
            }

            foreach (XmlNode node in xmlDoc.GetElementsByTagName("w:t")) {
                content.Append(node.InnerText);
            }

            // The main document part has been processed; no need to scan
            // the remaining entries of the archive.
            break;
        }
    }
    return content.ToString();
}
/// <summary>
/// Placeholder for legacy ".doc" text extraction: no content is read and
/// the argument is ignored.
/// </summary>
/// <param name="docFile">Path of the ".doc" file (currently unused).</param>
/// <returns>Always an empty string.</returns>
private String GetDocContent(String docFile)
{
    String content = String.Empty;
    return content;
}
Partager