1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
package fr.opsys.petra.indexation;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
//import org.pdfbox.searchengine.lucene.LucenePDFDocument; non compatible avec Lucene 2.4.0
public class IndexationPDFTest {
@Test public void monTest() throws IOException {
//1. Extraction contenu du PDF
File file = new File("D:\\text_extraction.pdf");
PDDocument pddoc = PDDocument.load(file);
PDFTextStripper stripper = new PDFTextStripper();
String contenu = stripper.getText(pddoc);
//2. Creation de l'index
Directory dir = FSDirectory.getDirectory("C:\\WhitespaceAnalyser100000"); // Emplacement de l'index
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
//3. Creation du document Lucene
Document doc = new Document();
doc.add(new Field("id", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("valeur", contenu, Field.Store.NO, Field.Index.ANALYZED));
//4. Indexation
writer.addDocument(doc);
writer.close();
} |
Partager