1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Minimal recursive link crawler.
 *
 * <p>Starting from one page, it parses the HTML with the Swing HTML parser, walks every
 * {@code <a>} element, and for each link target not seen before it prints the absolute URL,
 * appends it to a log file and then crawls that target in turn.
 *
 * <p>Limitations: the crawl is depth-first and recursive with no depth or host limit, so a
 * large site can exhaust the call stack; the class keeps its state in static fields and is
 * not thread-safe.
 */
public class robot {

    /** Page crawled when no start URL is given on the command line. */
    private static final String DEFAULT_START_URL = "http://www.bde-ges.com";

    /** File to which every newly discovered link is appended, one per line. */
    private static final String LOG_FILE = "C:/Users/log.txt";

    private static final Logger LOGGER = Logger.getLogger(robot.class.getName());

    /** Absolute, fragment-free URLs that were already fetched or scheduled for fetching. */
    private static final Set<String> visitedURL = new HashSet<String>();

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is used as the start URL when present, otherwise
     *             the built-in default page is crawled
     */
    public static void main(String[] args) {
        String start = (args != null && args.length > 0) ? args[0] : DEFAULT_START_URL;
        collectURL(start);
    }

    /**
     * Fetches one page and recursively follows every link on it that has not been seen yet.
     * Failures to fetch or parse the page are logged and end the crawl of this page only.
     *
     * @param myurl absolute URL of the page to crawl
     */
    private static void collectURL(String myurl) {
        try {
            URL pageUrl = new URL(myurl);
            // Record the page itself so that links pointing back to it are not re-crawled.
            visitedURL.add(withoutFragment(pageUrl));

            // The page is fully read and its stream closed before any recursion starts,
            // so at most one connection is open at a time.
            HTMLDocument doc = loadDocument(pageUrl);

            // Walk the anchor elements.
            for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
                AttributeSet attrs = it.getAttributes();
                Object href = (attrs == null) ? null : attrs.getAttribute(HTML.Attribute.HREF);
                if (!(href instanceof String)) {
                    continue; // e.g. <a name="..."> without a target
                }
                String target = resolve(pageUrl, (String) href);
                // Set.add returns false when the URL was already known.
                if (target != null && visitedURL.add(target)) {
                    System.out.println(target);
                    ecrire(LOG_FILE, target);
                    collectURL(target);
                }
            }
        } catch (BadLocationException | IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to crawl " + myurl, ex);
        }
    }

    /**
     * Downloads and parses one HTML page, closing the network stream before returning.
     *
     * @param pageUrl page to download
     * @return the parsed document
     * @throws IOException          if the page cannot be opened or read
     * @throws BadLocationException if the parser cannot insert content into the document
     */
    private static HTMLDocument loadDocument(URL pageUrl) throws IOException, BadLocationException {
        URLConnection connection = pageUrl.openConnection();
        EditorKit kit = new HTMLEditorKit();
        HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
        // Without this property the Swing HTML parser raises ChangedCharSetException when a
        // page declares a charset in a <meta> tag.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
        // UTF-8 is used explicitly so decoding does not depend on the platform default.
        try (Reader reader =
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)) {
            kit.read(reader, doc, 0);
        }
        return doc;
    }

    /**
     * Turns an href found on a page into an absolute URL string without its fragment.
     *
     * @param base URL of the page on which the href was found
     * @param href raw attribute value, possibly relative
     * @return the normalised absolute URL, or {@code null} if the href cannot form a URL
     *         (for instance a {@code javascript:} pseudo-link)
     */
    private static String resolve(URL base, String href) {
        try {
            return withoutFragment(new URL(base, href.trim()));
        } catch (MalformedURLException ex) {
            LOGGER.log(Level.FINE, "Skipping unusable link " + href, ex);
            return null;
        }
    }

    /** Returns the external form of {@code url} with any {@code #fragment} removed. */
    private static String withoutFragment(URL url) {
        String external = url.toExternalForm();
        int hash = external.indexOf('#');
        return (hash < 0) ? external : external.substring(0, hash);
    }

    /**
     * Appends one line of text to a file, creating the file if necessary.
     * NOTE(review): the original source called this helper without defining it; appending a
     * line is the assumed intent — confirm. A write failure is logged and does not stop the crawl.
     *
     * @param path file to append to
     * @param text line to write
     */
    private static void ecrire(String path, String text) {
        try (PrintWriter out = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(path, true), StandardCharsets.UTF_8))) {
            out.println(text);
            // PrintWriter swallows write errors; checkError() flushes and reports them.
            if (out.checkError()) {
                LOGGER.log(Level.WARNING, "Error while writing to " + path);
            }
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Cannot append to " + path, ex);
        }
    }
}
Partager