1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
| import java.io.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * Command-line utility that reads an HTML document and prints the target of
 * every {@code <a href="...">} link it contains, one per line, on standard output.
 *
 * <p>The document is located by a URI, which can be either a file
 * (e.g. {@code file:///c:/dir/file.html}) or a URL
 * (e.g. {@code http://host.com/page.html}).
 */
public class LinksRetrieve {

    /** Page scanned when no URI is given on the command line. */
    private static final String DEFAULT_URI =
            "http://www.paris-turf.com/pid56-reunion.html?date=2003-09-02/";

    /**
     * Program entry point: prints all HREF links of one HTML document.
     *
     * <p>Problems (malformed URI, unreachable host, unreadable document) are
     * reported on standard error instead of being silently ignored.
     *
     * @param args optional; {@code args[0]}, when present, is the absolute URI of
     *             the document to scan. Without arguments the built-in default
     *             page is scanned.
     */
    public static void main(String args[]) {
        String uriStr = (args != null && args.length > 0) ? args[0] : DEFAULT_URI;

        final URL url;
        try {
            // URI.toURL() throws IllegalArgumentException for a non-absolute URI and
            // MalformedURLException for an unknown protocol.
            url = new URI(uriStr).toURL();
        } catch (URISyntaxException | IllegalArgumentException | MalformedURLException e) {
            System.err.println("Cannot use '" + uriStr + "' as a document location: " + e);
            return;
        }

        try {
            printLinks(parse(url));
        } catch (IOException | BadLocationException e) {
            System.err.println("Failed to read " + url + ": " + e);
        }
    }

    /**
     * Loads the content behind {@code url} into a Swing HTML document.
     *
     * @param url location of the HTML content
     * @return the document holding whatever the parser produced
     * @throws IOException if the content cannot be opened or read
     * @throws BadLocationException if the kit cannot insert at offset 0
     */
    private static HTMLDocument parse(URL url) throws IOException, BadLocationException {
        EditorKit kit = new HTMLEditorKit();
        HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
        // Ask the parser to ignore charset declarations found inside the page.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        URLConnection conn = url.openConnection();
        // The bytes are decoded with the platform default charset, as before.
        try (Reader rd = new InputStreamReader(conn.getInputStream())) {
            kit.read(rd, doc, 0);
        } catch (ChangedCharSetException e) {
            // The page asked for another charset: print its name and carry on with
            // whatever was parsed before the directive was met.
            String spec = e.getCharSetSpec();
            System.out.println(spec.substring(spec.indexOf("=") + 1).trim());
        }
        return doc;
    }

    /**
     * Prints the HREF attribute of every A element of {@code doc} that has one.
     *
     * @param doc parsed HTML document to scan
     */
    private static void printLinks(HTMLDocument doc) {
        for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
            // getAttributes() only promises an AttributeSet, so neither downcast it
            // nor assume it is non-null.
            AttributeSet attrs = it.getAttributes();
            if (attrs == null) {
                continue;
            }
            Object href = attrs.getAttribute(HTML.Attribute.HREF);
            if (href != null) {
                System.out.println(href);
            }
        }
    }
}
Partager