Mega Code Archive

 
Categories / Java Tutorial / Swing
 

Iterating Across HTML Documents for Links

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Loads an HTML page into an {@link HTMLDocument} and prints every anchor
 * ({@code <a>}) found in it as {@code href - link text}, one per line.
 */
public class MainClass {

  /** Page that is scanned when no URL is given on the command line. */
  private static final String DEFAULT_URL = "http://www.google.com";

  /**
   * Fetches a page, parses it, and prints its links to standard output.
   *
   * @param args optional; {@code args[0]}, when present, is the URL of the page
   *     to scan. Without arguments {@value #DEFAULT_URL} is used.
   * @throws Exception if the URL is malformed, the page cannot be read, or the
   *     parsed content cannot be inserted into / read back from the document
   */
  public static void main(String[] args) throws Exception {
    String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
    URLConnection connection = new URL(address).openConnection();

    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
    HTMLEditorKit.Parser parser = new ParserDelegator();
    HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);

    // try-with-resources closes the reader chain (and the underlying
    // connection stream) even when parsing fails.
    try (InputStream is = connection.getInputStream();
        BufferedReader br =
            new BufferedReader(new InputStreamReader(is, charsetOf(connection)))) {
      // 'true' tells the parser to ignore any charset directive inside the
      // page; the bytes are already decoded by the reader above.
      parser.parse(br, callback, true);
    }

    // The document's reader buffers parsed elements and only inserts them in
    // batches. flush() must be the last call on it so that the remaining
    // buffered content (for a small page: all of it) reaches the document.
    callback.flush();

    for (HTMLDocument.Iterator iterator = htmlDoc.getIterator(HTML.Tag.A);
        iterator.isValid();
        iterator.next()) {
      AttributeSet attributes = iterator.getAttributes();
      // Prints "null" for anchors that carry no href attribute.
      Object href = attributes.getAttribute(HTML.Attribute.HREF);

      int startOffset = iterator.getStartOffset();
      int length = iterator.getEndOffset() - startOffset;
      String text = htmlDoc.getText(startOffset, length);

      System.out.println(href + " - " + text);
    }
  }

  /**
   * Picks the charset used to decode the response body.
   *
   * @param connection the connection whose Content-Type header is inspected
   * @return the charset named by the header's {@code charset=} parameter, or
   *     UTF-8 when the header is missing, has no such parameter, or names a
   *     charset this JVM does not know
   */
  private static Charset charsetOf(URLConnection connection) {
    final String key = "charset=";
    String contentType = connection.getContentType();
    if (contentType != null) {
      for (String part : contentType.split(";")) {
        String parameter = part.trim();
        if (parameter.regionMatches(true, 0, key, 0, key.length())) {
          String name = parameter.substring(key.length()).replace("\"", "").trim();
          try {
            return Charset.forName(name);
          } catch (IllegalArgumentException ignored) {
            // Illegal or unsupported charset name: use the fallback below.
          }
        }
      }
    }
    return StandardCharsets.UTF_8;
  }
}

/*
 * Sample output recorded with the original listing (one "href - link text"
 * pair per line; the listing continues below this comment):
 *
 * url?sa=p&pref=ig&pval=3&q=http://www.google.ca/ig%3Fhl%3Den&usg=__o-KrRDBI3nbRElKzYEMqfOl3_t0= - Personalize this page
 * https://www.google.com/accounts/Login?continue=http://www.google.ca/&hl=en - Sign in
 * http://images.google.ca/imghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wi - Images
 * http://groups.google.ca/grphp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wg - Groups
 * http://news.google.ca/nwshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wn - News
 */
/maps?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wl - Maps
http://scholar.google.com/schhp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=ws - Scholar
/intl/en/options/ - more »
/advanced_search?hl=en - Advanced Search
/preferences?hl=en - Preferences
/language_tools?hl=en - Language Tools