In this post, I am going to show you a Java class that collects all the URLs linked from a web page, much like a simple web crawler does.
Here is the Java class:
- package client;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.net.MalformedURLException;
- import java.net.URI;
- import java.net.URISyntaxException;
- import java.net.URL;
- import java.util.Enumeration;
- import javax.swing.text.MutableAttributeSet;
- import javax.swing.text.html.HTML;
- import javax.swing.text.html.HTMLEditorKit;
- import javax.swing.text.html.parser.ParserDelegator;
/**
 * Lists the targets of all anchor ({@code <a href="...">}) tags found on one web page.
 *
 * <p>The page is downloaded, parsed with Swing's HTML parser, and every link is
 * resolved against the page address and printed to standard output, one per line.
 * Problems (unreachable page, malformed link) are reported on standard error.
 */
public class GetAllUrls {

    public GetAllUrls() {
        super();
    }

    /**
     * Downloads the configured page and prints every link it contains.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String name = "http://www.awasthiashish.com";
        Reader r = null;
        try {
            InputStream in = new URL(name).openStream();
            // Decode with an explicit charset so the result does not depend on the
            // platform default of the machine running the program.
            r = new InputStreamReader(in, "UTF-8");
            new ParserDelegator().parse(r, new HTMLEditorKit.ParserCallback() {
                @Override
                public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                    if (t != HTML.Tag.A) {
                        return;
                    }
                    // The parser stores known attributes under HTML.Attribute keys,
                    // so the href can be looked up directly instead of scanning names.
                    Object href = a.getAttribute(HTML.Attribute.HREF);
                    if (href == null) {
                        return; // anchor without a target, e.g. <a name="...">
                    }
                    String resolved = resolveLink(name, href.toString());
                    if (resolved == null) {
                        System.err.println("Skipping malformed link: " + href);
                    } else {
                        System.out.println(resolved);
                    }
                }
            }, true); // true: ignore a charset directive inside the document
        } catch (MalformedURLException e) {
            System.err.println("Invalid page URL " + name + ": " + e.getMessage());
        } catch (IOException e) {
            System.err.println("Could not read " + name + ": " + e);
        } finally {
            if (r != null) {
                try {
                    r.close();
                } catch (IOException e) {
                    System.err.println("Could not close the connection to " + name + ": " + e);
                }
            }
        }
    }

    /**
     * Resolves a link found on a page against the address of that page.
     *
     * <p>Absolute links (including non-HTTP ones such as {@code mailto:}) are returned
     * unchanged; relative paths, {@code ../} segments, protocol-relative
     * ({@code //host/path}) links and fragment-only links are resolved by
     * {@link URI#resolve(URI)} rather than by string concatenation.
     *
     * @param base address of the page the link was found on
     * @param href raw value of the {@code href} attribute; surrounding whitespace is ignored
     * @return the resolved link as a string, or {@code null} if {@code href} is
     *         {@code null} or if {@code base} or {@code href} is not a valid URI
     */
    static String resolveLink(String base, String href) {
        if (href == null) {
            return null;
        }
        try {
            URI baseUri = new URI(base);
            String basePath = baseUri.getRawPath();
            if (basePath != null && basePath.length() == 0) {
                // "http://host" has an empty path; give it the root path so that
                // relative links are not glued directly onto the host name.
                baseUri = baseUri.resolve("/");
            }
            return baseUri.resolve(new URI(href.trim())).toString();
        } catch (URISyntaxException e) {
            return null;
        }
    }
}
And here is the output on the console:
Here you can see all the URLs linked from the home page of this website.
Cheers 🙂 Happy Learning