Please disable your adblock and script blockers to view this page

Search this blog

Showing posts with label Web Crawler. Show all posts
Showing posts with label Web Crawler. Show all posts

Wednesday, 5 December 2018

Get All URLs From a Website Using Java Code


In this post, I am going to show you a Java class that we can use to get all the URLs linked from a web page, much like a simple web crawler does.

Here is the Java class:





  1. package client;
  2. import java.io.IOException;
  3. import java.io.InputStream;
  4. import java.io.InputStreamReader;
  5. import java.io.Reader;
  6. import java.net.MalformedURLException;
  7. import java.net.URI;
  8. import java.net.URISyntaxException;
  9. import java.net.URL;
  10. import java.util.Enumeration;
  11. import javax.swing.text.MutableAttributeSet;
  12. import javax.swing.text.html.HTML;
  13. import javax.swing.text.html.HTMLEditorKit;
  14. import javax.swing.text.html.parser.ParserDelegator;
  /**
   * Prints every hyperlink found on a single web page.
   *
   * <p>The page is downloaded, parsed with the Swing HTML parser, and the
   * {@code href} of each {@code <a>} start tag is resolved against the page URL
   * and written to standard output, one link per line. Problems (bad page URL,
   * network failure, malformed link) are reported on standard error instead of
   * being silently ignored.
   */
  public class GetAllUrls {

      /** Page scanned when no URL is supplied on the command line. */
      private static final String DEFAULT_URL = "http://www.awasthiashish.com";

      /** Creates an instance; the class has no state, all work happens in {@link #main}. */
      public GetAllUrls() {
      }

      /**
       * Downloads one page and prints all links found on it.
       *
       * @param args optional; {@code args[0]} is the URL of the page to scan.
       *             When absent, {@value #DEFAULT_URL} is used.
       */
      public static void main(String[] args) {
          String pageUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

          // try-with-resources closes the reader (and the underlying stream)
          // on every exit path. The charset is fixed so decoding does not
          // depend on the platform default.
          try (Reader reader = new InputStreamReader(
                  new URL(pageUrl).openStream(),
                  java.nio.charset.StandardCharsets.UTF_8)) {
              // 'true' tells the parser to ignore any charset declared inside the document.
              new ParserDelegator().parse(reader, new LinkPrinter(pageUrl), true);
          } catch (MalformedURLException e) {
              System.err.println("Invalid page URL '" + pageUrl + "': " + e.getMessage());
          } catch (IOException e) {
              System.err.println("Could not read '" + pageUrl + "': " + e);
          }
      }

      /**
       * Resolves a link found on a page against the URL of that page.
       *
       * <p>Absolute links (any scheme, any letter case) are returned unchanged;
       * relative, root-relative and protocol-relative links are resolved using
       * {@link URI#resolve(URI)}.
       *
       * @param baseUrl URL of the page the link was found on
       * @param href    raw value of the {@code href} attribute
       * @return the resolved link, or {@code null} if either argument is
       *         {@code null} or is not a syntactically valid URI
       */
      static String resolveLink(String baseUrl, String href) {
          if (baseUrl == null || href == null) {
              return null;
          }
          try {
              URI base = new URI(baseUrl.trim());
              String basePath = base.getRawPath();
              // URI.resolve() glues a relative path directly onto the authority
              // when the base path is empty ("http://host" + "x" -> "http://hostx"),
              // so give such a base an explicit root path first.
              if (!base.isOpaque()
                      && base.getRawAuthority() != null
                      && (basePath == null || basePath.isEmpty())) {
                  base = new URI(base.getScheme(), base.getAuthority(), "/",
                          base.getQuery(), base.getFragment());
              }
              return base.resolve(new URI(href.trim())).toString();
          } catch (URISyntaxException e) {
              return null;
          }
      }

      /** Parser callback that prints the resolved target of every anchor tag. */
      private static final class LinkPrinter extends HTMLEditorKit.ParserCallback {

          /** URL of the page being parsed; relative links are resolved against it. */
          private final String baseUrl;

          LinkPrinter(String baseUrl) {
              this.baseUrl = baseUrl;
          }

          @Override
          public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int pos) {
              if (tag != HTML.Tag.A) {
                  return;
              }
              Object href = attributes.getAttribute(HTML.Attribute.HREF);
              if (href == null) {
                  return; // anchor without a target, e.g. <a name="...">
              }
              String resolved = resolveLink(baseUrl, href.toString());
              if (resolved == null) {
                  System.err.println("Skipping malformed link: " + href);
              } else {
                  System.out.println(resolved);
              }
          }
      }
  }

And here is the output on the console:

Here you can see all the URLs linked from the home page of this website.


Cheers 🙂 Happy Learning