Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions src/webcrawler/ApplicationRunner.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
package webcrawler;

import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;

/**
 * Entry point of the web crawler application.
 */
public class ApplicationRunner {

  /**
   * Wires the page parser and page reader together and opens the crawler window.
   *
   * <p>The window is created on the Swing event dispatch thread, because Swing components
   * must be constructed and shown from that thread.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    javax.swing.SwingUtilities.invokeLater(() -> {
      HtmlPageParser pageParser = new HtmlPageParser();
      PageSourceReader pageSourceReader = new PageSourceReader();
      new WebCrawlerWindow(pageParser, pageSourceReader);
    });
  }
}
12 changes: 0 additions & 12 deletions src/webcrawler/WebCrawler.java

This file was deleted.

98 changes: 98 additions & 0 deletions src/webcrawler/WebCrawlerWindow.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package webcrawler;

import java.awt.ComponentOrientation;
import java.awt.Dimension;
import java.awt.FlowLayout;
import javax.swing.BorderFactory;
import javax.swing.BoxLayout;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.table.DefaultTableModel;
import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;
import webcrawler.workers.PageLoadWorker;

/**
 * Main application window.
 *
 * <p>It shows a URL input with a "Parse" button, a label for the title of the parsed page and
 * a two-column table ("URL", "Title"). Pressing the button starts a {@link PageLoadWorker}
 * for the entered URL, handing it the title label and the table model to fill in.
 */
public class WebCrawlerWindow extends JFrame {

  private final JTable table;
  private final JTextField location;
  private final JButton goButton;
  private final JLabel titleLabelInfo;
  private final JLabel titleLabel;
  private final JLabel urlLabel;

  private final DefaultTableModel model;

  private final PageSourceReader pageReader;
  private final HtmlPageParser pageParser;

  /**
   * Builds the window, lays out its components, registers the button action and shows it.
   *
   * @param pageParser parser handed to the workers started from this window
   * @param pageReader reader handed to the workers started from this window
   */
  public WebCrawlerWindow(HtmlPageParser pageParser, PageSourceReader pageReader) {
    this.pageParser = pageParser;
    this.pageReader = pageReader;
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

    this.urlLabel = new JLabel("URL: ");
    this.location = new JTextField("https://wikipedia.org");
    this.goButton = new JButton("Parse");
    this.table = new JTable();
    this.model = new DefaultTableModel(new String[]{"URL", "Title"}, 0);
    this.titleLabelInfo = new JLabel("Title: ");
    this.titleLabel = new JLabel();

    // initLayout() ends with pack(), which determines the final window size.
    initLayout();
    initActions();
    setVisible(true);
  }

  /**
   * Starts a page load for the URL currently in the text field when the button is pressed.
   */
  private void initActions() {
    // Action listeners are already invoked on the event dispatch thread, so the worker can be
    // started directly; the network work itself happens inside the worker.
    goButton.addActionListener(e -> {
      String url = location.getText();
      PageLoadWorker pageLoadWorker =
          new PageLoadWorker(url, titleLabel, model, pageReader, pageParser);
      pageLoadWorker.execute();
    });
  }

  /**
   * Arranges the URL row, the title row and the scrollable table vertically and packs the frame.
   */
  private void initLayout() {
    var rootPanel = getContentPane();

    // Top row: "URL:" label, address field and the button on one line.
    var locationPanel = new JPanel();
    locationPanel.setLayout(new BoxLayout(locationPanel, BoxLayout.LINE_AXIS));
    locationPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5));
    locationPanel.add(urlLabel);
    locationPanel.add(location);
    locationPanel.add(goButton);

    // Second row: left-aligned "Title:" caption followed by the title value.
    var titlePanel = new JPanel();
    titlePanel.setLayout(new FlowLayout(FlowLayout.LEFT));
    titlePanel.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT);
    titlePanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5));
    titlePanel.add(titleLabelInfo);
    titlePanel.add(titleLabel);

    // Remaining area: the table inside a scroll pane.
    table.setModel(model);
    var scrollPane = new JScrollPane(table);
    scrollPane.setPreferredSize(new Dimension(400, 400));

    var areaPanel = new JPanel();
    areaPanel.setLayout(new BoxLayout(areaPanel, BoxLayout.PAGE_AXIS));
    areaPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10));
    areaPanel.add(scrollPane);

    rootPanel.setLayout(new BoxLayout(rootPanel, BoxLayout.PAGE_AXIS));
    rootPanel.add(locationPanel);
    rootPanel.add(titlePanel);
    rootPanel.add(areaPanel);

    pack();
  }
}
22 changes: 22 additions & 0 deletions src/webcrawler/crawl/Html.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package webcrawler.crawl;

import java.util.Set;

/**
 * Immutable result of parsing one HTML page: its title and the set of links found on it.
 */
public class Html {

  private final String title;
  private final Set<String> links;

  /**
   * Creates a parse result.
   *
   * @param title page title
   * @param links links found on the page; copied, so later changes to the passed set do not
   *     affect this object; {@code null} is treated as "no links"
   */
  public Html(String title, Set<String> links) {
    this.title = title;
    // Defensive, read-only copy: instances may be handed between threads and must not change
    // underneath their readers. LinkedHashSet keeps the iteration order of the source set.
    this.links = links == null
        ? Set.of()
        : java.util.Collections.unmodifiableSet(new java.util.LinkedHashSet<>(links));
  }

  /**
   * @return the page title as passed to the constructor
   */
  public String getTitle() {
    return title;
  }

  /**
   * @return an unmodifiable view of the links; never {@code null}
   */
  public Set<String> getLinks() {
    return links;
  }
}
29 changes: 29 additions & 0 deletions src/webcrawler/crawl/HtmlBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package webcrawler.crawl;

import java.util.Set;

/**
 * Step-by-step builder for {@link Html}.
 *
 * <p>Until overridden, the title is {@code "no title"} and the link set is empty.
 */
public class HtmlBuilder {

  /** Title used by {@link #build()}; starts as the placeholder text. */
  private String title = "no title";

  /** Links used by {@link #build()}; starts as an empty set. */
  private Set<String> links = Set.of();

  /**
   * Creates a builder holding the default title and no links.
   */
  public HtmlBuilder() {
    // Nothing to do: the defaults come from the field initializers.
  }

  /**
   * Replaces the title.
   *
   * @param title title to put into the built object
   * @return this builder, for chaining
   */
  HtmlBuilder withTitle(String title) {
    this.title = title;
    return this;
  }

  /**
   * Replaces the link set.
   *
   * @param links links to put into the built object
   * @return this builder, for chaining
   */
  HtmlBuilder withLinks(Set<String> links) {
    this.links = links;
    return this;
  }

  /**
   * @return a new {@link Html} carrying the current title and links
   */
  Html build() {
    return new Html(this.title, this.links);
  }
}
57 changes: 57 additions & 0 deletions src/webcrawler/crawl/HtmlPageParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package webcrawler.crawl;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based extractor of the page title and anchor links from HTML text.
 *
 * <p>This is a lightweight scan, not a full HTML parser: it looks for the first
 * {@code <title>} element and for quoted {@code href} attributes of {@code <a>} tags.
 */
public class HtmlPageParser {

  /**
   * First {@code <title>} element. Case-insensitive, tolerant of attributes on the tag and of
   * titles spanning several lines; reluctant so it stops at the first closing tag.
   */
  private static final Pattern TITLE_PATTERN = Pattern.compile(
      "<title(?:\\s[^>]*)?>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

  /** Quoted {@code href} value of an {@code <a>} tag; group 2 is the value. */
  private static final Pattern LINK_PATTERN = Pattern.compile(
      "<a\\s+(?:[^>]*?\\s+)?href=([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE);

  /**
   * Creates a parser. The parser keeps no per-instance state.
   */
  public HtmlPageParser() {
  }

  /**
   * Extracts the title and the links from the given page text.
   *
   * @param siteText HTML text of the page
   * @param rootHost URL the page was loaded from; used to make relative links absolute
   * @return the title (or the builder's default when none is found) and the set of links
   */
  public Html parse(String siteText, String rootHost) {
    HtmlBuilder htmlBuilder = new HtmlBuilder();
    buildTitle(htmlBuilder, siteText);
    buildLinks(htmlBuilder, siteText, rootHost);

    return htmlBuilder.build();
  }

  /** Collects every matched link, normalized against {@code rootHost}, into the builder. */
  private void buildLinks(HtmlBuilder htmlBuilder, String siteText, String rootHost) {
    Matcher linksMatcher = LINK_PATTERN.matcher(siteText);
    HashSet<String> links = new HashSet<>();
    while (linksMatcher.find()) {
      links.add(normalize(linksMatcher.group(2), rootHost));
    }
    htmlBuilder.withLinks(links);
  }

  /** Sets the builder's title from the first title element, if the text has one. */
  private void buildTitle(HtmlBuilder htmlBuilder, String siteText) {
    Matcher titleMatcher = TITLE_PATTERN.matcher(siteText);
    if (titleMatcher.find()) {
      // Line breaks and indentation inside the element are layout, not part of the title.
      String title = titleMatcher.group(1).replaceAll("\\s+", " ").trim();
      htmlBuilder.withTitle(title);
    }
  }

  /**
   * Turns a link as written in the page into an absolute one.
   *
   * <p>Absolute http(s) links are returned unchanged, protocol-relative links ({@code //host/..})
   * get the scheme of {@code rootHost}, and everything else is resolved against
   * {@code rootHost} as a URI reference. If either value is not a valid URI, the link is
   * appended to {@code rootHost} with exactly one separating slash.
   */
  private String normalize(String link, String rootHost) {
    String trimmed = link.trim();
    if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) {
      return trimmed;
    }
    if (trimmed.startsWith("//")) {
      String protocol = rootHost.startsWith("https") ? "https:" : "http:";
      return protocol + trimmed;
    }
    try {
      java.net.URI base = java.net.URI.create(rootHost);
      if (!base.isAbsolute() || base.isOpaque()) {
        return joinWithSingleSlash(rootHost, trimmed);
      }
      String basePath = base.getPath();
      if (basePath == null || basePath.isEmpty()) {
        // URI.resolve glues a relative path straight onto the authority when the base has an
        // empty path ("http://hostfoo"), so give the base an explicit root path first.
        base = base.resolve("/");
      }
      return base.resolve(trimmed).toString();
    } catch (IllegalArgumentException e) {
      // Not parseable as a URI (e.g. contains spaces): fall back to plain concatenation.
      return joinWithSingleSlash(rootHost, trimmed);
    }
  }

  /** Concatenates base and link so that exactly one slash separates them. */
  private static String joinWithSingleSlash(String rootHost, String link) {
    String base = rootHost.endsWith("/")
        ? rootHost.substring(0, rootHost.length() - 1)
        : rootHost;
    String path = link.startsWith("/") ? link.substring(1) : link;
    return base + "/" + path;
  }
}
47 changes: 47 additions & 0 deletions src/webcrawler/source/PageSourceReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package webcrawler.source;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Fetches pages by URL: probes their content type and downloads their text.
 *
 * <p>Both operations are best-effort: I/O failures are reported on standard output and turned
 * into a negative result ({@code false} / empty string) instead of an exception.
 */
public class PageSourceReader {

  public static final String LINE_SEPARATOR = System.getProperty("line.separator");
  public static final String TEXT_HTML = "text/html";

  /** Connect and read timeout, so one unresponsive server cannot block a worker forever. */
  private static final int TIMEOUT_MILLIS = 10_000;

  /**
   * Tells whether the resource behind the URL declares an HTML content type.
   *
   * <p>For HTTP(S) URLs a HEAD request is used, so the body is not downloaded just to read
   * one header, and the connection is released afterwards.
   *
   * @param url URL to probe
   * @return {@code true} if the reported content type contains {@code text/html};
   *     {@code false} otherwise, including when the URL is malformed or cannot be opened
   */
  public boolean isHtml(String url) {
    URLConnection urlConnection = null;
    try {
      urlConnection = openConnection(url);
      if (urlConnection instanceof java.net.HttpURLConnection) {
        ((java.net.HttpURLConnection) urlConnection).setRequestMethod("HEAD");
      }
      String contentType = urlConnection.getContentType();
      return contentType != null && contentType.contains(TEXT_HTML);
    } catch (IOException e) {
      System.out.println("Error " + e);
      return false;
    } finally {
      if (urlConnection instanceof java.net.HttpURLConnection) {
        ((java.net.HttpURLConnection) urlConnection).disconnect();
      }
    }
  }

  /**
   * Downloads the resource behind the URL as text.
   *
   * <p>The text is decoded with the charset named in the content type, or UTF-8 when none is
   * given or it is not supported. Every line of the result, including the last one, ends with
   * {@link #LINE_SEPARATOR}.
   *
   * @param url URL to read
   * @return the page text, or an empty string when the URL is malformed or reading fails
   */
  public String readPageSource(String url) {
    try {
      URLConnection connection = openConnection(url);
      java.nio.charset.Charset charset = charsetOf(connection.getContentType());
      try (
          InputStream inputStream = connection.getInputStream();
          BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, charset))
      ) {
        final StringBuilder stringBuilder = new StringBuilder();
        String nextLine;
        while ((nextLine = reader.readLine()) != null) {
          stringBuilder.append(nextLine);
          stringBuilder.append(LINE_SEPARATOR);
        }
        return stringBuilder.toString();
      }
    } catch (IOException e) {
      System.out.println("Error on parse page " + url + ": " + e);
      return "";
    }
  }

  /** Opens a connection to the URL with connect and read timeouts applied. */
  private static URLConnection openConnection(String url) throws IOException {
    URLConnection connection = new URL(url).openConnection();
    connection.setConnectTimeout(TIMEOUT_MILLIS);
    connection.setReadTimeout(TIMEOUT_MILLIS);
    return connection;
  }

  /** Extracts the {@code charset=} parameter of a content type; defaults to UTF-8. */
  private static java.nio.charset.Charset charsetOf(String contentType) {
    final String marker = "charset=";
    if (contentType != null) {
      int start = contentType.toLowerCase(java.util.Locale.ROOT).indexOf(marker);
      if (start >= 0) {
        String name = contentType.substring(start + marker.length());
        int end = name.indexOf(';');
        if (end >= 0) {
          name = name.substring(0, end);
        }
        name = name.replace("\"", "").trim();
        try {
          return java.nio.charset.Charset.forName(name);
        } catch (IllegalArgumentException ignored) {
          // Unknown or malformed charset name: use the default below.
        }
      }
    }
    return java.nio.charset.StandardCharsets.UTF_8;
  }
}
47 changes: 47 additions & 0 deletions src/webcrawler/workers/PageLoadWorker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package webcrawler.workers;

import java.util.concurrent.ExecutionException;
import javax.swing.JLabel;
import javax.swing.SwingWorker;
import javax.swing.table.DefaultTableModel;
import webcrawler.crawl.Html;
import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;

/**
 * Background task that downloads and parses one page, then shows its title and starts
 * a {@link ParseLinksWorker} for the links found on it.
 */
public class PageLoadWorker extends SwingWorker<Html, Void> {

  private final String url;
  private final JLabel titleLabel;

  private final DefaultTableModel linksTable;
  private final PageSourceReader pageReader;
  private final HtmlPageParser pageParser;

  /**
   * @param location URL of the page to load
   * @param titleLabel label that receives the page title once loading has finished
   * @param linksTable table model passed on to the link worker
   * @param pageReader reader used to download the page
   * @param pageParser parser used to extract the title and links
   */
  public PageLoadWorker(String location, JLabel titleLabel, DefaultTableModel linksTable,
      PageSourceReader pageReader, HtmlPageParser pageParser) {
    this.url = location;
    this.titleLabel = titleLabel;
    this.linksTable = linksTable;
    this.pageReader = pageReader;
    this.pageParser = pageParser;
  }

  /**
   * Downloads the page text and parses it, off the event dispatch thread.
   *
   * @return the parsed page
   */
  @Override
  protected Html doInBackground() throws Exception {
    final String siteText = pageReader.readPageSource(url);
    return pageParser.parse(siteText, url);
  }

  /**
   * Publishes the result: sets the title label and starts crawling the page's links.
   */
  @Override
  protected void done() {
    try {
      Html html = get();
      titleLabel.setText(html.getTitle());
      ParseLinksWorker parseLinksWorker =
          new ParseLinksWorker(html, linksTable, pageReader, pageParser);
      parseLinksWorker.execute();
    } catch (InterruptedException e) {
      // Restore the flag so code further up the stack can see the interruption.
      Thread.currentThread().interrupt();
    } catch (ExecutionException e) {
      // The background work failed; this is not an interruption, so only report it.
      System.out.println("Error on load page " + url + ": " + e.getCause());
    }
  }
}
37 changes: 37 additions & 0 deletions src/webcrawler/workers/ParseLinksWorker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package webcrawler.workers;

import javax.swing.SwingWorker;
import javax.swing.table.DefaultTableModel;
import webcrawler.crawl.Html;
import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;

/**
 * Background task that visits every link of an already parsed page and adds one table row
 * (link URL, linked page title) for each link that points to an HTML document.
 */
public class ParseLinksWorker extends SwingWorker<Void, Void> {

  private final Html rootPage;
  private final DefaultTableModel linksTable;

  private final PageSourceReader pageReader;
  private final HtmlPageParser pageParser;

  /**
   * @param rootPage parsed page whose links are visited
   * @param linksTable table model that receives one row per HTML link
   * @param pageReader reader used to probe and download the linked pages
   * @param pageParser parser used to extract the linked pages' titles
   */
  public ParseLinksWorker(Html rootPage, DefaultTableModel linksTable,
      PageSourceReader pageReader, HtmlPageParser pageParser) {
    this.rootPage = rootPage;
    this.linksTable = linksTable;
    this.pageReader = pageReader;
    this.pageParser = pageParser;
  }

  /**
   * Probes, downloads and parses each link in turn; stops early if the worker is cancelled.
   *
   * @return always {@code null}; results are delivered through the table model
   */
  @Override
  protected Void doInBackground() throws Exception {
    for (String link : rootPage.getLinks()) {
      if (isCancelled()) {
        break;
      }
      if (!pageReader.isHtml(link)) {
        continue;
      }
      final String linkText = pageReader.readPageSource(link);
      final Html htmlLink = pageParser.parse(linkText, link);
      final Object[] row = {link, htmlLink.getTitle()};
      // Swing models are not thread-safe: the row is added on the event dispatch thread, not
      // on this background thread.
      javax.swing.SwingUtilities.invokeLater(() -> linksTable.addRow(row));
    }
    return null;
  }
}