Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions src/webcrawler/ApplicationRunner.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
package webcrawler;

import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;

/**
 * Application entry point for the web crawler.
 */
public class ApplicationRunner {
// NOTE(review): this first main appears to be the pre-change version rendered by the diff view
// (it instantiates WebCrawler, whose file is listed as deleted) — confirm before compiling.
public static void main(String[] args) {
new WebCrawler();
}

/**
 * Creates the HTML page parser and the page source reader and passes both to a new
 * WebCrawlerWindow.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
HtmlPageParser pageParser = new HtmlPageParser();
PageSourceReader pageSourceReader = new PageSourceReader();
new WebCrawlerWindow(pageParser, pageSourceReader);
}
}
12 changes: 0 additions & 12 deletions src/webcrawler/WebCrawler.java

This file was deleted.

121 changes: 121 additions & 0 deletions src/webcrawler/WebCrawlerWindow.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package webcrawler;

import java.awt.ComponentOrientation;
import java.awt.Dimension;
import java.awt.FlowLayout;
import java.nio.file.Paths;
import javax.swing.BorderFactory;
import javax.swing.BoxLayout;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.table.DefaultTableModel;
import webcrawler.crawl.HtmlPageParser;
import webcrawler.source.PageSourceReader;
import webcrawler.workers.ExportTableWorker;
import webcrawler.workers.PageLoadWorker;

public class WebCrawlerWindow extends JFrame {

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Emptiness

// Table widget; it is bound to tableModel in initLayout().
private final JTable table;
// Text field whose content is read as the URL when the "Parse" button is pressed.
private final JTextField location;
// Button that starts a PageLoadWorker.
private final JButton goButton;
// Static "Title: " caption shown next to titleLabel.
private final JLabel titleLabelInfo;
// Label handed to PageLoadWorker; presumably updated with the page title — verify in the worker.
private final JLabel titleLabel;
// Static "URL: " caption shown next to the location field.
private final JLabel urlLabel;
// Static "Export: " caption shown next to the export path field.
private final JLabel exportLabel;
// Text field whose content is read as the export file path when "Save" is pressed.
private final JTextField exportLocation;
// Button that starts an ExportTableWorker.
private final JButton exportButton;

// Model with the columns "URL" and "Title"; shared with both workers.
private final DefaultTableModel tableModel;

// Collaborators passed to every PageLoadWorker.
private final PageSourceReader pageReader;
private final HtmlPageParser pageParser;

/**
 * Creates the crawler window, builds all of its widgets, lays them out, registers the button
 * actions and finally makes the window visible.
 *
 * @param pageParser parser handed to the page-loading worker
 * @param pageReader reader handed to the page-loading worker
 */
public WebCrawlerWindow(HtmlPageParser pageParser, PageSourceReader pageReader) {
    // Collaborators used when the "Parse" button is pressed.
    this.pageReader = pageReader;
    this.pageParser = pageParser;

    // URL input row.
    urlLabel = new JLabel("URL: ");
    location = new JTextField("https://wikipedia.org");
    goButton = new JButton("Parse");

    // Title row.
    titleLabelInfo = new JLabel("Title: ");
    titleLabel = new JLabel();

    // Result table and its backing model (two columns, no rows yet).
    tableModel = new DefaultTableModel(new String[] {"URL", "Title"}, 0);
    table = new JTable();

    // Export row.
    exportLabel = new JLabel("Export: ");
    exportLocation = new JTextField("export/links.txt");
    exportButton = new JButton("Save");

    // Frame configuration.
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    setSize(600, 600);

    initLayout();
    initActions();
    setVisible(true);
}

/**
 * Registers the button actions.
 *
 * <p>"Parse" starts a {@code PageLoadWorker} for the URL currently in the location field;
 * "Save" starts an {@code ExportTableWorker} for the path currently in the export field.
 */
private void initActions() {
    // Swing invokes action listeners on the event dispatch thread already, so the text fields
    // can be read and the workers started directly; an extra SwingUtilities.invokeLater hop
    // is not needed.
    goButton.addActionListener(e -> {
        String url = location.getText();
        PageLoadWorker pageLoadWorker =
            new PageLoadWorker(url, titleLabel, tableModel, pageReader, pageParser);
        pageLoadWorker.execute();
    });

    exportButton.addActionListener(e -> {
        String file = exportLocation.getText();
        ExportTableWorker exportWorker = new ExportTableWorker(Paths.get(file), tableModel);
        exportWorker.execute();
    });
}

/**
 * Arranges the widgets top to bottom in the content pane: URL row, title row, scrollable
 * result table, export row. Ends by packing the frame.
 */
private void initLayout() {
    var urlRow = horizontalRow(urlLabel, location, goButton);

    table.setModel(tableModel);

    var titleRow = new JPanel();
    titleRow.setLayout(new FlowLayout(FlowLayout.LEFT));
    titleRow.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT);
    titleRow.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5));
    titleRow.add(titleLabelInfo);
    titleRow.add(titleLabel);

    var tableScroll = new JScrollPane(table);
    tableScroll.setPreferredSize(new Dimension(400, 400));

    var tableArea = new JPanel();
    tableArea.setLayout(new BoxLayout(tableArea, BoxLayout.PAGE_AXIS));
    tableArea.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10));
    tableArea.add(tableScroll);

    var exportRow = horizontalRow(exportLabel, exportLocation, exportButton);

    var content = getContentPane();
    content.setLayout(new BoxLayout(content, BoxLayout.PAGE_AXIS));
    content.add(urlRow);
    content.add(titleRow);
    content.add(tableArea);
    content.add(exportRow);

    pack();
}

/**
 * Builds a single-line panel (5 px empty border) holding a caption, a text field and a button,
 * in that order.
 */
private static JPanel horizontalRow(JLabel caption, JTextField field, JButton button) {
    var row = new JPanel();
    row.setLayout(new BoxLayout(row, BoxLayout.LINE_AXIS));
    row.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5));
    row.add(caption);
    row.add(field);
    row.add(button);
    return row;
}
}
22 changes: 22 additions & 0 deletions src/webcrawler/crawl/Html.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package webcrawler.crawl;

import java.util.Set;

/**
 * Immutable result of parsing one HTML page: its title and the set of links found in it.
 */
public class Html {

    private final String title;
    private final Set<String> links;

    /**
     * Creates a parse result.
     *
     * @param title page title; stored as given
     * @param links links found on the page; a snapshot is taken, so later changes to the
     *     argument do not affect this object. {@code null} is treated as "no links".
     */
    Html(String title, Set<String> links) {
        this.title = title;
        // Copy into an unmodifiable set so neither the builder's caller nor a consumer of
        // getLinks() can change this object after construction. LinkedHashSet keeps the
        // iteration order of the source set. Fully qualified names are used because this
        // file only imports java.util.Set.
        this.links = links == null
            ? java.util.Collections.<String>emptySet()
            : java.util.Collections.unmodifiableSet(new java.util.LinkedHashSet<>(links));
    }

    /**
     * @return the page title as passed to the constructor
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return an unmodifiable view of the links; never {@code null}
     */
    public Set<String> getLinks() {
        return links;
    }
}
28 changes: 28 additions & 0 deletions src/webcrawler/crawl/HtmlBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package webcrawler.crawl;

import java.util.Set;

/**
 * Fluent builder for {@link Html}. Unless overridden, the title is "no title" and the link set
 * is empty.
 */
public class HtmlBuilder {

    /** Title used when {@link #withTitle(String)} is never called. */
    private String title = "no title";

    /** Links used when {@link #withLinks(Set)} is never called. */
    private Set<String> links = Set.of();

    /** Creates a builder holding the default title and an empty link set. */
    HtmlBuilder() {
    }

    /**
     * Replaces the title.
     *
     * @param title title to use for the built object
     * @return this builder
     */
    HtmlBuilder withTitle(String title) {
        this.title = title;
        return this;
    }

    /**
     * Replaces the link set.
     *
     * @param links links to use for the built object
     * @return this builder
     */
    HtmlBuilder withLinks(Set<String> links) {
        this.links = links;
        return this;
    }

    /**
     * @return a new {@link Html} created from the current title and links
     */
    Html build() {
        return new Html(title, links);
    }
}
57 changes: 57 additions & 0 deletions src/webcrawler/crawl/HtmlPageParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package webcrawler.crawl;

import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based extractor for the title and the anchor links of an HTML document.
 */
public class HtmlPageParser {

    private final Pattern titlePattern;
    private final Pattern linkPattern;

    /** Compiles the title and link patterns once per parser instance. */
    public HtmlPageParser() {
        // Case-insensitive so <TITLE> matches, DOTALL so a title spanning several lines
        // matches, reluctant so the match stops at the first closing tag, and an optional
        // attribute section so <title id="x"> matches too.
        this.titlePattern = Pattern.compile(
            "<title(?:\\s[^>]*)?>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        this.linkPattern = Pattern.compile("<a\\s+(?:[^>]*?\\s+)?href=([\"'])(.*?)\\1");
    }

    /**
     * Parses a page.
     *
     * @param siteText HTML source of the page; {@code null} is treated as an empty document
     * @param rootHost prefix used to turn relative links into absolute ones; its scheme also
     *     decides the scheme of protocol-relative ("//host/...") links
     * @return the extracted title (or the builder's default when no title tag is found) and
     *     the set of normalized links
     */
    public Html parse(String siteText, String rootHost) {
        String text = siteText == null ? "" : siteText;
        HtmlBuilder htmlBuilder = new HtmlBuilder();

        buildTitle(htmlBuilder, text);
        buildLinks(htmlBuilder, text, rootHost);

        return htmlBuilder.build();
    }

    /** Collects the href value of every matched anchor, normalized, into the builder. */
    private void buildLinks(HtmlBuilder htmlBuilder, String siteText, String rootHost) {
        Matcher linksMatcher = linkPattern.matcher(siteText);
        HashSet<String> links = new HashSet<>();
        while (linksMatcher.find()) {
            // Group 1 is the quote character, group 2 the href value.
            links.add(normalize(linksMatcher.group(2), rootHost));
        }
        htmlBuilder.withLinks(links);
    }

    /** Sets the builder's title from the first title tag, if there is one. */
    private void buildTitle(HtmlBuilder htmlBuilder, String siteText) {
        Matcher titleMatcher = titlePattern.matcher(siteText);
        if (titleMatcher.find()) {
            // Collapse line breaks and runs of whitespace so a multi-line title becomes a
            // single display line.
            String title = titleMatcher.group(1).replaceAll("\\s+", " ").trim();
            htmlBuilder.withTitle(title);
        }
    }

    /**
     * Turns a link into an absolute one: absolute http(s) links are returned unchanged,
     * protocol-relative links get the scheme of {@code rootHost}, everything else is appended
     * to {@code rootHost} with exactly one slash in between.
     */
    private String normalize(String link, String rootHost) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        if (link.startsWith("//")) {
            String protocol = rootHost.startsWith("https") ? "https:" : "http:";
            return protocol + link;
        }
        // Avoid "host//path" when the link is root-relative or the root ends with a slash.
        String base = rootHost.endsWith("/")
            ? rootHost.substring(0, rootHost.length() - 1)
            : rootHost;
        String path = link.startsWith("/") ? link.substring(1) : link;
        return base + "/" + path;
    }
}
59 changes: 59 additions & 0 deletions src/webcrawler/source/PageSourceReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package webcrawler.source;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;

/**
 * Downloads page sources and inspects content types using {@link URLConnection}.
 *
 * <p>Failures are reported on standard output and turned into "not HTML" / empty text; no
 * exception reaches the caller for I/O problems.
 */
public class PageSourceReader {

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");
    private static final String TEXT_HTML = "text/html";
    public static final String USER_AGENT_MOZILLA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0";

    /**
     * Checks whether the resource behind the URL reports an HTML content type.
     *
     * @param url address to check
     * @return {@code true} if a content type is reported and it contains "text/html";
     *     {@code false} otherwise, including when the URL is malformed or cannot be opened
     */
    public boolean isHtml(String url) {
        URLConnection urlConnection = null;
        try {
            urlConnection = new URL(url).openConnection();
            urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA);
            String contentType = urlConnection.getContentType();
            return contentType != null && contentType.contains(TEXT_HTML);
        } catch (IOException e) {
            System.out.println("Error " + e);
            return false;
        } finally {
            // Only the headers were needed; release the HTTP connection instead of leaving
            // the unread response open.
            if (urlConnection instanceof HttpURLConnection) {
                ((HttpURLConnection) urlConnection).disconnect();
            }
        }
    }

    /**
     * Reads the whole resource behind the URL as text.
     *
     * @param url address to read
     * @return the content with every line terminated by the platform line separator, or an
     *     empty string if the URL is malformed or the content cannot be read
     */
    public String readPageSource(String url) {
        try {
            URLConnection urlConnection = new URL(url).openConnection();
            urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA);
            return readFromConnection(urlConnection);
        } catch (IOException e) {
            System.out.println("Error on parse page " + url);
            return "";
        }
    }

    /**
     * Reads all lines from the connection, decoding them as UTF-8.
     *
     * @throws IOException if the stream cannot be opened or read; the caller logs it instead
     *     of the failure being silently dropped here
     */
    private String readFromConnection(URLConnection urlConnection) throws IOException {
        try (
            InputStream inputStream = urlConnection.getInputStream();
            // Explicit charset: without it the platform default is used, which garbles
            // non-ASCII pages on systems whose default is not UTF-8.
            BufferedReader reader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8))
        ) {
            final StringBuilder stringBuilder = new StringBuilder();
            String nextLine;
            while ((nextLine = reader.readLine()) != null) {
                stringBuilder.append(nextLine);
                stringBuilder.append(LINE_SEPARATOR);
            }
            return stringBuilder.toString();
        }
    }
}
32 changes: 32 additions & 0 deletions src/webcrawler/workers/ExportTableWorker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package webcrawler.workers;

import java.io.BufferedWriter;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.swing.SwingWorker;
import javax.swing.table.DefaultTableModel;

/**
 * Background task that writes the first two columns of a table model to a text file, one
 * value per line (column 0, then column 1, for each row).
 */
public class ExportTableWorker extends SwingWorker<Void, Void> {

    private final Path exportPath;
    private final DefaultTableModel tableModel;

    /**
     * @param exportPath file to write; missing parent directories are created
     * @param tableModel model whose rows are exported
     */
    public ExportTableWorker(Path exportPath, DefaultTableModel tableModel) {
        this.exportPath = exportPath;
        this.tableModel = tableModel;
    }

    /**
     * Creates the parent directories (if the path has a parent) and writes every row.
     *
     * @return always {@code null}
     * @throws Exception if the directories or the file cannot be created or written
     */
    @Override
    protected Void doInBackground() throws Exception {
        // A bare file name such as "links.txt" has no parent; nothing needs to be created then.
        Path parent = exportPath.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
        // Written through the BufferedWriter directly: PrintWriter swallows IOExceptions, so a
        // failed export would otherwise look like a successful one.
        // NOTE(review): the model is read from the worker thread while the UI may still change
        // it — confirm whether a snapshot taken on the event dispatch thread is needed.
        try (BufferedWriter writer = Files.newBufferedWriter(exportPath)) {
            int rowCount = tableModel.getRowCount();
            for (int row = 0; row < rowCount; row++) {
                writer.write(String.valueOf(tableModel.getValueAt(row, 0)));
                writer.write('\n');
                writer.write(String.valueOf(tableModel.getValueAt(row, 1)));
                writer.write('\n');
            }
        }
        return null;
    }
}
Loading