diff --git a/src/webcrawler/ApplicationRunner.java b/src/webcrawler/ApplicationRunner.java index e2a4260..0be7286 100644 --- a/src/webcrawler/ApplicationRunner.java +++ b/src/webcrawler/ApplicationRunner.java @@ -1,7 +1,13 @@ package webcrawler; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; + public class ApplicationRunner { - public static void main(String[] args) { - new WebCrawler(); - } + + public static void main(String[] args) { + HtmlPageParser pageParser = new HtmlPageParser(); + PageSourceReader pageSourceReader = new PageSourceReader(); + new WebCrawlerWindow(pageParser, pageSourceReader); + } } diff --git a/src/webcrawler/WebCrawler.java b/src/webcrawler/WebCrawler.java deleted file mode 100644 index 737f85a..0000000 --- a/src/webcrawler/WebCrawler.java +++ /dev/null @@ -1,12 +0,0 @@ -package webcrawler; - -import javax.swing.*; - -public class WebCrawler extends JFrame { - public WebCrawler() { - setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - setSize(300, 300); - setVisible(true); - setLayout(null); - } -} \ No newline at end of file diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java new file mode 100644 index 0000000..aa9c7f6 --- /dev/null +++ b/src/webcrawler/WebCrawlerWindow.java @@ -0,0 +1,121 @@ +package webcrawler; + +import java.awt.ComponentOrientation; +import java.awt.Dimension; +import java.awt.FlowLayout; +import java.nio.file.Paths; +import javax.swing.BorderFactory; +import javax.swing.BoxLayout; +import javax.swing.JButton; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JTable; +import javax.swing.JTextField; +import javax.swing.SwingUtilities; +import javax.swing.table.DefaultTableModel; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; +import webcrawler.workers.ExportTableWorker; +import webcrawler.workers.PageLoadWorker; + +public class WebCrawlerWindow extends JFrame { + + private final JTable table; + private final JTextField location; + private final JButton goButton; + private final JLabel titleLabelInfo; + private final JLabel titleLabel; + private final JLabel urlLabel; + private final JLabel exportLabel; + private final JTextField exportLocation; + private final JButton exportButton; + + private final DefaultTableModel tableModel; + + private final PageSourceReader pageReader; + private final HtmlPageParser pageParser; + + public WebCrawlerWindow(HtmlPageParser pageParser, PageSourceReader pageReader) { + this.pageParser = pageParser; + this.pageReader = pageReader; + setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + setSize(600, 600); + + this.urlLabel = new JLabel("URL: "); + this.location = new JTextField("https://wikipedia.org"); + this.goButton = new JButton("Parse"); + this.table = new JTable(); + this.tableModel = new DefaultTableModel(new String[]{"URL", "Title"}, 0); + this.titleLabelInfo = new JLabel("Title: "); + this.titleLabel = new JLabel(); + this.exportLabel = new JLabel("Export: "); + this.exportLocation = new JTextField("export/links.txt"); + this.exportButton = new JButton("Save"); + + initLayout(); + initActions(); + setVisible(true); + } + + private void initActions() { + goButton.addActionListener(e -> + SwingUtilities.invokeLater(() -> { + String url = location.getText(); + PageLoadWorker pageLoadWorker = new PageLoadWorker(url, titleLabel, tableModel, + pageReader, + pageParser); + pageLoadWorker.execute(); + })); + + exportButton.addActionListener(e -> + SwingUtilities.invokeLater(() -> { + String file = exportLocation.getText(); + ExportTableWorker pageLoadWorker = new ExportTableWorker(Paths.get(file), tableModel); + pageLoadWorker.execute(); + })); + } + + private void initLayout() { + var rootPanel = getContentPane(); + + var locationPanel = new JPanel(); + locationPanel.setLayout(new BoxLayout(locationPanel, BoxLayout.LINE_AXIS)); + locationPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); + locationPanel.add(urlLabel); + locationPanel.add(location); + locationPanel.add(goButton); + + table.setModel(tableModel); + var titlePanel = new JPanel(); + titlePanel.setLayout(new FlowLayout(FlowLayout.LEFT)); + titlePanel.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT); + titlePanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); + titlePanel.add(titleLabelInfo); + titlePanel.add(titleLabel); + + var scrollPane = new JScrollPane(table); + scrollPane.setPreferredSize(new Dimension(400, 400)); + + var areaPanel = new JPanel(); + areaPanel.setLayout(new BoxLayout(areaPanel, BoxLayout.PAGE_AXIS)); + areaPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + areaPanel.add(scrollPane); + + var exportPanel = new JPanel(); + exportPanel.setLayout(new BoxLayout(exportPanel, BoxLayout.LINE_AXIS)); + exportPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); + exportPanel.add(exportLabel); + exportPanel.add(exportLocation); + exportPanel.add(exportButton); + + rootPanel.setLayout(new BoxLayout(rootPanel, BoxLayout.PAGE_AXIS)); + rootPanel.add(locationPanel); + rootPanel.add(titlePanel); + rootPanel.add(areaPanel); + rootPanel.add(exportPanel); + + pack(); + } +} \ No newline at end of file diff --git a/src/webcrawler/crawl/Html.java b/src/webcrawler/crawl/Html.java new file mode 100644 index 0000000..3b5c79d --- /dev/null +++ b/src/webcrawler/crawl/Html.java @@ -0,0 +1,22 @@ +package webcrawler.crawl; + +import java.util.Set; + +public class Html { + + private final String title; + private final Set links; + + Html(String title, Set links) { + this.title = title; + this.links = links; + } + + public String getTitle() { + return title; + } + + public Set getLinks() { + return links; + } +} diff --git a/src/webcrawler/crawl/HtmlBuilder.java b/src/webcrawler/crawl/HtmlBuilder.java new file mode 100644 index 0000000..cdca2b0 --- /dev/null +++ b/src/webcrawler/crawl/HtmlBuilder.java @@ -0,0 +1,28 @@ +package webcrawler.crawl; + +import java.util.Set; + +public class HtmlBuilder { + + private String title; + private Set links; + + HtmlBuilder() { + title = "no title"; + links = Set.of(); + } + + HtmlBuilder withTitle(String title) { + this.title = title; + return this; + } + + HtmlBuilder withLinks(Set links) { + this.links = links; + return this; + } + + Html build() { + return new Html(title, links); + } +} diff --git a/src/webcrawler/crawl/HtmlPageParser.java b/src/webcrawler/crawl/HtmlPageParser.java new file mode 100644 index 0000000..1a17b38 --- /dev/null +++ b/src/webcrawler/crawl/HtmlPageParser.java @@ -0,0 +1,57 @@ +package webcrawler.crawl; + +import java.util.HashSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class HtmlPageParser { + + private Pattern titlePattern; + private Pattern linkPattern; + + public HtmlPageParser() { + this.titlePattern = Pattern.compile("(.*)"); + this.linkPattern = Pattern.compile("]*?\\s+)?href=([\"'])(.*?)\\1"); + } + + public Html parse(String siteText, String rootHost) { + HtmlBuilder htmlBuilder = new HtmlBuilder(); + + buildTitle(htmlBuilder, siteText); + buildLinks(htmlBuilder, siteText, rootHost); + + return htmlBuilder.build(); + } + + private void buildLinks(HtmlBuilder htmlBuilder, String siteText, String rootHost) { + Matcher linksMatcher = linkPattern.matcher(siteText); + boolean found = linksMatcher.find(); + HashSet links = new HashSet<>(); + while (found) { + String group = linksMatcher.group(2); + String link = normalize(group, rootHost); + links.add(link); + found = linksMatcher.find(); + } + htmlBuilder.withLinks(links); + } + + private void buildTitle(HtmlBuilder htmlBuilder, String siteText) { + Matcher titleMatcher = titlePattern.matcher(siteText); + boolean found = titleMatcher.find(); + if (found) { + htmlBuilder.withTitle(titleMatcher.group(1)); + } + } + + private String normalize(String link, String rootHost) { + if (link.startsWith("http://") || link.startsWith("https://")) { + return link; + } else if (link.startsWith("//")) { + String protocol = rootHost.startsWith("https") ? "https:" : "http:"; + return protocol + link; + } else { + return rootHost + "/" + link; + } + } +} diff --git a/src/webcrawler/source/PageSourceReader.java b/src/webcrawler/source/PageSourceReader.java new file mode 100644 index 0000000..3e5d3dc --- /dev/null +++ b/src/webcrawler/source/PageSourceReader.java @@ -0,0 +1,59 @@ +package webcrawler.source; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; + +public class PageSourceReader { + + private static final String LINE_SEPARATOR = System.getProperty("line.separator"); + private static final String TEXT_HTML = "text/html"; + public static final String USER_AGENT_MOZILLA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"; + + public boolean isHtml(String url) { + try { + URLConnection urlConnection = new URL(url).openConnection(); + urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA); + if (urlConnection.getContentType() != null && urlConnection.getContentType() + .contains(TEXT_HTML)) { + return true; + } + } catch (IOException e) { + System.out.println("Error " + e); + return false; + } + return false; + } + + public String readPageSource(String url) { + final URLConnection urlConnection; + try { + urlConnection = new URL(url).openConnection(); + urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA); + return readFromConnection(urlConnection); + } catch (IOException e) { + System.out.println("Error on parse page " + url); + return ""; + } + } + + private String readFromConnection(URLConnection urlConnection) { + try ( + InputStream inputStream = urlConnection.getInputStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) + ) { + final StringBuilder stringBuilder = new StringBuilder(); + String nextLine; + while ((nextLine = reader.readLine()) != null) { + stringBuilder.append(nextLine); + stringBuilder.append(LINE_SEPARATOR); + } + return stringBuilder.toString(); + } catch (IOException e) { + return ""; + } + } +} diff --git a/src/webcrawler/workers/ExportTableWorker.java b/src/webcrawler/workers/ExportTableWorker.java new file mode 100644 index 0000000..7a6924e --- /dev/null +++ b/src/webcrawler/workers/ExportTableWorker.java @@ -0,0 +1,32 @@ +package webcrawler.workers; + +import java.io.BufferedWriter; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; + +public class ExportTableWorker extends SwingWorker { + + private Path exportPath; + private DefaultTableModel tableModel; + + public ExportTableWorker(Path exportPath, DefaultTableModel tableModel) { + this.exportPath = exportPath; + this.tableModel = tableModel; + } + + @Override + protected Void doInBackground() throws Exception { + Files.createDirectories(exportPath.getParent()); + try (BufferedWriter bufferedWriter = Files.newBufferedWriter(exportPath); + PrintWriter printWriter = new PrintWriter(bufferedWriter)) { + int rowCount = tableModel.getRowCount(); + for (int i = 0; i < rowCount; i++) { + printWriter.printf("%s\n%s\n", tableModel.getValueAt(i, 0), tableModel.getValueAt(i, 1)); + } + } + return null; + } +} diff --git a/src/webcrawler/workers/PageLoadWorker.java b/src/webcrawler/workers/PageLoadWorker.java new file mode 100644 index 0000000..6cbf654 --- /dev/null +++ b/src/webcrawler/workers/PageLoadWorker.java @@ -0,0 +1,47 @@ +package webcrawler.workers; + +import java.util.concurrent.ExecutionException; +import javax.swing.JLabel; +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; +import webcrawler.crawl.Html; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; + +public class PageLoadWorker extends SwingWorker { + + private final String url; + private final JLabel titleLabel; + + private DefaultTableModel linksTable; + private final PageSourceReader pageReader; + private final HtmlPageParser pageParser; + + public PageLoadWorker(String location, JLabel titleLabel, DefaultTableModel linksTable, + PageSourceReader pageReader, HtmlPageParser pageParser) { + this.url = location; + this.titleLabel = titleLabel; + this.linksTable = linksTable; + this.pageReader = pageReader; + this.pageParser = pageParser; + } + + @Override + protected Html doInBackground() throws Exception { + final String siteText = pageReader.readPageSource(url); + return pageParser.parse(siteText, url); + } + + @Override + protected void done() { + try { + Html html = get(); + titleLabel.setText(html.getTitle()); + ParseLinksWorker parseLinksWorker = new ParseLinksWorker(html, linksTable, pageReader, + pageParser); + parseLinksWorker.execute(); + } catch (InterruptedException | ExecutionException e) { + Thread.currentThread().interrupt(); + } + } +} diff --git a/src/webcrawler/workers/ParseLinksWorker.java b/src/webcrawler/workers/ParseLinksWorker.java new file mode 100644 index 0000000..ce090fb --- /dev/null +++ b/src/webcrawler/workers/ParseLinksWorker.java @@ -0,0 +1,54 @@ +package webcrawler.workers; + +import java.util.List; +import java.util.Vector; +import java.util.concurrent.ExecutionException; +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; +import webcrawler.crawl.Html; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; + +public class ParseLinksWorker extends SwingWorker>, Void> { + + private final Html rootPage; + private final DefaultTableModel linksTable; + + private final PageSourceReader pageReader; + private final HtmlPageParser pageParser; + + public ParseLinksWorker(Html rootPage, DefaultTableModel linksTable, + PageSourceReader pageReader, HtmlPageParser pageParser) { + this.rootPage = rootPage; + this.linksTable = linksTable; + this.pageReader = pageReader; + this.pageParser = pageParser; + } + + + @Override + protected Vector> doInBackground() throws Exception { + Vector> allData = new Vector<>(); + for (String link : rootPage.getLinks()) { + if (pageReader.isHtml(link)) { + Vector res = new Vector<>(); + final String linkText = pageReader.readPageSource(link); + Html htmlLink = pageParser.parse(linkText, link); + res.add(link); + res.add(htmlLink.getTitle()); + allData.add(res); + } + } + return allData; + } + + @Override + protected void done() { + try { + Vector> allData = get(); + linksTable.setDataVector(allData, new Vector<>(List.of("URL", "Title"))); + } catch (InterruptedException | ExecutionException e) { + Thread.currentThread().interrupt(); + } + } +}