From 240c7e2304cc333ade314f1afdcb1d4557d72ac2 Mon Sep 17 00:00:00 2001 From: oleksii Date: Mon, 24 Dec 2018 00:30:03 +0200 Subject: [PATCH 1/8] first phase frame implement --- src/webcrawler/ApplicationRunner.java | 7 +++-- src/webcrawler/WebCrawler.java | 42 ++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/webcrawler/ApplicationRunner.java b/src/webcrawler/ApplicationRunner.java index e2a4260..c69dd22 100644 --- a/src/webcrawler/ApplicationRunner.java +++ b/src/webcrawler/ApplicationRunner.java @@ -1,7 +1,8 @@ package webcrawler; public class ApplicationRunner { - public static void main(String[] args) { - new WebCrawler(); - } + + public static void main(String[] args) { + new WebCrawler(); + } } diff --git a/src/webcrawler/WebCrawler.java b/src/webcrawler/WebCrawler.java index 737f85a..47b9a55 100644 --- a/src/webcrawler/WebCrawler.java +++ b/src/webcrawler/WebCrawler.java @@ -1,12 +1,40 @@ package webcrawler; -import javax.swing.*; +import java.awt.BorderLayout; +import java.awt.Dimension; +import javax.swing.BorderFactory; +import javax.swing.BoxLayout; +import javax.swing.JFrame; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JTextArea; public class WebCrawler extends JFrame { - public WebCrawler() { - setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - setSize(300, 300); - setVisible(true); - setLayout(null); - } + + private final JTextArea textArea; + + public WebCrawler() { + setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + setSize(300, 300); + setVisible(true); + + this.textArea = new JTextArea(); + initLayout(); + } + + private void initLayout() { + var rootPanel = getContentPane(); + + textArea.setText("HTML code?"); + var scrollPane = new JScrollPane(textArea); + scrollPane.setPreferredSize(new Dimension(200, 200)); + + var areaPanel = new JPanel(); + areaPanel.setLayout(new BoxLayout(areaPanel, BoxLayout.PAGE_AXIS)); + areaPanel.setBorder(BorderFactory.createEmptyBorder(40, 40, 40, 40)); + areaPanel.add(scrollPane); + + rootPanel.add(areaPanel, BorderLayout.CENTER); + pack(); + } } \ No newline at end of file From fcc75f5c955d22ae144288141f1a4fda9f2a338f Mon Sep 17 00:00:00 2001 From: oleksii Date: Mon, 24 Dec 2018 01:11:44 +0200 Subject: [PATCH 2/8] second phase frame implement --- src/webcrawler/ApplicationRunner.java | 2 +- src/webcrawler/WebCrawler.java | 40 ------------- src/webcrawler/WebCrawlerWindow.java | 81 +++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 41 deletions(-) delete mode 100644 src/webcrawler/WebCrawler.java create mode 100644 src/webcrawler/WebCrawlerWindow.java diff --git a/src/webcrawler/ApplicationRunner.java b/src/webcrawler/ApplicationRunner.java index c69dd22..a37a6d9 100644 --- a/src/webcrawler/ApplicationRunner.java +++ b/src/webcrawler/ApplicationRunner.java @@ -3,6 +3,6 @@ public class ApplicationRunner { public static void main(String[] args) { - new WebCrawler(); + new WebCrawlerWindow(); } } diff --git a/src/webcrawler/WebCrawler.java b/src/webcrawler/WebCrawler.java deleted file mode 100644 index 47b9a55..0000000 --- a/src/webcrawler/WebCrawler.java +++ /dev/null @@ -1,40 +0,0 @@ -package webcrawler; - -import java.awt.BorderLayout; -import java.awt.Dimension; -import javax.swing.BorderFactory; -import javax.swing.BoxLayout; -import javax.swing.JFrame; -import javax.swing.JPanel; -import javax.swing.JScrollPane; -import javax.swing.JTextArea; - -public class WebCrawler extends JFrame { - - private final JTextArea textArea; - - public WebCrawler() { - setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - setSize(300, 300); - setVisible(true); - - this.textArea = new JTextArea(); - initLayout(); - } - - private void initLayout() { - var rootPanel = getContentPane(); - - textArea.setText("HTML code?"); - var scrollPane = new JScrollPane(textArea); - scrollPane.setPreferredSize(new Dimension(200, 200)); - - var areaPanel = new JPanel(); - areaPanel.setLayout(new BoxLayout(areaPanel, BoxLayout.PAGE_AXIS)); - areaPanel.setBorder(BorderFactory.createEmptyBorder(40, 40, 40, 40)); - areaPanel.add(scrollPane); - - rootPanel.add(areaPanel, BorderLayout.CENTER); - pack(); - } -} \ No newline at end of file diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java new file mode 100644 index 0000000..471bf50 --- /dev/null +++ b/src/webcrawler/WebCrawlerWindow.java @@ -0,0 +1,81 @@ +package webcrawler; + +import java.awt.BorderLayout; +import java.awt.Dimension; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import javax.swing.BorderFactory; +import javax.swing.BoxLayout; +import javax.swing.JButton; +import javax.swing.JFrame; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JTextArea; +import javax.swing.JTextField; + +public class WebCrawlerWindow extends JFrame { + + public static final String LINE_SEPARATOR = System.getProperty("line.separator"); + private final JTextArea textArea; + private final JTextField location; + private final JButton goButton; + + public WebCrawlerWindow() { + setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + setSize(600, 600); + + this.location = new JTextField("http://example.com"); + this.goButton = new JButton("Get text!"); + this.textArea = new JTextArea(); + initLayout(); + initActions(); + setVisible(true); + } + + private void initActions() { + goButton.addActionListener(e -> { + String url = location.getText(); + try ( + InputStream inputStream = new URL(url).openStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) + ) { + final StringBuilder stringBuilder = new StringBuilder(); + String nextLine; + while ((nextLine = reader.readLine()) != null) { + stringBuilder.append(nextLine); + stringBuilder.append(LINE_SEPARATOR); + } + final String siteText = stringBuilder.toString(); + textArea.setText(siteText); + } catch (IOException e1) { + e1.printStackTrace(); + } + }); + } + + private void initLayout() { + var rootPanel = getContentPane(); + + var locationPanel = new JPanel(); + locationPanel.setLayout(new BoxLayout(locationPanel, BoxLayout.LINE_AXIS)); + locationPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + locationPanel.add(location); + locationPanel.add(goButton); + + var scrollPane = new JScrollPane(textArea); + scrollPane.setPreferredSize(new Dimension(400, 400)); + + var areaPanel = new JPanel(); + areaPanel.setLayout(new BoxLayout(areaPanel, BoxLayout.PAGE_AXIS)); + areaPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + areaPanel.add(scrollPane); + + rootPanel.add(locationPanel, BorderLayout.PAGE_START); + rootPanel.add(areaPanel, BorderLayout.CENTER); + + + } +} \ No newline at end of file From 4072d7d2cced9650feaecfcff5bfd631ec0fe01d Mon Sep 17 00:00:00 2001 From: oleksii Date: Mon, 24 Dec 2018 15:22:18 +0200 Subject: [PATCH 3/8] third phase frame implementation --- src/webcrawler/ApplicationRunner.java | 5 +- src/webcrawler/WebCrawlerWindow.java | 67 ++++++++++++++++-------- src/webcrawler/crawl/Html.java | 14 +++++ src/webcrawler/crawl/HtmlPageParser.java | 19 +++++++ 4 files changed, 81 insertions(+), 24 deletions(-) create mode 100644 src/webcrawler/crawl/Html.java create mode 100644 src/webcrawler/crawl/HtmlPageParser.java diff --git a/src/webcrawler/ApplicationRunner.java b/src/webcrawler/ApplicationRunner.java index a37a6d9..7a51cac 100644 --- a/src/webcrawler/ApplicationRunner.java +++ b/src/webcrawler/ApplicationRunner.java @@ -1,8 +1,11 @@ package webcrawler; +import webcrawler.crawl.HtmlPageParser; + public class ApplicationRunner { public static void main(String[] args) { - new WebCrawlerWindow(); + HtmlPageParser pageParser = new HtmlPageParser(); + new WebCrawlerWindow(pageParser); } } diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index 471bf50..b758cc2 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -1,6 +1,5 @@ package webcrawler; -import java.awt.BorderLayout; import java.awt.Dimension; import java.io.BufferedReader; import java.io.IOException; @@ -11,10 +10,14 @@ import javax.swing.BoxLayout; import javax.swing.JButton; import javax.swing.JFrame; +import javax.swing.JLabel; import javax.swing.JPanel; import javax.swing.JScrollPane; import javax.swing.JTextArea; import javax.swing.JTextField; +import javax.swing.SwingUtilities; +import webcrawler.crawl.Html; +import webcrawler.crawl.HtmlPageParser; public class WebCrawlerWindow extends JFrame { @@ -22,38 +25,48 @@ public class WebCrawlerWindow extends JFrame { private final JTextArea textArea; private final JTextField location; private final JButton goButton; + private final JLabel titleLabelInfo; + private final JLabel titleLabel; - public WebCrawlerWindow() { + private final HtmlPageParser pageParser; + + public WebCrawlerWindow(HtmlPageParser pageParser) { + this.pageParser = pageParser; setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); setSize(600, 600); this.location = new JTextField("http://example.com"); this.goButton = new JButton("Get text!"); this.textArea = new JTextArea(); + this.titleLabelInfo = new JLabel("Title: "); + this.titleLabel = new JLabel(); initLayout(); initActions(); setVisible(true); } private void initActions() { - goButton.addActionListener(e -> { - String url = location.getText(); - try ( - InputStream inputStream = new URL(url).openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) - ) { - final StringBuilder stringBuilder = new StringBuilder(); - String nextLine; - while ((nextLine = reader.readLine()) != null) { - stringBuilder.append(nextLine); - stringBuilder.append(LINE_SEPARATOR); - } - final String siteText = stringBuilder.toString(); - textArea.setText(siteText); - } catch (IOException e1) { - e1.printStackTrace(); - } - }); + goButton.addActionListener(e -> + SwingUtilities.invokeLater(() -> { + String url = location.getText(); + try ( + InputStream inputStream = new URL(url).openStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) + ) { + final StringBuilder stringBuilder = new StringBuilder(); + String nextLine; + while ((nextLine = reader.readLine()) != null) { + stringBuilder.append(nextLine); + stringBuilder.append(LINE_SEPARATOR); + } + final String siteText = stringBuilder.toString(); + Html html = pageParser.parse(siteText); + textArea.setText(siteText); + titleLabel.setText(html.getTitle()); + } catch (IOException e1) { + textArea.setText("No content"); + } + })); } private void initLayout() { @@ -65,6 +78,12 @@ private void initLayout() { locationPanel.add(location); locationPanel.add(goButton); + var titlePanel = new JPanel(); + titlePanel.setLayout(new BoxLayout(titlePanel, BoxLayout.LINE_AXIS)); + titlePanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + titlePanel.add(titleLabelInfo); + titlePanel.add(titleLabel); + var scrollPane = new JScrollPane(textArea); scrollPane.setPreferredSize(new Dimension(400, 400)); @@ -73,9 +92,11 @@ private void initLayout() { areaPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); areaPanel.add(scrollPane); - rootPanel.add(locationPanel, BorderLayout.PAGE_START); - rootPanel.add(areaPanel, BorderLayout.CENTER); - + rootPanel.setLayout(new BoxLayout(rootPanel, BoxLayout.PAGE_AXIS)); + rootPanel.add(locationPanel); + rootPanel.add(titlePanel); + rootPanel.add(areaPanel); + pack(); } } \ No newline at end of file diff --git a/src/webcrawler/crawl/Html.java b/src/webcrawler/crawl/Html.java new file mode 100644 index 0000000..341bba0 --- /dev/null +++ b/src/webcrawler/crawl/Html.java @@ -0,0 +1,14 @@ +package webcrawler.crawl; + +public class Html { + + private String title; + + public Html(String title) { + this.title = title; + } + + public String getTitle() { + return title; + } +} diff --git a/src/webcrawler/crawl/HtmlPageParser.java b/src/webcrawler/crawl/HtmlPageParser.java new file mode 100644 index 0000000..bacc88e --- /dev/null +++ b/src/webcrawler/crawl/HtmlPageParser.java @@ -0,0 +1,19 @@ +package webcrawler.crawl; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class HtmlPageParser { + + private Pattern titlePattern; + + public HtmlPageParser() { + this.titlePattern = Pattern.compile("(.*)"); + } + + public Html parse(String siteText) { + Matcher matcher = titlePattern.matcher(siteText); + boolean found = matcher.find(); + return found ? new Html(matcher.group(1)) : new Html("No title"); + } +} From 56308c555f6e1e556e65c08be7b59072c4f6beda Mon Sep 17 00:00:00 2001 From: oleksii Date: Tue, 25 Dec 2018 00:12:55 +0200 Subject: [PATCH 4/8] third phase frame implementation --- src/webcrawler/WebCrawlerWindow.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index b758cc2..bee359e 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -1,6 +1,8 @@ package webcrawler; +import java.awt.ComponentOrientation; import java.awt.Dimension; +import java.awt.FlowLayout; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -74,13 +76,14 @@ private void initLayout() { var locationPanel = new JPanel(); locationPanel.setLayout(new BoxLayout(locationPanel, BoxLayout.LINE_AXIS)); - locationPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + locationPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); locationPanel.add(location); locationPanel.add(goButton); var titlePanel = new JPanel(); - titlePanel.setLayout(new BoxLayout(titlePanel, BoxLayout.LINE_AXIS)); - titlePanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); + titlePanel.setLayout(new FlowLayout(FlowLayout.LEFT)); + titlePanel.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT); + titlePanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); titlePanel.add(titleLabelInfo); titlePanel.add(titleLabel); From 077f6d67d43b62f1836d9cdb88b70f99a73bdbff Mon Sep 17 00:00:00 2001 From: oleksii Date: Tue, 25 Dec 2018 01:45:01 +0200 Subject: [PATCH 5/8] fourth phase frame implementation --- src/webcrawler/ApplicationRunner.java | 4 +- src/webcrawler/WebCrawlerWindow.java | 57 +++++++++++---------- src/webcrawler/crawl/Html.java | 12 ++++- src/webcrawler/crawl/HtmlBuilder.java | 29 +++++++++++ src/webcrawler/crawl/HtmlPageParser.java | 46 +++++++++++++++-- src/webcrawler/source/PageSourceReader.java | 45 ++++++++++++++++ 6 files changed, 158 insertions(+), 35 deletions(-) create mode 100644 src/webcrawler/crawl/HtmlBuilder.java create mode 100644 src/webcrawler/source/PageSourceReader.java diff --git a/src/webcrawler/ApplicationRunner.java b/src/webcrawler/ApplicationRunner.java index 7a51cac..0be7286 100644 --- a/src/webcrawler/ApplicationRunner.java +++ b/src/webcrawler/ApplicationRunner.java @@ -1,11 +1,13 @@ package webcrawler; import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; public class ApplicationRunner { public static void main(String[] args) { HtmlPageParser pageParser = new HtmlPageParser(); - new WebCrawlerWindow(pageParser); + PageSourceReader pageSourceReader = new PageSourceReader(); + new WebCrawlerWindow(pageParser, pageSourceReader); } } diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index bee359e..d983eae 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -3,11 +3,6 @@ import java.awt.ComponentOrientation; import java.awt.Dimension; import java.awt.FlowLayout; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.net.URL; import javax.swing.BorderFactory; import javax.swing.BoxLayout; import javax.swing.JButton; @@ -15,33 +10,43 @@ import javax.swing.JLabel; import javax.swing.JPanel; import javax.swing.JScrollPane; -import javax.swing.JTextArea; +import javax.swing.JTable; import javax.swing.JTextField; import javax.swing.SwingUtilities; +import javax.swing.table.DefaultTableModel; import webcrawler.crawl.Html; import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; public class WebCrawlerWindow extends JFrame { - public static final String LINE_SEPARATOR = System.getProperty("line.separator"); - private final JTextArea textArea; + + private final JTable table; private final JTextField location; private final JButton goButton; private final JLabel titleLabelInfo; private final JLabel titleLabel; + private final JLabel urlLabel; + + private final DefaultTableModel model; + private final PageSourceReader pageReader; private final HtmlPageParser pageParser; - public WebCrawlerWindow(HtmlPageParser pageParser) { + public WebCrawlerWindow(HtmlPageParser pageParser, PageSourceReader pageReader) { this.pageParser = pageParser; + this.pageReader = pageReader; setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); setSize(600, 600); - this.location = new JTextField("http://example.com"); - this.goButton = new JButton("Get text!"); - this.textArea = new JTextArea(); + this.urlLabel = new JLabel("URL: "); + this.location = new JTextField("https://wikipedia.org"); + this.goButton = new JButton("Parse"); + this.table = new JTable(); + this.model = new DefaultTableModel(new String[]{"URL", "Title"}, 0); this.titleLabelInfo = new JLabel("Title: "); this.titleLabel = new JLabel(); + initLayout(); initActions(); setVisible(true); @@ -51,22 +56,16 @@ private void initActions() { goButton.addActionListener(e -> SwingUtilities.invokeLater(() -> { String url = location.getText(); - try ( - InputStream inputStream = new URL(url).openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) - ) { - final StringBuilder stringBuilder = new StringBuilder(); - String nextLine; - while ((nextLine = reader.readLine()) != null) { - stringBuilder.append(nextLine); - stringBuilder.append(LINE_SEPARATOR); + final String siteText = pageReader.readPageSource(url); + Html html = pageParser.parse(siteText, url); + titleLabel.setText(html.getTitle()); + for (String link : html.getLinks()) { + if (pageReader.isHtml(link)) { + final String linkText = pageReader.readPageSource(link); + Html htmlLink = pageParser.parse(linkText, link); + System.out.println(link + " : " + html.getTitle()); + model.addRow(new Object[]{link, htmlLink.getTitle()}); } - final String siteText = stringBuilder.toString(); - Html html = pageParser.parse(siteText); - textArea.setText(siteText); - titleLabel.setText(html.getTitle()); - } catch (IOException e1) { - textArea.setText("No content"); } })); } @@ -77,9 +76,11 @@ private void initLayout() { var locationPanel = new JPanel(); locationPanel.setLayout(new BoxLayout(locationPanel, BoxLayout.LINE_AXIS)); locationPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); + locationPanel.add(urlLabel); locationPanel.add(location); locationPanel.add(goButton); + table.setModel(model); var titlePanel = new JPanel(); titlePanel.setLayout(new FlowLayout(FlowLayout.LEFT)); titlePanel.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT); @@ -87,7 +88,7 @@ private void initLayout() { titlePanel.add(titleLabelInfo); titlePanel.add(titleLabel); - var scrollPane = new JScrollPane(textArea); + var scrollPane = new JScrollPane(table); scrollPane.setPreferredSize(new Dimension(400, 400)); var areaPanel = new JPanel(); diff --git a/src/webcrawler/crawl/Html.java b/src/webcrawler/crawl/Html.java index 341bba0..e2603d2 100644 --- a/src/webcrawler/crawl/Html.java +++ b/src/webcrawler/crawl/Html.java @@ -1,14 +1,22 @@ package webcrawler.crawl; +import java.util.Set; + public class Html { - private String title; + private final String title; + private final Set links; - public Html(String title) { + public Html(String title, Set links) { this.title = title; + this.links = links; } public String getTitle() { return title; } + + public Set getLinks() { + return links; + } } diff --git a/src/webcrawler/crawl/HtmlBuilder.java b/src/webcrawler/crawl/HtmlBuilder.java new file mode 100644 index 0000000..95c8781 --- /dev/null +++ b/src/webcrawler/crawl/HtmlBuilder.java @@ -0,0 +1,29 @@ +package webcrawler.crawl; + +import java.util.Set; + +public class HtmlBuilder { + + + private String title; + private Set links; + + public HtmlBuilder() { + title = "no title"; + links = Set.of(); + } + + HtmlBuilder withTitle(String title) { + this.title = title; + return this; + } + + HtmlBuilder withLinks(Set links) { + this.links = links; + return this; + } + + Html build() { + return new Html(title, links); + } +} diff --git a/src/webcrawler/crawl/HtmlPageParser.java b/src/webcrawler/crawl/HtmlPageParser.java index bacc88e..cd93962 100644 --- a/src/webcrawler/crawl/HtmlPageParser.java +++ b/src/webcrawler/crawl/HtmlPageParser.java @@ -1,19 +1,57 @@ package webcrawler.crawl; +import java.util.ArrayList; +import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; public class HtmlPageParser { private Pattern titlePattern; + private Pattern linkPattern; public HtmlPageParser() { this.titlePattern = Pattern.compile("(.*)"); + this.linkPattern = Pattern.compile("]*?\\s+)?href=([\"'])(.*?)\\1"); } - public Html parse(String siteText) { - Matcher matcher = titlePattern.matcher(siteText); - boolean found = matcher.find(); - return found ? new Html(matcher.group(1)) : new Html("No title"); + public Html parse(String siteText, String rootHost) { + HtmlBuilder htmlBuilder = new HtmlBuilder(); + buildTitle(htmlBuilder, siteText); + buildLinks(htmlBuilder, siteText, rootHost); + + return htmlBuilder.build(); + } + + private void buildLinks(HtmlBuilder htmlBuilder, String siteText, String rootHost) { + Matcher linksMatcher = linkPattern.matcher(siteText); + boolean found = linksMatcher.find(); + HashSet links = new HashSet<>(); + while (found) { + String group = linksMatcher.group(2); + String link = normalize(group, rootHost); + links.add(link); + found = linksMatcher.find(); + } + htmlBuilder.withLinks(links); + } + + private void buildTitle(HtmlBuilder htmlBuilder, String siteText) { + Matcher titleMatcher = titlePattern.matcher(siteText); + boolean found = titleMatcher.find(); + if (found) { + htmlBuilder.withTitle(titleMatcher.group(1)); + } + } + + private String normalize(String link, String rootHost) { + if (link.startsWith("http://") || link.startsWith("https://")) { + return link; + } else if (link.startsWith("//")) { + String protocol = rootHost.startsWith("https") ? "https:" : "http:"; + return protocol + link; + } else { + return rootHost + "/" + link; + } } } diff --git a/src/webcrawler/source/PageSourceReader.java b/src/webcrawler/source/PageSourceReader.java new file mode 100644 index 0000000..cd43b66 --- /dev/null +++ b/src/webcrawler/source/PageSourceReader.java @@ -0,0 +1,45 @@ +package webcrawler.source; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; + +public class PageSourceReader { + + public static final String LINE_SEPARATOR = System.getProperty("line.separator"); + public static final String TEXT_HTML = "text/html"; + + public boolean isHtml(String url) { + try { + URLConnection urlConnection = new URL(url).openConnection(); + if (urlConnection.getContentType().equals(TEXT_HTML)) { + return true; + } + } catch (IOException e) { + return false; + } + return false; + } + + public String readPageSource(String url) { + try ( + InputStream inputStream = new URL(url).openStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) + ) { + final StringBuilder stringBuilder = new StringBuilder(); + String nextLine; + while ((nextLine = reader.readLine()) != null) { + stringBuilder.append(nextLine); + stringBuilder.append(LINE_SEPARATOR); + } + return stringBuilder.toString(); + } catch ( + IOException e1) { + System.err.println("Error on parse page " + url); + return ""; + } + } +} From 71adb15702ce89f9ace83e8975193de0d3fca6de Mon Sep 17 00:00:00 2001 From: oleksii Date: Tue, 25 Dec 2018 16:25:29 +0200 Subject: [PATCH 6/8] fourth phase frame implementation --- src/webcrawler/WebCrawlerWindow.java | 16 ++----- src/webcrawler/source/PageSourceReader.java | 6 ++- src/webcrawler/workers/PageLoadWorker.java | 47 ++++++++++++++++++++ src/webcrawler/workers/ParseLinksWorker.java | 37 +++++++++++++++ 4 files changed, 92 insertions(+), 14 deletions(-) create mode 100644 src/webcrawler/workers/PageLoadWorker.java create mode 100644 src/webcrawler/workers/ParseLinksWorker.java diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index d983eae..5f0d1ff 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -14,9 +14,9 @@ import javax.swing.JTextField; import javax.swing.SwingUtilities; import javax.swing.table.DefaultTableModel; -import webcrawler.crawl.Html; import webcrawler.crawl.HtmlPageParser; import webcrawler.source.PageSourceReader; +import webcrawler.workers.PageLoadWorker; public class WebCrawlerWindow extends JFrame { @@ -56,17 +56,9 @@ private void initActions() { goButton.addActionListener(e -> SwingUtilities.invokeLater(() -> { String url = location.getText(); - final String siteText = pageReader.readPageSource(url); - Html html = pageParser.parse(siteText, url); - titleLabel.setText(html.getTitle()); - for (String link : html.getLinks()) { - if (pageReader.isHtml(link)) { - final String linkText = pageReader.readPageSource(link); - Html htmlLink = pageParser.parse(linkText, link); - System.out.println(link + " : " + html.getTitle()); - model.addRow(new Object[]{link, htmlLink.getTitle()}); - } - } + PageLoadWorker pageLoadWorker = new PageLoadWorker(url, titleLabel, model, pageReader, + pageParser); + pageLoadWorker.execute(); })); } diff --git a/src/webcrawler/source/PageSourceReader.java b/src/webcrawler/source/PageSourceReader.java index cd43b66..d6f66c4 100644 --- a/src/webcrawler/source/PageSourceReader.java +++ b/src/webcrawler/source/PageSourceReader.java @@ -15,10 +15,12 @@ public class PageSourceReader { public boolean isHtml(String url) { try { URLConnection urlConnection = new URL(url).openConnection(); - if (urlConnection.getContentType().equals(TEXT_HTML)) { + if (urlConnection.getContentType() != null && urlConnection.getContentType() + .contains(TEXT_HTML)) { return true; } } catch (IOException e) { + System.out.println("Error " + e); return false; } return false; @@ -38,7 +40,7 @@ public String readPageSource(String url) { return stringBuilder.toString(); } catch ( IOException e1) { - System.err.println("Error on parse page " + url); + System.out.println("Error on parse page " + url); return ""; } } diff --git a/src/webcrawler/workers/PageLoadWorker.java b/src/webcrawler/workers/PageLoadWorker.java new file mode 100644 index 0000000..6cbf654 --- /dev/null +++ b/src/webcrawler/workers/PageLoadWorker.java @@ -0,0 +1,47 @@ +package webcrawler.workers; + +import java.util.concurrent.ExecutionException; +import javax.swing.JLabel; +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; +import webcrawler.crawl.Html; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; + +public class PageLoadWorker extends SwingWorker { + + private final String url; + private final JLabel titleLabel; + + private DefaultTableModel linksTable; + private final PageSourceReader pageReader; + private final HtmlPageParser pageParser; + + public PageLoadWorker(String location, JLabel titleLabel, DefaultTableModel linksTable, + PageSourceReader pageReader, HtmlPageParser pageParser) { + this.url = location; + this.titleLabel = titleLabel; + this.linksTable = linksTable; + this.pageReader = pageReader; + this.pageParser = pageParser; + } + + @Override + protected Html doInBackground() throws Exception { + final String siteText = pageReader.readPageSource(url); + return pageParser.parse(siteText, url); + } + + @Override + protected void done() { + try { + Html html = get(); + titleLabel.setText(html.getTitle()); + ParseLinksWorker parseLinksWorker = new ParseLinksWorker(html, linksTable, pageReader, + pageParser); + parseLinksWorker.execute(); + } catch (InterruptedException | ExecutionException e) { + Thread.currentThread().interrupt(); + } + } +} diff --git a/src/webcrawler/workers/ParseLinksWorker.java b/src/webcrawler/workers/ParseLinksWorker.java new file mode 100644 index 0000000..66d085d --- /dev/null +++ b/src/webcrawler/workers/ParseLinksWorker.java @@ -0,0 +1,37 @@ +package webcrawler.workers; + +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; +import webcrawler.crawl.Html; +import webcrawler.crawl.HtmlPageParser; +import webcrawler.source.PageSourceReader; + +public class ParseLinksWorker extends SwingWorker { + + private final Html rootPage; + private final DefaultTableModel linksTable; + + private final PageSourceReader pageReader; + private final HtmlPageParser pageParser; + + public ParseLinksWorker(Html rootPage, DefaultTableModel linksTable, + PageSourceReader pageReader, HtmlPageParser pageParser) { + this.rootPage = rootPage; + this.linksTable = linksTable; + this.pageReader = pageReader; + this.pageParser = pageParser; + } + + + @Override + protected Void doInBackground() throws Exception { + for (String link : rootPage.getLinks()) { + if (pageReader.isHtml(link)) { + final String linkText = pageReader.readPageSource(link); + Html htmlLink = pageParser.parse(linkText, link); + linksTable.addRow(new Object[]{link, htmlLink.getTitle()}); + } + } + return null; + } +} From 61510c1381ee7976000a2c0a19131378e162b56f Mon Sep 17 00:00:00 2001 From: oleksii Date: Wed, 26 Dec 2018 22:53:55 +0200 Subject: [PATCH 7/8] fifth phase frame implementation --- src/webcrawler/WebCrawlerWindow.java | 32 ++++++++++++++++--- src/webcrawler/crawl/Html.java | 2 +- src/webcrawler/crawl/HtmlBuilder.java | 3 +- src/webcrawler/crawl/HtmlPageParser.java | 2 +- src/webcrawler/source/PageSourceReader.java | 24 ++++++++++---- src/webcrawler/workers/ExportTableWorker.java | 32 +++++++++++++++++++ src/webcrawler/workers/ParseLinksWorker.java | 9 +++++- 7 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 src/webcrawler/workers/ExportTableWorker.java diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index 5f0d1ff..0215da0 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -3,6 +3,7 @@ import java.awt.ComponentOrientation; import java.awt.Dimension; import java.awt.FlowLayout; +import java.nio.file.Paths; import javax.swing.BorderFactory; import javax.swing.BoxLayout; import javax.swing.JButton; @@ -16,6 +17,7 @@ import javax.swing.table.DefaultTableModel; import webcrawler.crawl.HtmlPageParser; import webcrawler.source.PageSourceReader; +import webcrawler.workers.ExportTableWorker; import webcrawler.workers.PageLoadWorker; public class WebCrawlerWindow extends JFrame { @@ -27,8 +29,11 @@ public class WebCrawlerWindow extends JFrame { private final JLabel titleLabelInfo; private final JLabel titleLabel; private final JLabel urlLabel; + private final JLabel exportLabel; + private final JTextField exportLocation; + private final JButton exportButton; - private final DefaultTableModel model; + private final DefaultTableModel tableModel; private final PageSourceReader pageReader; private final HtmlPageParser pageParser; @@ -43,9 +48,12 @@ public WebCrawlerWindow(HtmlPageParser pageParser, PageSourceReader pageReader) this.location = new JTextField("https://wikipedia.org"); this.goButton = new JButton("Parse"); this.table = new JTable(); - this.model = new DefaultTableModel(new String[]{"URL", "Title"}, 0); + this.tableModel = new DefaultTableModel(new String[]{"URL", "Title"}, 0); this.titleLabelInfo = new JLabel("Title: "); this.titleLabel = new JLabel(); + this.exportLabel = new JLabel("Export: "); + this.exportLocation = new JTextField("export/links.txt"); + this.exportButton = new JButton("Save"); initLayout(); initActions(); @@ -56,10 +64,18 @@ private void initActions() { goButton.addActionListener(e -> SwingUtilities.invokeLater(() -> { String url = location.getText(); - PageLoadWorker pageLoadWorker = new PageLoadWorker(url, titleLabel, model, pageReader, + PageLoadWorker pageLoadWorker = new PageLoadWorker(url, titleLabel, tableModel, + pageReader, pageParser); pageLoadWorker.execute(); })); + + exportButton.addActionListener(e -> + SwingUtilities.invokeLater(() -> { + String file = exportLocation.getText(); + ExportTableWorker pageLoadWorker = new ExportTableWorker(Paths.get(file), tableModel); + pageLoadWorker.execute(); + })); } private void initLayout() { @@ -72,7 +88,7 @@ private void initLayout() { locationPanel.add(location); locationPanel.add(goButton); - table.setModel(model); + table.setModel(tableModel); var titlePanel = new JPanel(); titlePanel.setLayout(new FlowLayout(FlowLayout.LEFT)); titlePanel.setComponentOrientation(ComponentOrientation.LEFT_TO_RIGHT); @@ -88,10 +104,18 @@ private void initLayout() { areaPanel.setBorder(BorderFactory.createEmptyBorder(10, 10, 10, 10)); areaPanel.add(scrollPane); + var exportPanel = new JPanel(); + exportPanel.setLayout(new BoxLayout(exportPanel, BoxLayout.LINE_AXIS)); + exportPanel.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5)); + exportPanel.add(exportLabel); + exportPanel.add(exportLocation); + exportPanel.add(exportButton); + rootPanel.setLayout(new BoxLayout(rootPanel, BoxLayout.PAGE_AXIS)); rootPanel.add(locationPanel); rootPanel.add(titlePanel); rootPanel.add(areaPanel); + rootPanel.add(exportPanel); pack(); } diff --git a/src/webcrawler/crawl/Html.java b/src/webcrawler/crawl/Html.java index e2603d2..3b5c79d 100644 --- a/src/webcrawler/crawl/Html.java +++ b/src/webcrawler/crawl/Html.java @@ -7,7 +7,7 @@ public class Html { private final String title; private final Set links; - public Html(String title, Set links) { + Html(String title, Set links) { this.title = title; this.links = links; } diff --git a/src/webcrawler/crawl/HtmlBuilder.java b/src/webcrawler/crawl/HtmlBuilder.java index 95c8781..cdca2b0 100644 --- a/src/webcrawler/crawl/HtmlBuilder.java +++ b/src/webcrawler/crawl/HtmlBuilder.java @@ -4,11 +4,10 @@ public class HtmlBuilder { - private String title; private Set links; - public HtmlBuilder() { + HtmlBuilder() { title = "no title"; links = Set.of(); } diff --git a/src/webcrawler/crawl/HtmlPageParser.java b/src/webcrawler/crawl/HtmlPageParser.java index cd93962..1a17b38 100644 --- a/src/webcrawler/crawl/HtmlPageParser.java +++ b/src/webcrawler/crawl/HtmlPageParser.java @@ -1,6 +1,5 @@ package webcrawler.crawl; -import java.util.ArrayList; import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -17,6 +16,7 @@ public HtmlPageParser() { public Html parse(String siteText, String rootHost) { HtmlBuilder htmlBuilder = new HtmlBuilder(); + buildTitle(htmlBuilder, siteText); buildLinks(htmlBuilder, siteText, rootHost); diff --git a/src/webcrawler/source/PageSourceReader.java b/src/webcrawler/source/PageSourceReader.java index d6f66c4..3e5d3dc 100644 --- a/src/webcrawler/source/PageSourceReader.java +++ b/src/webcrawler/source/PageSourceReader.java @@ -9,12 +9,14 @@ public class PageSourceReader { - public static final String LINE_SEPARATOR = System.getProperty("line.separator"); - public static final String TEXT_HTML = "text/html"; + private static final String LINE_SEPARATOR = System.getProperty("line.separator"); + private static final String TEXT_HTML = "text/html"; + public static final String USER_AGENT_MOZILLA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"; public boolean isHtml(String url) { try { URLConnection urlConnection = new URL(url).openConnection(); + urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA); if (urlConnection.getContentType() != null && urlConnection.getContentType() .contains(TEXT_HTML)) { return true; @@ -27,8 +29,20 @@ public boolean isHtml(String url) { } public String readPageSource(String url) { + final URLConnection urlConnection; + try { + urlConnection = new URL(url).openConnection(); + urlConnection.setRequestProperty("User-Agent", USER_AGENT_MOZILLA); + return readFromConnection(urlConnection); + } catch (IOException e) { + System.out.println("Error on parse page " + url); + return ""; + } + } + + private String readFromConnection(URLConnection urlConnection) { try ( - InputStream inputStream = new URL(url).openStream(); + InputStream inputStream = urlConnection.getInputStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)) ) { final StringBuilder stringBuilder = new StringBuilder(); @@ -38,9 +52,7 @@ public String readPageSource(String url) { stringBuilder.append(LINE_SEPARATOR); } return stringBuilder.toString(); - } catch ( - IOException e1) { - System.out.println("Error on parse page " + url); + } catch (IOException e) { return ""; } } diff --git a/src/webcrawler/workers/ExportTableWorker.java b/src/webcrawler/workers/ExportTableWorker.java new file mode 100644 index 0000000..7a6924e --- /dev/null +++ b/src/webcrawler/workers/ExportTableWorker.java @@ -0,0 +1,32 @@ +package webcrawler.workers; + +import java.io.BufferedWriter; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import javax.swing.SwingWorker; +import javax.swing.table.DefaultTableModel; + +public class ExportTableWorker extends SwingWorker { + + private Path exportPath; + private DefaultTableModel tableModel; + + public ExportTableWorker(Path exportPath, DefaultTableModel tableModel) { + this.exportPath = exportPath; + this.tableModel = tableModel; + } + + @Override + protected Void doInBackground() throws Exception { + Files.createDirectories(exportPath.getParent()); + try (BufferedWriter bufferedWriter = Files.newBufferedWriter(exportPath); + PrintWriter printWriter = new PrintWriter(bufferedWriter)) { + int rowCount = tableModel.getRowCount(); + for (int i = 0; i < rowCount; i++) { + printWriter.printf("%s\n%s\n", tableModel.getValueAt(i, 0), tableModel.getValueAt(i, 1)); + } + } + return null; + } +} diff --git a/src/webcrawler/workers/ParseLinksWorker.java b/src/webcrawler/workers/ParseLinksWorker.java index 66d085d..65ab6ee 100644 --- a/src/webcrawler/workers/ParseLinksWorker.java +++ b/src/webcrawler/workers/ParseLinksWorker.java @@ -1,5 +1,7 @@ package webcrawler.workers; +import java.util.List; +import java.util.Vector; import javax.swing.SwingWorker; import javax.swing.table.DefaultTableModel; import webcrawler.crawl.Html; @@ -25,13 +27,18 @@ public ParseLinksWorker(Html rootPage, DefaultTableModel linksTable, @Override protected Void doInBackground() throws Exception { + Vector> allData = new Vector<>(); for (String link : rootPage.getLinks()) { if (pageReader.isHtml(link)) { + Vector res = new Vector<>(); final String linkText = pageReader.readPageSource(link); Html htmlLink = pageParser.parse(linkText, link); - linksTable.addRow(new Object[]{link, htmlLink.getTitle()}); + res.add(link); + res.add(htmlLink.getTitle()); + allData.add(res); } } + linksTable.setDataVector(allData, new Vector<>(List.of("URL", "Title"))); return null; } } From ee09f629db1e7325546abb97d9ebc4e5a2659ebe Mon Sep 17 00:00:00 2001 From: oleksii Date: Thu, 27 Dec 2018 23:36:41 +0200 Subject: [PATCH 8/8] fifth phase frame implementation fixed review comments --- src/webcrawler/WebCrawlerWindow.java | 1 - src/webcrawler/workers/ParseLinksWorker.java | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/webcrawler/WebCrawlerWindow.java b/src/webcrawler/WebCrawlerWindow.java index 0215da0..aa9c7f6 100644 --- a/src/webcrawler/WebCrawlerWindow.java +++ b/src/webcrawler/WebCrawlerWindow.java @@ -22,7 +22,6 @@ public class WebCrawlerWindow extends JFrame { - private final JTable table; private final JTextField location; private final JButton goButton; diff --git a/src/webcrawler/workers/ParseLinksWorker.java b/src/webcrawler/workers/ParseLinksWorker.java index 65ab6ee..ce090fb 100644 --- a/src/webcrawler/workers/ParseLinksWorker.java +++ b/src/webcrawler/workers/ParseLinksWorker.java @@ -2,13 +2,14 @@ import java.util.List; import java.util.Vector; +import java.util.concurrent.ExecutionException; import javax.swing.SwingWorker; import javax.swing.table.DefaultTableModel; import webcrawler.crawl.Html; import webcrawler.crawl.HtmlPageParser; import webcrawler.source.PageSourceReader; -public class ParseLinksWorker extends SwingWorker { +public class ParseLinksWorker extends SwingWorker>, Void> { private final Html rootPage; private final DefaultTableModel linksTable; @@ -26,7 +27,7 @@ public ParseLinksWorker(Html rootPage, DefaultTableModel linksTable, @Override - protected Void doInBackground() throws Exception { + protected Vector> doInBackground() throws Exception { Vector> allData = new Vector<>(); for (String link : rootPage.getLinks()) { if (pageReader.isHtml(link)) { @@ -38,7 +39,16 @@ protected Void doInBackground() throws Exception { allData.add(res); } } - linksTable.setDataVector(allData, new Vector<>(List.of("URL", "Title"))); - return null; + return allData; + } + + @Override + protected void done() { + try { + Vector> allData = get(); + linksTable.setDataVector(allData, new Vector<>(List.of("URL", "Title"))); + } catch (InterruptedException | ExecutionException e) { + Thread.currentThread().interrupt(); + } } }