-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: grab_content.py
More file actions
executable file
·213 lines (164 loc) · 6.71 KB
/
grab_content.py
File metadata and controls
executable file
·213 lines (164 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
"""Program designed to extract text content from a handful of popular websites"""
# -*- coding: utf-8 -*-
#
# Web Scraping Program designed to extract text content from a handful of popular websites
# Created by Peter Robards -- Version 1.1
#
##########################################################################################
import sys
import requests
import tldextract
from bs4 import BeautifulSoup
# Helper: pull the registered domain out of a URL (needs tldextract).
def extract_domain(url):
    """Return the registered (second-level) domain name for *url*.

    Uses the third-party ``tldextract`` library, which correctly handles
    multi-part suffixes (e.g. ".co.uk"), so "https://www.wired.com/x"
    yields "wired".
    """
    parts = tldextract.extract(url)
    return parts.domain
# Method to save output to a specified file
def output_to_file(file_name, text):
    """Write *text* to *file_name* (overwriting any existing file).

    Prints a confirmation message with the destination path.

    BUG FIX: the file is now opened with an explicit UTF-8 encoding so
    scraped article text (curly quotes, accents, etc.) is saved the same
    way on every platform instead of depending on the locale default.
    """
    with open(file_name, "w", encoding="utf-8") as out_file:
        # write the scraped content to the output file
        out_file.write(text)
    print(f"\nThe results have been saved to: {file_name} ")
# Method to set web scraping variables based on the target website
# website_list = ["arstechnica", "bleepingcomputer", "vice", "wired"]
def set_scraper_variables(domain, website_list):
    """Return the BeautifulSoup selector settings for a supported website.

    Parameters
    ----------
    domain : str
        Registered domain of the target site (e.g. "wired").
    website_list : list
        Supported domains, in the fixed order
        ["arstechnica", "bleepingcomputer", "vice", "wired"].

    Returns a 7-tuple:
        (title_wrapper, title_attr, title_attr_name,
         content_wrapper, content_attr, content_attr_name, target_wrapper)

    Exits the program via sys.exit() when *domain* is unsupported.
    """
    # Per-site selector table keyed by domain; replaces the original
    # if/elif chain with a single lookup.
    selectors = {
        website_list[0]: (
            "h1", "itemprop", "headline",
            "div", "itemprop", "articleBody", "p",
        ),
        website_list[1]: (
            "div", "class", "article_section",
            "div", "class", "articleBody", "p",
        ),
        website_list[2]: (
            "h1", "class", "smart-header__hed smart-header__hed--size-2",
            "div", "class", "article__body-components", "p",
        ),
        website_list[3]: (
            "h1", "class", "content-header__row content-header__hed",
            "div", "class", "article__chunks", "p",
        ),
    }
    try:
        return selectors[domain]
    except KeyError:
        # BUG FIX: the original line was missing the f-string prefix, so
        # the literal text "{domain}" was printed instead of the value.
        print(f"\nERROR -> Domain: {domain} is not a valid entry.")
        print("\nValid entries include: ")
        print(f"{website_list}")
        sys.exit("\nPlease check URL and retry - thank you!\n")
# Method to remove unwanted content from specific websites
def remove_unwanted_content(domain, soup_object):
    """Strip site-specific boilerplate from *soup_object* and return it.

    Only bleepingcomputer needs cleanup today: its "related articles" box
    sits inside the article body and would otherwise be scraped as text.
    All other domains pass through unchanged.
    """
    if domain == "bleepingcomputer":
        related = soup_object.find("div", {"class": "cz-related-article-wrapp"})
        # BUG FIX: find() returns None when the box is absent on a page;
        # calling .decompose() on None raised AttributeError. Guard first.
        if related is not None:
            related.decompose()
    return soup_object
# Method to extract title text from supported websites
def get_title(website, soup, title_wrapper, title_attr, title_attr_name):
    """Return the article title text found in *soup*.

    bleepingcomputer nests its headline in an <h1> inside the wrapper
    element and needs one extra lookup; every other supported site keeps
    the title text directly on the wrapper element.
    """
    wrapper = soup.find(title_wrapper, attrs={title_attr: title_attr_name})
    if website == "bleepingcomputer":
        return wrapper.find("h1").text
    return wrapper.text
# Method to extract the text content from supported websites
def get_text(soup, content_wrapper, content_attr, content_attr_name, target_wrapper):
    """Return the article body text concatenated from *soup*.

    Finds the content container identified by the wrapper/attribute
    arguments, collects every *target_wrapper* element (normally <p>)
    inside it, and joins their text, prefixing each paragraph with a
    blank line and suffixing it with a space (matching the original
    output format exactly).
    """
    container = soup.find(content_wrapper, {content_attr: content_attr_name})
    paragraphs = container.find_all(target_wrapper)
    # Build pieces in a list and join once instead of repeated string
    # concatenation; find_all(string=True) replaces the deprecated
    # findAll(text=True) BeautifulSoup spelling.
    pieces = [
        "\n\n" + "".join(element.find_all(string=True)) + " "
        for element in paragraphs
    ]
    return "".join(pieces)
# Method to extract content title and full targeted text from supported websites
def get_content(url, website, site_list):
    """Download *url* and return (title, article_text) for a supported site.

    Parameters
    ----------
    url : str
        Full URL of the article to scrape.
    website : str
        Registered domain of the site (e.g. "vice").
    site_list : list
        Supported domains, in the order expected by set_scraper_variables().

    Raises requests.HTTPError on a non-2xx response and exits (via
    set_scraper_variables) when *website* is unsupported.
    """
    # BUG FIX: bound the request so a hung server cannot stall the program
    # forever, and fail loudly on HTTP errors instead of silently parsing
    # an error page as if it were the article.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # set scraper variables based on name of target website
    (
        title_wrapper,
        title_attr,
        title_attr_name,
        content_wrapper,
        content_attr,
        content_attr_name,
        target_wrapper,
    ) = set_scraper_variables(website, site_list)
    title = get_title(website, soup, title_wrapper, title_attr, title_attr_name)
    # Remove unwanted content from soup object for a few select websites
    soup = remove_unwanted_content(website, soup)
    content_text = get_text(
        soup, content_wrapper, content_attr, content_attr_name, target_wrapper
    )
    return title, content_text
##########################################################################################
def main():
    """Main() method for program: prompt for a URL, scrape it, then print
    the article or save it to a file based on the user's choice."""
    target = input("\nPlease enter the url of webpage you wish to scrape: ")
    supported_sites = ["arstechnica", "bleepingcomputer", "vice", "wired"]
    # extract domain from url and reject unsupported sites up front
    website = extract_domain(target)
    if website not in supported_sites:
        print(f"\nERROR -> Website: {website} is not a valid entry.")
        print("\nValid entries include: ")
        print(f"{supported_sites}")
        sys.exit(1)
    title, article = get_content(target, website, supported_sites)
    while True:
        choice = input("\nWould you like to save article to file? [Yes or No] : ")
        # BUG FIX: the original indexed choice[0], which raised IndexError
        # when the user pressed Enter on an empty line. Normalize first.
        answer = choice.strip().lower()
        if answer.startswith("y"):
            outfile = input("\nPlease enter path for output file\n\t:")
            output_text = title + ". \n\n" + article + " \n\nSource: " + target
            output_to_file(outfile, output_text)
            break
        if answer.startswith("n"):
            print("\nTITLE:\n")
            print(title)
            print("\nText:")
            print(article)
            break
        print(f"\nERROR - your response {choice} is invalid!\n")
        print('\nPlease type either "Yes" or "No"!\n')
    print("\nJob Done!\n")
##########################################################################################
# Entry point: run the interactive scraper only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()