20 changes: 10 additions & 10 deletions Pipfile.lock
Member

Are these version changes needed? You only bumped up the minimum Python version.

Contributor Author

Most of them aren't necessary, but the jinja2 version change is necessary as a security update (the repo currently has 4 Dependabot alerts for security vulnerabilities in the current jinja2 version). I had simply run pipenv update while making my changes for this PR.

Member

Make the dependency changes in a different PR

Some generated files are not rendered by default.

121 changes: 75 additions & 46 deletions pittapi/news.py
@@ -21,48 +21,20 @@

import math
from requests_html import Element, HTMLResponse, HTMLSession
from typing import Literal, NamedTuple
from typing import NamedTuple

NUM_ARTICLES_PER_PAGE = 20

NEWS_BY_CATEGORY_URL = (
"https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
PITT_BASE_URL = "https://www.pitt.edu"
PITTWIRE_URL = PITT_BASE_URL + "/pittwire"
FEATURES_ARTICLES_URL = PITTWIRE_URL + "/news/features-articles"
NEWS_BY_CATEGORY_URL = PITTWIRE_URL + (
"/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
"&title={query}&field_category_target_id=All&page={page_num}"
)
PITT_BASE_URL = "https://www.pitt.edu"

Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
Topic = Literal[
"university-news",
"health-and-wellness",
"technology-and-science",
"arts-and-humanities",
"community-impact",
"innovation-and-research",
"global",
"diversity-equity-and-inclusion",
"our-city-our-campus",
"teaching-and-learning",
"space",
"ukraine",
"sustainability",
]

TOPIC_ID_MAP: dict[Topic, int] = {
"university-news": 432,
"health-and-wellness": 2,
"technology-and-science": 391,
"arts-and-humanities": 4,
"community-impact": 6,
"innovation-and-research": 1,
"global": 9,
"diversity-equity-and-inclusion": 8,
"our-city-our-campus": 12,
"teaching-and-learning": 7,
"space": 440,
"ukraine": 441,
"sustainability": 470,
}
CATEGORY_URL_NAME_MAP: dict[str, str] | None = None
TOPIC_ID_MAP: dict[str, int] | None = None

sess = HTMLSession()

@@ -87,18 +59,49 @@ def from_html(cls, article_html: Element) -> Article:
return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)


def _get_page_articles(
topic: Topic,
category: Category,
query: str,
year: int | None,
page_num: int,
) -> list[Article]:
def _scrape_categories() -> dict[str, str]:
response: HTMLResponse = sess.get(PITTWIRE_URL)
category_menu: Element = response.html.find("div#block-views-block-category-menu-category-menu", first=True)
category_list: list[Element] = category_menu.find("ul.hamburger-menu-list li")
category_map: dict[str, str] = {}
for category in category_list:
category_link: Element = category.find("a", first=True)
category_url_name = category_link.attrs["href"].split("/")[-1]
category_map[category.text.strip()] = category_url_name
if not category_map:
raise RuntimeError("No categories found, please open a GitHub issue")
return category_map


def _scrape_topics() -> dict[str, int]:
response: HTMLResponse = sess.get(FEATURES_ARTICLES_URL)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
topic_fieldset: Element = main_content.find("fieldset.form-item-field-topics-target-id", first=True)
topic_options: list[Element] = topic_fieldset.find("option")
topic_map: dict[str, int] = {}
for topic_option in topic_options:
if (topic_id := topic_option.attrs["value"].strip()) == "All": # Skip placeholder "Topics" option
continue
topic_name = topic_option.text.strip()
topic_map[topic_name] = int(topic_id)
if not topic_map:
raise RuntimeError("No topics found, please open a GitHub issue")
return topic_map


def _get_page_articles(topic: str, category: str, query: str, year: int | None, page_num: int) -> list[Article]:
assert CATEGORY_URL_NAME_MAP is not None
assert TOPIC_ID_MAP is not None
year_str = str(year) if year else ""
page_num_str = str(page_num) if page_num else ""

response: HTMLResponse = sess.get(
NEWS_BY_CATEGORY_URL.format(
category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
category=CATEGORY_URL_NAME_MAP[category],
topic_id=TOPIC_ID_MAP[topic],
year=year_str,
query=query,
page_num=page_num_str,
)
)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
@@ -107,13 +110,39 @@ def _get_page_articles(
return page_articles


def get_categories() -> list[str]:
global CATEGORY_URL_NAME_MAP
if not CATEGORY_URL_NAME_MAP:
CATEGORY_URL_NAME_MAP = _scrape_categories()
return list(CATEGORY_URL_NAME_MAP.keys())


def get_topics() -> list[str]:
global TOPIC_ID_MAP
if not TOPIC_ID_MAP:
TOPIC_ID_MAP = _scrape_topics()
return list(TOPIC_ID_MAP.keys())


def get_articles_by_topic(
topic: Topic,
category: Category = "features-articles",
topic: str,
category: str = "Features & Articles",
query: str = "",
year: int | None = None,
max_num_results: int = NUM_ARTICLES_PER_PAGE,
) -> list[Article]:
global TOPIC_ID_MAP
if not TOPIC_ID_MAP:
TOPIC_ID_MAP = _scrape_topics()
if topic not in TOPIC_ID_MAP:
raise ValueError(f"'{topic}' is not a valid topic, must be one of the following: {get_topics()}")

global CATEGORY_URL_NAME_MAP
if not CATEGORY_URL_NAME_MAP:
CATEGORY_URL_NAME_MAP = _scrape_categories()
if category not in CATEGORY_URL_NAME_MAP:
raise ValueError(f"'{category}' is not a valid category, must be one of the following: {get_categories()}")

num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)

# Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
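For reviewers, here is a minimal usage sketch of the reworked news API, based on the function signatures in this diff. The import path pittapi.news and the reachability of Pittwire are assumptions; the topic, category, and query values mirror the ones used in the tests below, and the printed fields come from the Article NamedTuple above.

from pittapi import news  # assumed import path for this module

# Category and topic names are now discovered at runtime instead of being
# hard-coded Literal types.
print(news.get_categories())  # human-readable category names scraped from Pittwire
print(news.get_topics())      # human-readable topic names, e.g. "University News"

# category defaults to "Features & Articles"; query, year, and
# max_num_results are optional filters.
articles = news.get_articles_by_topic("University News", query="fulbright", max_num_results=5)
for article in articles:
    print(article.title, article.url)

# Unrecognized topic or category names raise ValueError listing the valid options.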
4 changes: 2 additions & 2 deletions requirements.txt
@@ -4,7 +4,7 @@ black==24.10.0
certifi==2024.12.14
cfgv==3.4.0
charset-normalizer==3.4.0
click==8.1.7
click==8.1.8
coverage[toml]==7.6.9
distlib==0.3.9
docutils==0.21.2
@@ -14,7 +14,7 @@ identify==2.6.3
idna==3.10
imagesize==1.4.1
iniconfig==2.0.0
jinja2==3.1.4
jinja2==3.1.5
markupsafe==3.0.2
mccabe==0.7.0
mypy-extensions==1.0.0
70 changes: 65 additions & 5 deletions tests/news_test.py
@@ -30,6 +30,10 @@
class NewsTest(unittest.TestCase):
def __init__(self, *args, **kwargs):
unittest.TestCase.__init__(self, *args, **kwargs)
with (SAMPLE_PATH / "news_pittwire.html").open() as f:
self.pittwire = f.read()
with (SAMPLE_PATH / "news_features_articles.html").open() as f:
self.features_articles = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f:
self.university_news_features_articles_page_0 = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f:
@@ -39,16 +43,38 @@ def __init__(self, *args, **kwargs):
with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f:
self.university_news_features_articles_2020 = f.read()

@responses.activate
def test_get_categories(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)

categories = news.get_categories()

self.assertEqual(len(categories), 4)

@responses.activate
def test_get_topics(self):
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

topics = news.get_topics()

self.assertEqual(len(topics), 13)

@responses.activate
def test_get_articles_by_topic(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news")
university_news_articles = news.get_articles_by_topic("University News")

self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE)
self.assertEqual(
@@ -75,14 +101,18 @@ def test_get_articles_by_topic(self):
@responses.activate
def test_get_articles_by_topic_query(self):
query = "fulbright"
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value="
f"&title={query}&field_category_target_id=All",
body=self.university_news_features_articles_fulbright,
)

university_news_articles = news.get_articles_by_topic("university-news", query=query)
university_news_articles = news.get_articles_by_topic("University News", query=query)

self.assertEqual(len(university_news_articles), 3)
self.assertEqual(
@@ -115,14 +145,18 @@ def test_get_articles_by_topic_query(self):
@responses.activate
def test_get_articles_by_topic_year(self):
year = 2020
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}"
"&title=&field_category_target_id=All",
body=self.university_news_features_articles_2020,
)

university_news_articles = news.get_articles_by_topic("university-news", year=year)
university_news_articles = news.get_articles_by_topic("University News", year=year)

self.assertEqual(len(university_news_articles), 5)
self.assertEqual(
@@ -152,14 +186,18 @@ def test_get_articles_by_topic_year(self):
@responses.activate
def test_get_articles_by_topic_less_than_one_page(self):
num_results = 5
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -186,6 +224,10 @@ def test_get_articles_by_topic_less_than_one_page(self):
@responses.activate
def test_get_articles_by_topic_multiple_pages(self):
num_results = news.NUM_ARTICLES_PER_PAGE + 5
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -199,7 +241,7 @@ def test_get_articles_by_topic_multiple_pages(self):
body=self.university_news_features_articles_page_1,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -227,3 +269,21 @@ def test_get_articles_by_topic_multiple_pages(self):
],
),
)

@responses.activate
def test_get_articles_by_topic_invalid_category(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

self.assertRaises(ValueError, news.get_articles_by_topic, "University News", "Invalid Category")

@responses.activate
def test_get_articles_by_topic_invalid_topic(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

self.assertRaises(ValueError, news.get_articles_by_topic, "Invalid Topic")
1,839 changes: 1,839 additions & 0 deletions tests/samples/news_features_articles.html

Large diffs are not rendered by default.

1,942 changes: 1,942 additions & 0 deletions tests/samples/news_pittwire.html

Large diffs are not rendered by default.
