20 changes: 10 additions & 10 deletions Pipfile.lock
Member

Are these version changes needed? You only bumped up the minimum Python version.

Contributor Author

Most of them aren't necessary, but the jinja2 version change is necessary as a security update (the repo currently has 4 Dependabot alerts for security vulnerabilities in the current jinja2 version). I had simply run pipenv update while making my changes for this PR.

Member

Make the dependency changes in a different PR

Some generated files are not rendered by default.

121 changes: 75 additions & 46 deletions pittapi/news.py
@@ -21,48 +21,20 @@

import math
from requests_html import Element, HTMLResponse, HTMLSession
from typing import Literal, NamedTuple
from typing import NamedTuple

NUM_ARTICLES_PER_PAGE = 20

NEWS_BY_CATEGORY_URL = (
"https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
PITT_BASE_URL = "https://www.pitt.edu"
PITTWIRE_URL = PITT_BASE_URL + "/pittwire"
FEATURES_ARTICLES_URL = PITTWIRE_URL + "/news/features-articles"
NEWS_BY_CATEGORY_URL = PITTWIRE_URL + (
"/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
"&title={query}&field_category_target_id=All&page={page_num}"
)
PITT_BASE_URL = "https://www.pitt.edu"

Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
Topic = Literal[
"university-news",
"health-and-wellness",
"technology-and-science",
"arts-and-humanities",
"community-impact",
"innovation-and-research",
"global",
"diversity-equity-and-inclusion",
"our-city-our-campus",
"teaching-and-learning",
"space",
"ukraine",
"sustainability",
]

TOPIC_ID_MAP: dict[Topic, int] = {
"university-news": 432,
"health-and-wellness": 2,
"technology-and-science": 391,
"arts-and-humanities": 4,
"community-impact": 6,
"innovation-and-research": 1,
"global": 9,
"diversity-equity-and-inclusion": 8,
"our-city-our-campus": 12,
"teaching-and-learning": 7,
"space": 440,
"ukraine": 441,
"sustainability": 470,
}
CATEGORY_URL_NAME_MAP: dict[str, str] | None = None
TOPIC_ID_MAP: dict[str, int] | None = None

sess = HTMLSession()

@@ -87,18 +59,49 @@ def from_html(cls, article_html: Element) -> Article:
return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)


def _get_page_articles(
topic: Topic,
category: Category,
query: str,
year: int | None,
page_num: int,
) -> list[Article]:
def _scrape_categories() -> dict[str, str]:
response: HTMLResponse = sess.get(PITTWIRE_URL)
category_menu: Element = response.html.find("div#block-views-block-category-menu-category-menu", first=True)
category_list: list[Element] = category_menu.find("ul.hamburger-menu-list li")
category_map: dict[str, str] = {}
for category in category_list:
category_link: Element = category.find("a", first=True)
category_url_name = category_link.attrs["href"].split("/")[-1]
category_map[category.text.strip()] = category_url_name
if not category_map:
raise RuntimeError("No categories found, please open a GitHub issue")
return category_map


def _scrape_topics() -> dict[str, int]:
response: HTMLResponse = sess.get(FEATURES_ARTICLES_URL)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
topic_fieldset: Element = main_content.find("fieldset.form-item-field-topics-target-id", first=True)
topic_options: list[Element] = topic_fieldset.find("option")
topic_map: dict[str, int] = {}
for topic_option in topic_options:
if (topic_id := topic_option.attrs["value"].strip()) == "All": # Skip placeholder "Topics" option
continue
topic_name = topic_option.text.strip()
topic_map[topic_name] = int(topic_id)
if not topic_map:
raise RuntimeError("No topics found, please open a GitHub issue")
return topic_map


def _get_page_articles(topic: str, category: str, query: str, year: int | None, page_num: int) -> list[Article]:
assert CATEGORY_URL_NAME_MAP is not None
assert TOPIC_ID_MAP is not None
year_str = str(year) if year else ""
page_num_str = str(page_num) if page_num else ""

response: HTMLResponse = sess.get(
NEWS_BY_CATEGORY_URL.format(
category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
category=CATEGORY_URL_NAME_MAP[category],
topic_id=TOPIC_ID_MAP[topic],
year=year_str,
query=query,
page_num=page_num_str,
)
)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
@@ -107,13 +110,39 @@ def _get_page_articles(
return page_articles


def get_categories() -> list[str]:
global CATEGORY_URL_NAME_MAP
if not CATEGORY_URL_NAME_MAP:
CATEGORY_URL_NAME_MAP = _scrape_categories()
return list(CATEGORY_URL_NAME_MAP.keys())


def get_topics() -> list[str]:
global TOPIC_ID_MAP
if not TOPIC_ID_MAP:
TOPIC_ID_MAP = _scrape_topics()
return list(TOPIC_ID_MAP.keys())


def get_articles_by_topic(
topic: Topic,
category: Category = "features-articles",
topic: str,
category: str = "Features & Articles",
query: str = "",
year: int | None = None,
max_num_results: int = NUM_ARTICLES_PER_PAGE,
) -> list[Article]:
global TOPIC_ID_MAP
if not TOPIC_ID_MAP:
TOPIC_ID_MAP = _scrape_topics()
if topic not in TOPIC_ID_MAP:
raise ValueError(f"'{topic}' is not a valid topic, must be one of the following: {get_topics()}")

global CATEGORY_URL_NAME_MAP
if not CATEGORY_URL_NAME_MAP:
CATEGORY_URL_NAME_MAP = _scrape_categories()
if category not in CATEGORY_URL_NAME_MAP:
raise ValueError(f"'{category}' is not a valid category, must be one of the following: {get_categories()}")

num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)

# Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
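For reviewers, here is a minimal usage sketch of the reworked news API, based on the function signatures in this diff. The import path pittapi.news and the reachability of Pittwire are assumptions; the topic, category, and query values mirror the ones used in the tests below, and the printed fields come from the Article NamedTuple above.

from pittapi import news  # assumed import path for this module

# Category and topic names are now discovered at runtime instead of being
# hard-coded Literal types.
print(news.get_categories())  # human-readable category names scraped from Pittwire
print(news.get_topics())      # human-readable topic names, e.g. "University News"

# category defaults to "Features & Articles"; query, year, and
# max_num_results are optional filters.
articles = news.get_articles_by_topic("University News", query="fulbright", max_num_results=5)
for article in articles:
    print(article.title, article.url)

# Unrecognized topic or category names raise ValueError listing the valid options.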
4 changes: 2 additions & 2 deletions requirements.txt
@@ -4,7 +4,7 @@ black==24.10.0
certifi==2024.12.14
cfgv==3.4.0
charset-normalizer==3.4.0
click==8.1.7
click==8.1.8
coverage[toml]==7.6.9
distlib==0.3.9
docutils==0.21.2
@@ -14,7 +14,7 @@ identify==2.6.3
idna==3.10
imagesize==1.4.1
iniconfig==2.0.0
jinja2==3.1.4
jinja2==3.1.5
markupsafe==3.0.2
mccabe==0.7.0
mypy-extensions==1.0.0
70 changes: 65 additions & 5 deletions tests/news_test.py
@@ -30,6 +30,10 @@
class NewsTest(unittest.TestCase):
def __init__(self, *args, **kwargs):
unittest.TestCase.__init__(self, *args, **kwargs)
with (SAMPLE_PATH / "news_pittwire.html").open() as f:
self.pittwire = f.read()
with (SAMPLE_PATH / "news_features_articles.html").open() as f:
self.features_articles = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f:
self.university_news_features_articles_page_0 = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f:
@@ -39,16 +43,38 @@ def __init__(self, *args, **kwargs):
with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f:
self.university_news_features_articles_2020 = f.read()

@responses.activate
def test_get_categories(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)

categories = news.get_categories()

self.assertEqual(len(categories), 4)

@responses.activate
def test_get_topics(self):
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

topics = news.get_topics()

self.assertEqual(len(topics), 13)

@responses.activate
def test_get_articles_by_topic(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news")
university_news_articles = news.get_articles_by_topic("University News")

self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE)
self.assertEqual(
@@ -75,14 +101,18 @@ def test_get_articles_by_topic(self):
@responses.activate
def test_get_articles_by_topic_query(self):
query = "fulbright"
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value="
f"&title={query}&field_category_target_id=All",
body=self.university_news_features_articles_fulbright,
)

university_news_articles = news.get_articles_by_topic("university-news", query=query)
university_news_articles = news.get_articles_by_topic("University News", query=query)

self.assertEqual(len(university_news_articles), 3)
self.assertEqual(
@@ -115,14 +145,18 @@ def test_get_articles_by_topic_query(self):
@responses.activate
def test_get_articles_by_topic_year(self):
year = 2020
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}"
"&title=&field_category_target_id=All",
body=self.university_news_features_articles_2020,
)

university_news_articles = news.get_articles_by_topic("university-news", year=year)
university_news_articles = news.get_articles_by_topic("University News", year=year)

self.assertEqual(len(university_news_articles), 5)
self.assertEqual(
@@ -152,14 +186,18 @@ def test_get_articles_by_topic_year(self):
@responses.activate
def test_get_articles_by_topic_less_than_one_page(self):
num_results = 5
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -186,6 +224,10 @@ def test_get_articles_by_topic_less_than_one_page(self):
@responses.activate
def test_get_articles_by_topic_multiple_pages(self):
num_results = news.NUM_ARTICLES_PER_PAGE + 5
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -199,7 +241,7 @@ def test_get_articles_by_topic_multiple_pages(self):
body=self.university_news_features_articles_page_1,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
@@ -227,3 +269,21 @@ def test_get_articles_by_topic_multiple_pages(self):
],
),
)

@responses.activate
def test_get_articles_by_topic_invalid_category(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

self.assertRaises(ValueError, news.get_articles_by_topic, "University News", "Invalid Category")

@responses.activate
def test_get_articles_by_topic_invalid_topic(self):
if not news.CATEGORY_URL_NAME_MAP:
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
if not news.TOPIC_ID_MAP:
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

self.assertRaises(ValueError, news.get_articles_by_topic, "Invalid Topic")
1,839 changes: 1,839 additions & 0 deletions tests/samples/news_features_articles.html

Large diffs are not rendered by default.

1,942 changes: 1,942 additions & 0 deletions tests/samples/news_pittwire.html

Large diffs are not rendered by default.
