From 24f3f61ee872cc5d99a159d5bef40bca31e112fe Mon Sep 17 00:00:00 2001
From: Yushuhuan <22215064@zju.edu.cn>
Date: Thu, 19 Dec 2024 19:51:19 +0800
Subject: [PATCH 1/2] change function schoolarParser for accurate search

---
 PyPaperBot/HTMLparsers.py | 54 ++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 23 deletions(-)
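
The widened selector can be sanity-checked with a minimal sketch (the
two-line HTML snippet below is illustrative, not real Scholar markup):

    from bs4 import BeautifulSoup

    html = (
        '<div class="gs_r gs_or gs_scl">regular result</div>'
        '<div class="gs_r gs_or gs_scl gs_fmar">lone result</div>'
    )
    soup = BeautifulSoup(html, "html.parser")
    # A list passed to class_ matches a tag when any listed string equals
    # one of its classes or its full class attribute value, so both divs
    # are found here.
    divs = soup.findAll(
        "div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
    )
    assert len(divs) == 2

A bare string like class_="gs_r gs_or gs_scl" only matches that exact class
attribute value, which is presumably why a lone result carrying the extra
gs_fmar class was skipped before this change.
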
diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py
index 937ce86..dc90e06 100644
--- a/PyPaperBot/HTMLparsers.py
+++ b/PyPaperBot/HTMLparsers.py
@@ -6,10 +6,13 @@
 """
 from bs4 import BeautifulSoup
 
+
 def schoolarParser(html):
     result = []
     soup = BeautifulSoup(html, "html.parser")
-    for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
+    for element in soup.findAll(
+        "div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
+    ):  # "gs_r gs_or gs_scl gs_fmar" appears when the page has only one result
         if isBook(element) == False:
             title = None
             link = None
@@ -25,13 +28,15 @@ def schoolarParser(html):
                     link = a.get("href")
                     found = True
             for a in element.findAll("a"):
-               if "Cited by" in a.text:
-                   cites = int(a.text[8:])
-               if "[PDF]" in a.text:
-                   link_pdf = a.get("href")
+                if "Cited by" in a.text:
+                    cites = int(a.text[8:])
+                if "[PDF]" in a.text:
+                    link_pdf = a.get("href")
             for div in element.findAll("div", class_="gs_a"):
                 try:
-                    authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
+                    authors, source_and_year, source = div.text.replace(
+                        '\u00A0', ' '
+                    ).split(" - ")
                 except ValueError:
                     continue
@@ -48,27 +53,28 @@ def schoolarParser(html):
                     year = None
                 else:
                     year = str(year)
-            if title!=None:
-                result.append({
-                    'title' : title,
-                    'link' : link,
-                    'cites' : cites,
-                    'link_pdf' : link_pdf,
-                    'year' : year,
-                    'authors' : authors})
+            if title != None:
+                result.append(
+                    {
+                        'title': title,
+                        'link': link,
+                        'cites': cites,
+                        'link_pdf': link_pdf,
+                        'year': year,
+                        'authors': authors,
+                    }
+                )
 
     return result
 
-
 def isBook(tag):
     result = False
     for span in tag.findAll("span", class_="gs_ct2"):
-        if span.text=="[B]":
+        if span.text == "[B]":
             result = True
     return result
 
-
 def getSchiHubPDF(html):
     result = None
     soup = BeautifulSoup(html, "html.parser")
 
@@ -76,17 +82,18 @@ def getSchiHubPDF(html):
     iframe = soup.find(id='pdf')
     plugin = soup.find(id='plugin')
 
-    if iframe!=None:
+    if iframe != None:
         result = iframe.get("src")
 
-    if plugin!=None and result==None:
+    if plugin != None and result == None:
         result = plugin.get("src")
 
-    if result!=None and result[0]!="h":
-        result = "https:"+result
+    if result != None and result[0] != "h":
+        result = "https:" + result
 
     return result
 
+
 def SciHubUrls(html):
     result = []
     soup = BeautifulSoup(html, "html.parser")
@@ -94,8 +101,9 @@ def SciHubUrls(html):
     for ul in soup.findAll("ul"):
         for a in ul.findAll("a"):
             link = a.get("href")
-            if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
+            if link.startswith("https://sci-hub.") or link.startswith(
+                "http://sci-hub."
+            ):
                 result.append(link)
 
     return result
-

From cf04533c8743c3e780d9cd40678d9ba7e8302459 Mon Sep 17 00:00:00 2001
From: Yushuhuan <22215064@zju.edu.cn>
Date: Thu, 19 Dec 2024 20:09:50 +0800
Subject: [PATCH 2/2] change function scholarParser (previously named
 schoolarParser) for accurate search

---
 PyPaperBot/HTMLparsers.py |  2 +-
 PyPaperBot/Scholar.py     | 44 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 17 deletions(-)
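
Since the parser's public name changes here, any code importing the old name
has to move to the new one, just as Scholar.py does below (a minimal sketch;
page.html stands in for a saved Scholar results page):

    from PyPaperBot.HTMLparsers import scholarParser  # was: schoolarParser

    with open("page.html", encoding="utf-8") as f:
        papers = scholarParser(f.read())
    # Each entry is a dict with 'title', 'link', 'cites', 'link_pdf',
    # 'year' and 'authors' keys, as assembled in HTMLparsers.py.
    for p in papers:
        print(p['title'], p['year'], p['cites'])
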
"https://"): + url = query to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)