56 changes: 32 additions & 24 deletions PyPaperBot/HTMLparsers.py
@@ -6,10 +6,13 @@
"""
from bs4 import BeautifulSoup

def schoolarParser(html):

def scholarParser(html):
result = []
soup = BeautifulSoup(html, "html.parser")
for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
for element in soup.findAll(
"div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
): # "gs_r gs_or gs_scl gs_fmar" for only one search result
if isBook(element) == False:
title = None
link = None
@@ -25,13 +28,15 @@ def schoolarParser(html):
link = a.get("href")
found = True
for a in element.findAll("a"):
if "Cited by" in a.text:
cites = int(a.text[8:])
if "[PDF]" in a.text:
link_pdf = a.get("href")
if "Cited by" in a.text:
cites = int(a.text[8:])
if "[PDF]" in a.text:
link_pdf = a.get("href")
for div in element.findAll("div", class_="gs_a"):
try:
authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
authors, source_and_year, source = div.text.replace(
'\u00A0', ' '
).split(" - ")
except ValueError:
continue

@@ -48,54 +53,57 @@
year = None
else:
year = str(year)
if title!=None:
result.append({
'title' : title,
'link' : link,
'cites' : cites,
'link_pdf' : link_pdf,
'year' : year,
'authors' : authors})
if title != None:
result.append(
{
'title': title,
'link': link,
'cites': cites,
'link_pdf': link_pdf,
'year': year,
'authors': authors,
}
)
return result



def isBook(tag):
result = False
for span in tag.findAll("span", class_="gs_ct2"):
if span.text=="[B]":
if span.text == "[B]":
result = True
return result



def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')

if iframe!=None:
if iframe != None:
result = iframe.get("src")

if plugin!=None and result==None:
if plugin != None and result == None:
result = plugin.get("src")

if result!=None and result[0]!="h":
result = "https:"+result
if result != None and result[0] != "h":
result = "https:" + result

return result


def SciHubUrls(html):
result = []
soup = BeautifulSoup(html, "html.parser")

for ul in soup.findAll("ul"):
for a in ul.findAll("a"):
link = a.get("href")
if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
if link.startswith("https://sci-hub.") or link.startswith(
"http://sci-hub."
):
result.append(link)

return result

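For orientation, a minimal sketch of how the renamed scholarParser could be exercised against a saved Scholar results page. Only the scholarParser(html) signature and the keys of the returned dicts (title, link, cites, link_pdf, year, authors) come from the diff above; the file name and the loop are illustrative assumptions.

# Hedged usage sketch: "scholar_page.html" is an assumed, locally saved
# Google Scholar results page; scholarParser is the function renamed above.
from PyPaperBot.HTMLparsers import scholarParser

with open("scholar_page.html", encoding="utf-8") as f:
    html = f.read()

papers = scholarParser(html)  # list of dicts, one per non-book result
for p in papers:
    # keys set in the parser: title, link, cites, link_pdf, year, authors
    print(p["title"], p["year"], p["cites"])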
44 changes: 28 additions & 16 deletions PyPaperBot/Scholar.py
@@ -1,24 +1,29 @@
import time
import requests
import functools
from .HTMLparsers import schoolarParser
from .HTMLparsers import scholarParser
from .Crossref import getPapersInfo
from .NetInfo import NetInfo


def waithIPchange():
while True:
inp = input('You have been blocked, try changing your IP or using a VPN. '
'Press Enter to continue downloading, or type "exit" to stop and exit....')
inp = input(
'You have been blocked, try changing your IP or using a VPN. '
'Press Enter to continue downloading, or type "exit" to stop and exit....'
)
if inp.strip().lower() == "exit":
return False
elif not inp.strip():
print("Wait 30 seconds...")
time.sleep(30)
return True


def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
javascript_error = (
"Sorry, we can't verify that you're not a robot when JavaScript is turned off"
)
to_download = []
for i in scholar_pages:
while True:
@@ -33,13 +38,15 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
else:
break

papers = schoolarParser(html)
print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results))
papers = scholarParser(html)
print("\nGoogle Scholar page {} : {} papers found".format(i, scholar_results))

if(len(papers)>0):
if len(papers) > 0:
papersInfo = getPapersInfo(papers, url, restrict, scholar_results)
info_valids = functools.reduce(lambda a,b : a+1 if b.DOI!=None else a, papersInfo, 0)
print("Papers found on Crossref: {}/{}\n".format(info_valids,len(papers)))
info_valids = functools.reduce(
lambda a, b: a + 1 if b.DOI != None else a, papersInfo, 0
)
print("Papers found on Crossref: {}/{}\n".format(info_valids, len(papers)))

to_download.append(papersInfo)
else:
@@ -48,15 +55,20 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
return to_download


def ScholarPapersInfo(
query, scholar_pages, restrict, min_date=None, scholar_results=10
):

def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10):

url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d"
if min_date!=None:
url += "&as_ylo="+str(min_date)
url = (
r"https://scholar.google.com/scholar?hl=en&q="
+ query
+ "&as_vis=1&as_sdt=1,5&start=%d"
)
if min_date != None:
url += "&as_ylo=" + str(min_date)

if len(query)>7 and (query[0:7]=="http://" or query[0:8]=="https://"):
url = query
if len(query) > 7 and (query[0:7] == "http://" or query[0:8] == "https://"):
url = query

to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)

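As a usage note, a hedged sketch of how the reformatted entry point might be invoked after this change. The parameter names, the &as_ylo handling, and the http(s) pass-through come from the hunk above; the argument values are made up, and calling it issues live requests to Google Scholar and Crossref, which is what waithIPchange exists to cope with.

# Hypothetical invocation; only the parameter names and defaults are taken
# from ScholarPapersInfo above, the argument values are illustrative.
from PyPaperBot.Scholar import ScholarPapersInfo

ScholarPapersInfo(
    query="deep learning",      # a full http(s):// Scholar URL is also accepted as-is
    scholar_pages=range(1, 3),  # iterable of result pages passed to scholar_requests
    restrict=None,              # forwarded to Crossref filtering via getPapersInfo
    min_date=2018,              # appended to the search URL as &as_ylo=2018
    scholar_results=10,         # expected results per Scholar page
)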