diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py
index 937ce86..a2d7c64 100644
--- a/PyPaperBot/HTMLparsers.py
+++ b/PyPaperBot/HTMLparsers.py
@@ -6,10 +6,13 @@
"""
from bs4 import BeautifulSoup
-def schoolarParser(html):
+
+def scholarParser(html):
result = []
soup = BeautifulSoup(html, "html.parser")
- for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
+ for element in soup.findAll(
+ "div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
+    ):  # "gs_r gs_or gs_scl gs_fmar" is the class used when the page has only one search result
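+        # Results that Scholar tags as books ("[B]") are skipped via isBook() below.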
if isBook(element) == False:
title = None
link = None
@@ -25,13 +28,15 @@ def schoolarParser(html):
link = a.get("href")
found = True
for a in element.findAll("a"):
- if "Cited by" in a.text:
- cites = int(a.text[8:])
- if "[PDF]" in a.text:
- link_pdf = a.get("href")
+ if "Cited by" in a.text:
+ cites = int(a.text[8:])
+ if "[PDF]" in a.text:
+ link_pdf = a.get("href")
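+                # Note: "Cited by" is 8 characters, so a.text[8:] holds the count (int() ignores the leading space).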
for div in element.findAll("div", class_="gs_a"):
try:
- authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
+ authors, source_and_year, source = div.text.replace(
+ '\u00A0', ' '
+ ).split(" - ")
except ValueError:
continue
@@ -48,27 +53,28 @@ def schoolarParser(html):
                    year = None
                else:
                    year = str(year)
-            if title!=None:
-                result.append({
-                    'title' : title,
-                    'link' : link,
-                    'cites' : cites,
-                    'link_pdf' : link_pdf,
-                    'year' : year,
-                    'authors' : authors})
+            if title != None:
+                result.append(
+                    {
+                        'title': title,
+                        'link': link,
+                        'cites': cites,
+                        'link_pdf': link_pdf,
+                        'year': year,
+                        'authors': authors,
+                    }
+                )
return result
-
def isBook(tag):
result = False
for span in tag.findAll("span", class_="gs_ct2"):
- if span.text=="[B]":
+ if span.text == "[B]":
result = True
return result
-
def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")
@@ -76,17 +82,18 @@ def getSchiHubPDF(html):
iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')
- if iframe!=None:
+ if iframe != None:
result = iframe.get("src")
- if plugin!=None and result==None:
+ if plugin != None and result == None:
result = plugin.get("src")
- if result!=None and result[0]!="h":
- result = "https:"+result
+ if result != None and result[0] != "h":
+ result = "https:" + result
return result
+
def SciHubUrls(html):
result = []
soup = BeautifulSoup(html, "html.parser")
@@ -94,8 +101,9 @@ def SciHubUrls(html):
for ul in soup.findAll("ul"):
for a in ul.findAll("a"):
link = a.get("href")
- if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
+ if link.startswith("https://sci-hub.") or link.startswith(
+ "http://sci-hub."
+ ):
result.append(link)
return result
-
diff --git a/PyPaperBot/Scholar.py b/PyPaperBot/Scholar.py
index 184b6a9..dfac72b 100644
--- a/PyPaperBot/Scholar.py
+++ b/PyPaperBot/Scholar.py
@@ -1,15 +1,17 @@
import time
import requests
import functools
-from .HTMLparsers import schoolarParser
+from .HTMLparsers import scholarParser
from .Crossref import getPapersInfo
from .NetInfo import NetInfo
def waithIPchange():
while True:
- inp = input('You have been blocked, try changing your IP or using a VPN. '
- 'Press Enter to continue downloading, or type "exit" to stop and exit....')
+ inp = input(
+        'You have been blocked. Try changing your IP or using a VPN. '
+        'Press Enter to continue downloading, or type "exit" to quit...'
+ )
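+        # "exit" returns False to stop the run; a plain Enter waits 30 seconds and returns True to resume downloading.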
if inp.strip().lower() == "exit":
return False
elif not inp.strip():
@@ -17,8 +19,11 @@ def waithIPchange():
time.sleep(30)
return True
+
def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
- javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
+ javascript_error = (
+ "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
+ )
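+    # Scholar serves this sentence instead of results when it flags the client as a robot.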
to_download = []
for i in scholar_pages:
while True:
@@ -33,13 +38,15 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
else:
break
-        papers = schoolarParser(html)
-        print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results))
+        papers = scholarParser(html)
+        print("\nGoogle Scholar page {} : {} papers found".format(i, scholar_results))
-        if(len(papers)>0):
+        if len(papers) > 0:
            papersInfo = getPapersInfo(papers, url, restrict, scholar_results)
-            info_valids = functools.reduce(lambda a,b : a+1 if b.DOI!=None else a, papersInfo, 0)
-            print("Papers found on Crossref: {}/{}\n".format(info_valids,len(papers)))
+            info_valids = functools.reduce(
+                lambda a, b: a + 1 if b.DOI != None else a, papersInfo, 0
+            )
+            print("Papers found on Crossref: {}/{}\n".format(info_valids, len(papers)))
            to_download.append(papersInfo)
        else:
@@ -48,15 +55,20 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
return to_download
-def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10):
-
-    url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d"
-    if min_date!=None:
-        url += "&as_ylo="+str(min_date)
+def ScholarPapersInfo(
+    query, scholar_pages, restrict, min_date=None, scholar_results=10
+):
+    url = (
+        r"https://scholar.google.com/scholar?hl=en&q="
+        + query
+        + "&as_vis=1&as_sdt=1,5&start=%d"
+    )
+    if min_date != None:
+        url += "&as_ylo=" + str(min_date)
-    if len(query)>7 and (query[0:7]=="http://" or query[0:8]=="https://"):
-        url = query
+    if len(query) > 7 and (query[0:7] == "http://" or query[0:8] == "https://"):
+        url = query
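+    # If the query was itself an http(s) URL, it is used verbatim in place of the generated Scholar URL.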
to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)