56 changes: 32 additions & 24 deletions PyPaperBot/HTMLparsers.py
@@ -6,10 +6,13 @@
"""
from bs4 import BeautifulSoup

def schoolarParser(html):

def scholarParser(html):
result = []
soup = BeautifulSoup(html, "html.parser")
for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
for element in soup.findAll(
"div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
): # "gs_r gs_or gs_scl gs_fmar" for only one search result
if isBook(element) == False:
title = None
link = None
@@ -25,13 +28,15 @@ def schoolarParser(html):
link = a.get("href")
found = True
for a in element.findAll("a"):
if "Cited by" in a.text:
cites = int(a.text[8:])
if "[PDF]" in a.text:
link_pdf = a.get("href")
if "Cited by" in a.text:
cites = int(a.text[8:])
if "[PDF]" in a.text:
link_pdf = a.get("href")
for div in element.findAll("div", class_="gs_a"):
try:
authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
authors, source_and_year, source = div.text.replace(
'\u00A0', ' '
).split(" - ")
except ValueError:
continue

@@ -48,54 +53,57 @@
year = None
else:
year = str(year)
if title!=None:
result.append({
'title' : title,
'link' : link,
'cites' : cites,
'link_pdf' : link_pdf,
'year' : year,
'authors' : authors})
if title != None:
result.append(
{
'title': title,
'link': link,
'cites': cites,
'link_pdf': link_pdf,
'year': year,
'authors': authors,
}
)
return result



def isBook(tag):
result = False
for span in tag.findAll("span", class_="gs_ct2"):
if span.text=="[B]":
if span.text == "[B]":
result = True
return result



def getSchiHubPDF(html):
result = None
soup = BeautifulSoup(html, "html.parser")

iframe = soup.find(id='pdf')
plugin = soup.find(id='plugin')

if iframe!=None:
if iframe != None:
result = iframe.get("src")

if plugin!=None and result==None:
if plugin != None and result == None:
result = plugin.get("src")

if result!=None and result[0]!="h":
result = "https:"+result
if result != None and result[0] != "h":
result = "https:" + result

return result


def SciHubUrls(html):
result = []
soup = BeautifulSoup(html, "html.parser")

for ul in soup.findAll("ul"):
for a in ul.findAll("a"):
link = a.get("href")
if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
if link.startswith("https://sci-hub.") or link.startswith(
"http://sci-hub."
):
result.append(link)

return result

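For orientation, a minimal sketch of how the renamed scholarParser could be exercised against a saved Scholar results page. Only the scholarParser(html) signature and the keys of the returned dicts (title, link, cites, link_pdf, year, authors) come from the diff above; the file name and the loop are illustrative assumptions.

# Hedged usage sketch: "scholar_page.html" is an assumed, locally saved
# Google Scholar results page; scholarParser is the function renamed above.
from PyPaperBot.HTMLparsers import scholarParser

with open("scholar_page.html", encoding="utf-8") as f:
    html = f.read()

papers = scholarParser(html)  # list of dicts, one per non-book result
for p in papers:
    # keys set in the parser: title, link, cites, link_pdf, year, authors
    print(p["title"], p["year"], p["cites"])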
44 changes: 28 additions & 16 deletions PyPaperBot/Scholar.py
@@ -1,24 +1,29 @@
import time
import requests
import functools
from .HTMLparsers import schoolarParser
from .HTMLparsers import scholarParser
from .Crossref import getPapersInfo
from .NetInfo import NetInfo


def waithIPchange():
while True:
inp = input('You have been blocked, try changing your IP or using a VPN. '
'Press Enter to continue downloading, or type "exit" to stop and exit....')
inp = input(
'You have been blocked, try changing your IP or using a VPN. '
'Press Enter to continue downloading, or type "exit" to stop and exit....'
)
if inp.strip().lower() == "exit":
return False
elif not inp.strip():
print("Wait 30 seconds...")
time.sleep(30)
return True


def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
javascript_error = "Sorry, we can't verify that you're not a robot when JavaScript is turned off"
javascript_error = (
"Sorry, we can't verify that you're not a robot when JavaScript is turned off"
)
to_download = []
for i in scholar_pages:
while True:
@@ -33,13 +38,15 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
else:
break

papers = schoolarParser(html)
print("\nGoogle Scholar page {} : {} papers found".format(i,scholar_results))
papers = scholarParser(html)
print("\nGoogle Scholar page {} : {} papers found".format(i, scholar_results))

if(len(papers)>0):
if len(papers) > 0:
papersInfo = getPapersInfo(papers, url, restrict, scholar_results)
info_valids = functools.reduce(lambda a,b : a+1 if b.DOI!=None else a, papersInfo, 0)
print("Papers found on Crossref: {}/{}\n".format(info_valids,len(papers)))
info_valids = functools.reduce(
lambda a, b: a + 1 if b.DOI != None else a, papersInfo, 0
)
print("Papers found on Crossref: {}/{}\n".format(info_valids, len(papers)))

to_download.append(papersInfo)
else:
@@ -48,15 +55,20 @@ def scholar_requests(scholar_pages, url, restrict, scholar_results=10):
return to_download


def ScholarPapersInfo(
query, scholar_pages, restrict, min_date=None, scholar_results=10
):

def ScholarPapersInfo(query, scholar_pages, restrict, min_date=None, scholar_results=10):

url = r"https://scholar.google.com/scholar?hl=en&q="+query+"&as_vis=1&as_sdt=1,5&start=%d"
if min_date!=None:
url += "&as_ylo="+str(min_date)
url = (
r"https://scholar.google.com/scholar?hl=en&q="
+ query
+ "&as_vis=1&as_sdt=1,5&start=%d"
)
if min_date != None:
url += "&as_ylo=" + str(min_date)

if len(query)>7 and (query[0:7]=="http://" or query[0:8]=="https://"):
url = query
if len(query) > 7 and (query[0:7] == "http://" or query[0:8] == "https://"):
url = query

to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)

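As a usage note, a hedged sketch of how the reformatted entry point might be invoked after this change. The parameter names, the &as_ylo handling, and the http(s) pass-through come from the hunk above; the argument values are made up, and calling it issues live requests to Google Scholar and Crossref, which is what waithIPchange exists to cope with.

# Hypothetical invocation; only the parameter names and defaults are taken
# from ScholarPapersInfo above, the argument values are illustrative.
from PyPaperBot.Scholar import ScholarPapersInfo

ScholarPapersInfo(
    query="deep learning",      # a full http(s):// Scholar URL is also accepted as-is
    scholar_pages=range(1, 3),  # iterable of result pages passed to scholar_requests
    restrict=None,              # forwarded to Crossref filtering via getPapersInfo
    min_date=2018,              # appended to the search URL as &as_ylo=2018
    scholar_results=10,         # expected results per Scholar page
)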