From 24f3f61ee872cc5d99a159d5bef40bca31e112fe Mon Sep 17 00:00:00 2001
From: Yushuhuan <22215064@zju.edu.cn>
Date: Thu, 19 Dec 2024 19:51:19 +0800
Subject: [PATCH 1/2] change function schoolarParser for accurate search

---
 PyPaperBot/HTMLparsers.py | 54 ++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 23 deletions(-)
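
The widened selector can be sanity-checked with a minimal sketch (the
two-line HTML snippet below is illustrative, not real Scholar markup):

    from bs4 import BeautifulSoup

    html = (
        '<div class="gs_r gs_or gs_scl">regular result</div>'
        '<div class="gs_r gs_or gs_scl gs_fmar">lone result</div>'
    )
    soup = BeautifulSoup(html, "html.parser")
    # A list passed to class_ matches a tag when any listed string equals
    # one of its classes or its full class attribute value, so both divs
    # are found here.
    divs = soup.findAll(
        "div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
    )
    assert len(divs) == 2

A bare string like class_="gs_r gs_or gs_scl" only matches that exact class
attribute value, which is presumably why a lone result carrying the extra
gs_fmar class was skipped before this change.
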
diff --git a/PyPaperBot/HTMLparsers.py b/PyPaperBot/HTMLparsers.py
index 937ce86..dc90e06 100644
--- a/PyPaperBot/HTMLparsers.py
+++ b/PyPaperBot/HTMLparsers.py
@@ -6,10 +6,13 @@
 """
 from bs4 import BeautifulSoup
 
+
 def schoolarParser(html):
     result = []
     soup = BeautifulSoup(html, "html.parser")
-    for element in soup.findAll("div", class_="gs_r gs_or gs_scl"):
+    for element in soup.findAll(
+        "div", class_=["gs_r gs_or gs_scl", "gs_r gs_or gs_scl gs_fmar"]
+    ):  # "gs_r gs_or gs_scl gs_fmar" appears when the page has only one result
         if isBook(element) == False:
             title = None
             link = None
@@ -25,13 +28,15 @@ def schoolarParser(html):
                     link = a.get("href")
                     found = True
             for a in element.findAll("a"):
-               if "Cited by" in a.text:
-                   cites = int(a.text[8:])
-               if "[PDF]" in a.text:
-                   link_pdf = a.get("href")
+                if "Cited by" in a.text:
+                    cites = int(a.text[8:])
+                if "[PDF]" in a.text:
+                    link_pdf = a.get("href")
             for div in element.findAll("div", class_="gs_a"):
                 try:
-                    authors, source_and_year, source = div.text.replace('\u00A0', ' ').split(" - ")
+                    authors, source_and_year, source = div.text.replace(
+                        '\u00A0', ' '
+                    ).split(" - ")
                 except ValueError:
                     continue
@@ -48,27 +53,28 @@ def schoolarParser(html):
                     year = None
                 else:
                     year = str(year)
-            if title!=None:
-                result.append({
-                    'title' : title,
-                    'link' : link,
-                    'cites' : cites,
-                    'link_pdf' : link_pdf,
-                    'year' : year,
-                    'authors' : authors})
+            if title != None:
+                result.append(
+                    {
+                        'title': title,
+                        'link': link,
+                        'cites': cites,
+                        'link_pdf': link_pdf,
+                        'year': year,
+                        'authors': authors,
+                    }
+                )
 
     return result
 
-
 def isBook(tag):
     result = False
     for span in tag.findAll("span", class_="gs_ct2"):
-        if span.text=="[B]":
+        if span.text == "[B]":
             result = True
     return result
 
-
 def getSchiHubPDF(html):
     result = None
     soup = BeautifulSoup(html, "html.parser")
 
@@ -76,17 +82,18 @@ def getSchiHubPDF(html):
     iframe = soup.find(id='pdf')
     plugin = soup.find(id='plugin')
 
-    if iframe!=None:
+    if iframe != None:
         result = iframe.get("src")
 
-    if plugin!=None and result==None:
+    if plugin != None and result == None:
         result = plugin.get("src")
 
-    if result!=None and result[0]!="h":
-        result = "https:"+result
+    if result != None and result[0] != "h":
+        result = "https:" + result
 
     return result
 
+
 def SciHubUrls(html):
     result = []
     soup = BeautifulSoup(html, "html.parser")
@@ -94,8 +101,9 @@ def SciHubUrls(html):
     for ul in soup.findAll("ul"):
         for a in ul.findAll("a"):
             link = a.get("href")
-            if link.startswith("https://sci-hub.") or link.startswith("http://sci-hub."):
+            if link.startswith("https://sci-hub.") or link.startswith(
+                "http://sci-hub."
+            ):
                 result.append(link)
 
     return result
-

From cf04533c8743c3e780d9cd40678d9ba7e8302459 Mon Sep 17 00:00:00 2001
From: Yushuhuan <22215064@zju.edu.cn>
Date: Thu, 19 Dec 2024 20:09:50 +0800
Subject: [PATCH 2/2] change function scholarParser (previously named
 schoolarParser) for accurate search

---
 PyPaperBot/HTMLparsers.py |  2 +-
 PyPaperBot/Scholar.py     | 44 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 17 deletions(-)
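
Since the parser's public name changes here, any code importing the old name
has to move to the new one, just as Scholar.py does below (a minimal sketch;
page.html stands in for a saved Scholar results page):

    from PyPaperBot.HTMLparsers import scholarParser  # was: schoolarParser

    with open("page.html", encoding="utf-8") as f:
        papers = scholarParser(f.read())
    # Each entry is a dict with 'title', 'link', 'cites', 'link_pdf',
    # 'year' and 'authors' keys, as assembled in HTMLparsers.py.
    for p in papers:
        print(p['title'], p['year'], p['cites'])
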
"https://"): + url = query to_download = scholar_requests(scholar_pages, url, restrict, scholar_results)