From 49c69beee41baf04f0406038b15a8a3856e7d120 Mon Sep 17 00:00:00 2001
From: wonder
Date: Tue, 14 Nov 2017 12:38:50 +0800
Subject: [PATCH 1/2] Add proxy pool, fix 503 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .DS_Store      | Bin 12292 -> 10244 bytes
 pdf/README.md  |  4 +++-
 pdf/crawler.py | 48 ++++++++++++++++++++++++++++++++++--------------
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index f3a3c14ff2454934c161065105a70413c9c61b89..2e66cd579d1f2a28143ab1c3a4a7524376052e3f 100644
GIT binary patch
(binary delta omitted)

diff --git a/pdf/crawler.py b/pdf/crawler.py
@@ -51,7 +49,26 @@ def request(url, **kwargs):
         Make an HTTP request and return the response object.
         :return:
         """
-        response = requests.get(url, **kwargs)
+        with open('proxies.txt', 'r') as fp:
+            ips = fp.readlines()
+        proxys = list()
+        for p in ips:
+            ip = p.strip('\n').split('\t')
+            pro = dict()
+            pro['https'] = ip[0] + ':' + ip[1]
+            proxys.append(pro)
+
+        headers = {
+            'user-agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
+        while True:
+            try:
+                response = requests.get(url, headers=headers, proxies=random.choice(proxys), timeout=2)
+                if response.status_code != 200: continue
+                break
+            except Exception as e:
+                print(e)
+
+        print("response:", response)
         return response

     def parse_menu(self, response):
@@ -70,6 +87,7 @@ def parse_body(self, response):

     def run(self):
         start = time.time()
+
         options = {
             'page-size': 'Letter',
             'margin-top': '0.75in',
@@ -93,7 +111,6 @@ def run(self):
             with open(f_name, 'wb') as f:
                 f.write(html)
             htmls.append(f_name)
-
         pdfkit.from_file(htmls, self.name + ".pdf", options=options)
         for html in htmls:
             os.remove(html)
@@ -112,10 +129,12 @@ def parse_menu(self, response):
         :param response: the response object returned by the crawler
         :return: a generator of URLs
         """
-        soup = BeautifulSoup(response.content, "html.parser")
-        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
-        for li in menu_tag.find_all("li"):
-            url = li.a.get("href")
+        soup = BeautifulSoup(response.text, "html.parser")
+        menu_tag = soup.find_all('ul', class_="uk-nav uk-nav-side")[1]
+        # print(menu_tag.find_all('a')[0].get("href"))
+        for li in menu_tag.find_all("a"):
+            url = li.get("href")
+            # print(li)
             if not url.startswith("http"):
                 url = "".join([self.domain, url])  # expand to an absolute URL
             yield url
@@ -127,9 +146,10 @@ def parse_body(self, response):
         :return: the processed html text
         """
         try:
-            soup = BeautifulSoup(response.content, 'html.parser')
-            body = soup.find_all(class_="x-wiki-content")[0]
+            soup = BeautifulSoup(response.text, 'html.parser')
+            body = soup.find_all(class_="x-wiki-content x-main-content")[0]
+
+            print("body:", body)
             # add the title, centered
             title = soup.find('h4').get_text()
             center_tag = soup.new_tag("center")
@@ -158,6 +178,6 @@ def func(m):

 if __name__ == '__main__':
-    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
+    start_url = "https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
     crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
     crawler.run()
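Note that the request() rewrite above retries forever when every proxy in proxies.txt has gone stale, since the while True loop only exits after a 200 response. A minimal sketch of a bounded variant (the fetch_with_proxies name, the max_tries cap, and the final RuntimeError are illustrative assumptions, not part of the patch):

    import random
    import requests

    def fetch_with_proxies(url, proxys, headers, max_tries=10):
        # Try up to max_tries randomly chosen proxies, then fail loudly
        # instead of spinning forever.
        for _ in range(max_tries):
            try:
                response = requests.get(url, headers=headers,
                                        proxies=random.choice(proxys), timeout=2)
                if response.status_code == 200:
                    return response
            except requests.RequestException as e:
                print(e)
        raise RuntimeError('no working proxy for ' + url)
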
From bbd839ec75a3a71ce7873d111b8621a40733fc2b Mon Sep 17 00:00:00 2001
From: GazEoD
Date: Tue, 14 Nov 2017 12:54:02 +0800
Subject: [PATCH 2/2] Add proxy pool, fix 503 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pdf/censor.py    |  21 ++++++++++
 pdf/get_proxy.py |  22 +++++++++++
 pdf/host.txt     | 100 +++++++++++++++++++++++++++++++++++++++++++++++
 pdf/proxies.txt  |  14 +++++++
 4 files changed, 157 insertions(+)
 create mode 100644 pdf/censor.py
 create mode 100644 pdf/get_proxy.py
 create mode 100644 pdf/host.txt
 create mode 100644 pdf/proxies.txt

diff --git a/pdf/censor.py b/pdf/censor.py
new file mode 100644
index 0000000..b946d17
--- /dev/null
+++ b/pdf/censor.py
@@ -0,0 +1,21 @@
+import requests
+
+# Re-test each harvested proxy against https://www.baidu.com and keep
+# only the ones that still answer within the timeout.
+fpr = open('host.txt', 'r')
+fpw = open('proxies.txt', 'w')
+
+ips = fpr.readlines()
+for p in ips:
+    ip = p.strip('\n').split('\t')
+    pro = dict()
+    pro['https'] = ip[0] + ':' + ip[1]
+    print(pro)
+    try:
+        response = requests.get('https://www.baidu.com', proxies=pro, timeout=2)
+        print(response)
+        fpw.write(p)
+    except Exception as e:
+        print(e)
+fpr.close()
+fpw.close()
diff --git a/pdf/get_proxy.py b/pdf/get_proxy.py
new file mode 100644
index 0000000..a9c5a63
--- /dev/null
+++ b/pdf/get_proxy.py
@@ -0,0 +1,22 @@
+from urllib import request
+from bs4 import BeautifulSoup
+
+fp = open('host.txt', 'w')
+for i in range(1, 3):
+    url = 'http://www.xicidaili.com/wn/' + str(i)
+    opener = request.build_opener()
+    opener.addheaders = [('User-Agent',
+                          'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
+    request.install_opener(opener)
+    response = request.urlopen(url)
+    soup = BeautifulSoup(response.read(), 'html.parser')
+    rows = soup.find_all(class_='odd')  # "rows" rather than shadowing the builtin "list"
+    for elem in rows:
+        data = elem.find_all('td')
+        ip = data[1].string
+        port = data[2].string
+        fp.write(ip)
+        fp.write('\t')
+        fp.write(port)
+        fp.write('\n')
+fp.close()
diff --git a/pdf/host.txt b/pdf/host.txt
new file mode 100644
index 0000000..aa7b20b
--- /dev/null
+++ b/pdf/host.txt
@@ -0,0 +1,100 @@
+58.62.86.245 9999
+61.152.230.26 8080
+171.104.132.28 9999
+112.74.94.142 3128
+139.224.24.26 8888
+122.72.18.34 80
+111.85.15.166 8080
+122.72.18.61 80
+119.90.63.3 3128
+27.44.170.236 9999
+116.52.224.81 9999
+119.114.229.137 80
+180.76.134.106 3128
+116.211.88.90 3128
+116.236.151.166 8080
+183.135.251.12 30291
+125.112.194.189 37301
+183.95.22.121 53281
+182.246.209.199 80
+222.132.145.126 80
+202.201.3.121 3128
+222.78.116.105 26956
+49.88.168.150 31329
+139.208.198.170 8118
+113.87.88.103 9797
+183.135.249.107 42411
+59.63.74.254 4362
+49.85.13.107 42332
+171.212.140.116 8118
+183.51.191.234 9797
+113.76.96.91 9797
+123.185.129.93 8080
+223.241.79.48 8010
+39.88.13.3 53281
+27.46.39.194 9797
+115.203.196.36 22221
+175.171.108.227 53281
+218.29.111.106 9999
+113.65.160.146 9797
+120.76.55.49 8088
+183.38.61.213 9999
+59.60.168.219 29377
+223.241.117.50 8010
+115.221.117.220 22628
+117.90.111.6 45614
+114.234.80.152 29786
+218.73.139.165 25933
+182.34.50.184 42887
+183.135.253.23 46668
+182.88.126.216 9797
+113.89.13.153 9999
+1.194.162.92 39615
+60.179.40.138 27739
+175.147.66.112 8080
+14.211.119.11 9797
+113.121.251.55 31353
+14.211.123.143 9797
+114.239.222.194 44127
+115.203.194.104 38774
+115.230.62.194 31772
+223.241.116.93 8010
+59.38.61.207 9797
+110.72.34.144 8123
+59.40.68.44 8010
+223.241.119.91 8010
+125.126.172.225 23107
+59.40.50.197 8010
+113.121.170.239 25683
+223.241.116.115 8010
+27.44.162.195 9999
+183.189.114.218 80
+106.113.242.113 9999
+110.72.26.118 8123
+118.254.153.227 3128
+218.20.54.92 9999
+120.42.124.17 48795
+223.223.203.30 8080
+111.76.64.44 4392
+183.148.87.90 21140
+221.198.105.220 8118 +223.241.78.169 8010 +113.89.15.143 9999 +182.42.45.36 808 +14.211.122.146 9797 +202.105.111.162 9000 +120.43.230.31 47224 +115.202.80.0 28202 +119.136.199.74 808 +59.40.50.231 8010 +223.241.117.25 8010 +180.115.11.207 38367 +223.241.117.196 8010 +115.215.51.218 808 +14.29.84.50 8080 +111.76.175.214 4323 +27.44.159.41 9797 +115.46.68.40 8123 +171.39.41.249 8123 +121.31.71.193 8123 +60.160.185.213 7654 diff --git a/pdf/proxies.txt b/pdf/proxies.txt new file mode 100644 index 0000000..37f10d4 --- /dev/null +++ b/pdf/proxies.txt @@ -0,0 +1,14 @@ +58.62.86.245 9999 +122.72.18.34 80 +111.85.15.166 8080 +122.72.18.61 80 +116.52.224.81 9999 +116.236.151.166 8080 +183.95.22.121 53281 +202.201.3.121 3128 +113.87.88.103 9797 +27.46.39.194 9797 +120.76.55.49 8088 +182.88.126.216 9797 +14.211.119.11 9797 +14.211.123.143 9797
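
Taken together, the workflow is: get_proxy.py scrapes candidate proxies from xicidaili.com into host.txt, censor.py re-tests them and writes the survivors to proxies.txt, and crawler.py's request() draws from that pool. Both censor.py and request() repeat the same tab-separated parsing; a minimal sketch of a shared helper (load_proxies is a hypothetical name, not in the patch):

    def load_proxies(path='proxies.txt'):
        # Parse one "ip<TAB>port" pair per line into the
        # {'https': 'ip:port'} mapping that requests expects.
        proxys = []
        with open(path, 'r') as fp:
            for line in fp:
                ip, port = line.strip('\n').split('\t')
                proxys.append({'https': ip + ':' + port})
        return proxys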