Free proxies are unstable: switch proxies on reconnect (proxy scraper download link attached)
优采云 Published: 2021-08-11 21:19
1. Proxy file format (ip.txt, as produced by the proxy scraper):
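The format below is inferred from how the script parses ip.txt (line.split(), then fields [0] = IP, [1] = port, [2] = scheme), so treat it as an assumption; the address shown is the fallback proxy hard-coded later in the script, reused purely for illustration:

124.172.232.49 8010 http

Blank lines are skipped, but every other non-empty line is treated as a proxy entry, so keep the file free of comments and headers.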
2. Free proxies are unreliable, so a decorator reconnects and switches to another proxy on failure; the full script below implements this, and a distilled sketch of just the decorator follows it.
# coding: utf-8
# python 2.7
# 小说棋 single-novel scraper, http://www.xs7.la/
# Replace the first-chapter URL and the total chapter count below.
# ip.txt holds the proxy pool.
import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random
import gzip
import StringIO  # needed to feed the raw response bytes to gzip below
# Python 2 idiom: allow implicit utf-8 conversion between str and unicode.
reload(sys)
sys.setdefaultencoding('utf-8')
f = open("out.txt", "a+")
headers = {
    "Host": "www.xs7.la",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Content-Type": "text/html",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "http://www.xs7.la/book/18_18966/",
    "Accept-Encoding": "identity",  # ask for an uncompressed body; gzip is still handled below
}
url = "http://www.xs7.la/book/18_18966/7828246.html"  # first-chapter URL
page = 184  # total number of chapters
nextHref = url
ipPool = []
def IPpool():
    # Load the proxy pool from ip.txt, one "IP PORT SCHEME" entry per line.
    reader = open('ip.txt')
    line = reader.readline()
    while line:
        if line.strip() != '':
            ipPool.append(line.split())
        line = reader.readline()
    reader.close()
RETRIES = 0
# retry counter (incremented by the decorator below)
count = {"num": RETRIES}
def conn_try_again(function):
    # Decorator: on any exception, retry up to 10 more times (11 attempts in
    # total). Each retry re-enters the wrapped function, which picks a fresh
    # random proxy from the pool, so a dead proxy gets switched out.
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception, err:
            print("--retrying, attempt %s (max 11)--" % (count['num'] + 1))
            if count['num'] < 10:
                count['num'] += 1
                return wrapped(*args, **kwargs)
            else:
                raise Exception(err)
    return wrapped
bsObj = None
# Detect the response encoding.
def getCoding(strInput):
    '''Return "unicode", "utf8" or "gbk"; None if none of them fits.'''
    if isinstance(strInput, unicode):
        return "unicode"
    try:
        strInput.decode("utf8")
        return 'utf8'
    except:
        pass
    try:
        strInput.decode("gbk")
        return 'gbk'
    except:
        pass

@conn_try_again
def getContent(url):
    global nextHref, page, bsObj
    # Proxy switch: True = pick a random proxy from the pool,
    # False = use the fixed fallback proxy below.
    proxySwitch = True
    try:
        poolLen = len(ipPool)
        if (poolLen > 0):
            i = random.randint(0, poolLen - 1)
            print(ipPool[i])
            proxy_host = ipPool[i][2] + "://" + ipPool[i][0] + ":" + ipPool[i][1]
            proxy_temp = {ipPool[i][2]: proxy_host}
            proxy_support = urllib2.ProxyHandler(proxy_temp)
        else:
            print('--proxy pool is empty, falling back to the local address--')
            proxy_support = urllib2.ProxyHandler({})
        nullproxy_handler = urllib2.ProxyHandler({"http": "124.172.232.49:8010"})
        if proxySwitch:
            opener = urllib2.build_opener(proxy_support)
        else:
            opener = urllib2.build_opener(nullproxy_handler)
        urllib2.install_opener(opener)
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req, timeout=3)
        r = response.read()
        encode = getCoding(r)
        if (encode == None):
            # Encoding not recognised: the body is most likely gzip-compressed.
            if (response.info().get('Content-Encoding') == 'gzip'):
                r = gzip.GzipFile(fileobj=StringIO.StringIO(r)).read()
                encode = getCoding(r)
        if (encode in ('utf8', 'gbk')):
            r = r.decode(encode)
        bsObj = BeautifulSoup(r, 'lxml')
    except Exception, err:
        raise Exception(err)
    contentDiv = bsObj.find('div', id='content')
    content = contentDiv.get_text()
    preAndNextBar = bsObj.find('div', id='thumb')
    title = bsObj.find('div', id='bgdiv').h1.get_text()
    # "下一章" is the site's "next chapter" link text.
    if ("下一章" in preAndNextBar.get_text()):
        next = None
        aList = preAndNextBar.findAll('a')
        for i in aList:
            if ("下一章" in i.get_text()):
                next = i
        if (next == None):
            print("next-chapter link is empty")
            return True
        nextHref = next.get('href')
        print(title)
        print(nextHref)
        f.write("#####" + '\n')
        f.write(title + '\n')
        f.write(content + '\n')
        count['num'] = 0  # reset the retry counter after a successful fetch
    else:
        return True
def main():
    IPpool()
    global page
    try:
        for num in range(1, page):
            if (getContent(nextHref)):
                break
        print("--- end ---")
    except Exception, e:
        traceback.print_exc()
    finally:
        f.close()
main()
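Distilled from the script above, here is a minimal, self-contained sketch of the pattern (still Python 2.7): the decorator retries on any exception, and because the wrapped function installs a fresh random proxy on every call, each retry automatically switches proxies. PROXIES, state and fetch are illustrative names introduced here, not part of the original script.

# coding: utf-8
# Minimal sketch of the retry-and-switch pattern; placeholder names only.
import random
import urllib2

PROXIES = [
    {"http": "http://124.172.232.49:8010"},  # fill in from your own pool
]
MAX_RETRIES = 10
state = {"num": 0}

def conn_try_again(function):
    # On failure, re-enter the wrapped function; since fetch() builds a new
    # opener with a random proxy each call, every retry uses a different exit.
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception:
            if state["num"] < MAX_RETRIES:
                state["num"] += 1
                return wrapped(*args, **kwargs)
            raise
    return wrapped

@conn_try_again
def fetch(url):
    opener = urllib2.build_opener(urllib2.ProxyHandler(random.choice(PROXIES)))
    html = opener.open(url, timeout=3).read()
    state["num"] = 0  # success: reset so failures do not accumulate
    return html

Note that the counter is shared module state, which is why the full script resets it after each successful chapter (count['num'] = 0); without that reset, failures would accumulate across chapters and exhaust the retry budget early.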
Attachment: proxy scraper
Download link: