实时文章采集(简单几个打包为exe命令全部源码参考资料多线程并发#lock-objectsPySimpleGUI)

优采云 发布时间: 2022-03-02 03:20

  实时文章采集(简单几个打包为exe命令全部源码参考资料多线程并发#lock-objectsPySimpleGUI)

  最近帮朋友写了一个简单的爬虫。对了,我做了一个小说爬虫工具,有GUI界面,可以从笔趣阁爬取小说。

  开发完成后的界面

  

  采集进程接口

  

  采集之后保存

  

  主要实现的功能是多线程采集,一个线程采集一本小说;支持使用代理,尤其是多线程并发采集时,如果不使用代理 ip,本机 ip 很可能会被目标站点封禁。

  

  实时输出采集结果

  

  使用threading.BoundedSemaphore() pool_sema.acquire() pool_sema.release() 限制线程数,防止并发线程溢出。具体限制可在软件界面输入,默认为5个线程

  

  所有线程任务开始前

pool_sema = threading.BoundedSemaphore(5)

具体每个线程开始前 锁

pool_sema.acquire()

....

# 线程任务执行结束释放

pool_sema.release()

  使用的第三方模块

  pip install requests

pip install pysimplegui

pip install lxml

pip install pyinstaller

  GUI界面使用了一个tkinter包库PySimpleGUI,使用起来非常方便。界面虽然不是很漂亮,但是很简单,很适合开发一些小工具。 pysimplegui.readthedocs.io/en/latest/ 比如这个界面的布局,就是几个简单的列表

  layout = [

[sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', font=("微软雅黑", 12),

key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")],

[sg.Text("小说目录页url,一行一个:")],

[

sg.Multiline('', key="url", size=(120, 6), autoscroll=True, expand_x=True, right_click_menu=['&Right', ['粘贴']]

)

],

[sg.Text(visible=False, text_color="#ff0000", key="error")],

[

sg.Button(button_text='开始采集', key="start", size=(20, 1)),

sg.Button(button_text='打开下载目录', key="opendir",

size=(20, 1), button_color="#999999")

],

[sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@123.1.2.8:8580')],

[

sg.Input('', key="proxy"),

sg.Text('线程数量:'),

sg.Input('5', key="threadnum"),

],

[

sg.Multiline('等待采集', key="res", disabled=True, border_width=0, background_color="#ffffff", size=(

120, 6), no_scrollbar=False, autoscroll=True, expand_x=True, expand_y=True, font=("宋体", 10), text_color="#999999")

],

]

  打包成exe命令

  pyinstaller -Fw start.py

  全部源代码

  import time

import requests

import os

import sys

import re

import random

from lxml import etree

import webbrowser

import PySimpleGUI as sg

import threading

# Default request headers — identify as a desktop Chrome browser so the
# site serves normal pages.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
# Proxy mapping for requests; filled in from the GUI input in main().
proxies = {}
# Base URL of the biquge mirror being scraped.
baseurl = 'https://www.xbiquwx.la/'
# Default number of concurrent download threads (overridden from the GUI).
threadNum = 6
# Bounded semaphore limiting concurrent downloads; created in main().
pool_sema = None
# Event key workers use to push progress text back to the GUI thread.
THREAD_EVENT = '-THREAD-'
# Global "collecting" flag: True while a crawl is running.
cjstatus = False
# Output directory for the downloaded .txt files: ./txt under the cwd.
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
if not os.path.exists(filePath):
    os.mkdir(filePath)

# Strip characters that are unsafe/unwanted in a filename.
def deletetag(text):
    """Remove special punctuation (ASCII and full-width) from *text*.

    Used to sanitise the book title before building the output .txt
    filename; also drops spaces.
    """
    return re.sub(r'[\[\]#\/\\:*\,;\?\"\'\|\(\)《》&\^!~=%\{\}@!:。·!¥……() ]', '', text)

# Entry point: build the GUI window and run its event loop.
def main():
    """Build the PySimpleGUI window and dispatch its events.

    Handles: opening the site in the browser, opening the download folder,
    validating the entered catalogue URLs / proxy / thread count, spawning
    one daemon thread per book (downloadbybook), and relaying worker
    progress messages (THREAD_EVENT) into the result box.
    """
    global cjstatus, proxies, threadNum, pool_sema
    sg.theme("reddit")
    layout = [
        [sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', font=("微软雅黑", 12),
                 key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")],
        [sg.Text("小说目录页url,一行一个:")],
        [
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True,
                         expand_x=True, right_click_menu=['&Right', ['粘贴']])
        ],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='开始采集', key="start", size=(20, 1)),
            sg.Button(button_text='打开下载目录', key="opendir",
                      size=(20, 1), button_color="#999999")
        ],
        [sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@123.1.2.8:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('线程数量:'),
            sg.Input('5', key="threadnum"),
        ],
        [
            sg.Multiline('等待采集', key="res", disabled=True, border_width=0,
                         background_color="#ffffff", size=(120, 6),
                         no_scrollbar=False, autoscroll=True, expand_x=True,
                         expand_y=True, font=("宋体", 10), text_color="#999999")
        ],
    ]
    window = sg.Window('采集笔趣阁小说', layout, size=(800, 500), resizable=True)
    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # user closed the window
            break
        if event == "openwebsite":
            webbrowser.open('%s' % baseurl)
        elif event == 'opendir':
            # Windows-only: open the download folder in Explorer.
            os.system('start explorer ' + filePath)
        elif event == 'start':
            if cjstatus:
                # Second click while running acts as a stop toggle.
                cjstatus = False
                window['start'].update('已停止...点击重新开始')
                continue
            window['error'].update("", visible=False)
            # Keep only lines that look like a catalogue-page URL.
            # BUG FIX: the original did `del urls[k]` while enumerating the
            # same list, which skips elements; build a new list instead.
            valid_urls = []
            for url in values['url'].strip().split("\n"):
                url = url.strip()
                if re.match(r'%s\d+_\d+/' % baseurl, url):
                    valid_urls.append(url)
                elif url:
                    window['error'].update("地址错误:%s" % url, visible=True)
            urls = valid_urls
            if len(urls) < 1:
                # BUG FIX: original referenced the undefined name `baseurlr`.
                window['error'].update(
                    "每行地址需符合 %s84_84370/ 形式" % baseurl, visible=True)
                continue
            # Proxy: the same endpoint is used for http and https traffic.
            if len(values['proxy']) > 8:
                proxies = {
                    "http": "http://%s" % values['proxy'],
                    "https": "http://%s" % values['proxy']
                }
            # Thread count from the GUI (falls back to the module default).
            if values['threadnum'] and int(values['threadnum']) > 0:
                threadNum = int(values['threadnum'])
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('采集中...点击停止')
            window['res'].update('开始采集')
            # One daemon thread per book; pool_sema limits real concurrency.
            for url in urls:
                threading.Thread(target=downloadbybook, args=(url, window),
                                 daemon=True).start()
        elif event == "粘贴":
            window['url'].update(sg.clipboard_get())
        print("event", event)
        if event == THREAD_EVENT:
            # Progress text pushed by a worker via window.write_event_value().
            strtext = values[THREAD_EVENT][1]
            window['res'].update(window['res'].get() + "\n" + strtext)
    cjstatus = False
    window.close()

# Worker thread: download one book (catalogue page + every chapter).
def downloadbybook(page_url, window):
    """Download the book whose catalogue page is *page_url*.

    Fetches the catalogue, extracts the book name and chapter list, then
    downloads each chapter and appends it to <filePath>/<bookname>.txt.
    Progress and errors are reported to the GUI via THREAD_EVENT messages;
    concurrency is limited by the module-level `pool_sema` semaphore.
    Checks the global `cjstatus` flag so the user can stop mid-book.
    """
    try:
        bookpage = requests.get(url=page_url, headers=header, proxies=proxies)
    except Exception as e:
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name,
                         '\n请求 %s 错误,原因:%s' % (page_url, e)))
        return
    if not cjstatus:
        return
    # Acquire a download slot; released in `finally` below.
    # BUG FIX: the original returned early (bad status, etc.) without
    # releasing the semaphore, permanently losing a slot.
    pool_sema.acquire()
    try:
        if bookpage.status_code != 200:
            # BUG FIX: original referenced the undefined name `page.reason`.
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '\n请求%s错误,原因:%s' % (page_url, bookpage.reason)))
            return
        bookpage.encoding = 'utf-8'
        page_tree = etree.HTML(bookpage.text)
        bookname = page_tree.xpath('//div[@id="info"]/h1/text()')[0]
        bookfilename = filePath + '/' + deletetag(bookname) + '.txt'
        zj_list = page_tree.xpath(
            '//div[@class="box_con"]/div[@id="list"]/dl/dd')
        for dd in zj_list:
            if not cjstatus:
                break
            zjurl = page_url + dd.xpath('./a/@href')[0]
            zjname = dd.xpath('./a/@title')[0]
            try:
                zjpage = requests.get(zjurl, headers=header, proxies=proxies)
            except Exception as e:
                # BUG FIX: the original read `zjpage.reason` here, but
                # `zjpage` is unbound when requests.get raises — report
                # the exception itself instead.
                window.write_event_value(
                    '-THREAD-', (threading.current_thread().name,
                                 '\n请求%s:%s错误,原因:%s' % (zjname, zjurl, e)))
                continue
            if zjpage.status_code != 200:
                window.write_event_value(
                    '-THREAD-', (threading.current_thread().name,
                                 '\n请求%s:%s错误,原因:%s' % (zjname, zjurl, zjpage.reason)))
                return
            zjpage.encoding = 'utf-8'
            zjpage_content = etree.HTML(zjpage.text).xpath(
                '//div[@id="content"]/text()')
            content = "\n【" + zjname + "】\n"
            for line in zjpage_content:
                content += line.strip() + '\n'
            # Append mode so successive chapters accumulate in one file.
            with open(bookfilename, 'a+', encoding='utf-8') as fs:
                fs.write(content)
                window.write_event_value(
                    '-THREAD-', (threading.current_thread().name,
                                 '\n%s:%s 采集成功' % (bookname, zjname)))
            # Small random delay to be gentle on the server.
            time.sleep(random.uniform(0.05, 0.2))
        # Whole book finished.
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name,
                         '\n请求 %s 结束' % page_url))
    finally:
        pool_sema.release()

# Script entry guard: run the GUI only when executed directly.
if __name__ == '__main__':
    main()

  参考多线程并发#lock-objectsPySimpleGUI

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线