解决方案:海南网站建设WordPress插件-AutoBlog自动采集插件V1.2

优采云发布时间: 2022-10-01 07:06

　　AutoBlog（自动采集发布插件）是一款出色的插件工具，可以帮助用户采集任意网站中的站点内容，并全自动更新你的WordPress站点，进行文章发布等等。使用方法简单，无需复杂设置，支持wordpress所有特性。

　　通过简单设置可采集来自于任何网站的内容，并可设置多个采集任务同时进行，可设置任务为自动运行或手动运行，主任务列表显示每个采集任务的状况：上次检测采集时间，预计下次检测采集时间，最近采集文章，已采集更新的文章数等信息，方便查看管理。

　　文章管理功能方便查询、搜索、删除已采集文章，改进算法已从根本上杜绝了重复采集相同文章，日志功能记录采集过程中出现的异常和抓取错误，方便检查设置错误以便进行修复。

　　解决方案:Python采集X音用户作品+调用Aria2下载+fire生成命令行+Vue界面

　　X音采集开源仓库

　　介绍

　　Python取数据 +Vue写界面 +Aria2下载

　　根据X音各种链接或各种id，通过网页接口采集视频作品，并下载作品到本地。

　　支持用户主页链接或sec_uid/话题挑战和音乐原声链接或ID。

　　支持下载喜欢列表（需喜欢列表可见）。

　　①2000多本Python电子书（主流和经典的书籍应该都有了）

　　②Python标准库资料（最全中文版）

　　③项目源码（四五十个有趣且经典的练手项目及源码）

　　④Python基础入门、爬虫、web开发、大数据分析方面的视频（适合小白学习）

　　⑤ Python学习路线图（告别不入流的学习）

　　当然在学习Python的道路上肯定会困难，没有好的学习资料，怎么去学习呢？

学习Python中有不明白推荐加入交流Q群号：928946953

群里有志同道合的小伙伴，互帮互助，群里有不错的视频学习教程和PDF！

还有大牛解答！

　　使用

　　0x00 安装依赖

　　在程序目录打开命令行，输入

　　复制代码隐藏代码

pip install -r requirements.txt

　　0x01 使用UI界面

　　双击打开启动.bat，或者在程序目录打开命令行，输入

　　复制代码隐藏代码

python ui.py

　　0x02 直接修改douyin.py中相关参数使用

　　完全不懂Python的朋友用命令行或操作界面。

　　0x03 从命令行使用exec.py

　　直接运行可查看命令列表，或使用-h参数查看帮助

　　复制代码隐藏代码

python exec.py

python exec.py -h

python exec.py download -h

python exec.py download_batch -h

　　使用函数名调用程序

　　复制代码隐藏代码

　　--type 指定下载类型，默认值：--type=user

　　--limit 指定采集数量，默认值：--limit=0（不限制）

　　例如采集某用户全部作品：

　　复制代码隐藏代码

python exec.py download

python exec.py download 用户的secuid

　　例如采集某用户喜欢的前10个作品：

　　复制代码隐藏代码

python exec.py download MS4wLjABAAAAl7TJWjJJrnu11IlllB6Mi5V9VbAsQo1N987guPjctc8 --type=like --limit=10

python exec.py download 用户的secuid

　　例如采集某音乐原声前10个作品：

　　复制代码隐藏代码

python exec.py download --type=music --limit=10

python exec.py download 音乐ID --type=music --limit=10

　　TODO

　　知识点

　　X音相关

　　Aria2相关

　　Python相关

　　命令行模块fire相关

　　UI模块pywebview相关

　　X音采集部分源码

　　复制代码隐藏代码

# -*- encoding: utf-8 -*-

"""

@File : douyin.py

@Time : 2021年03月12日 18:16:57 星期五

@Author : erma0

@Version : 1.0

@Link : https://erma0.cn

@Desc : X音用户作品采集

"""

import json

import os

import time

from urllib.parse import parse_qs, urlparse

import requests

from download import Download

class Douyin(object):
    """Collect video listings from the X音 (Douyin) web API and queue downloads.

    One instance handles one collection task. The task target is given either
    as a link (anything containing ``://``) or as a raw ID (a user ``sec_uid``,
    a challenge ID or a music ID). Collected items accumulate in
    ``self.videosL``; ``download_all`` then hands them to the Aria2 wrapper.
    """

    def __init__(self, param: str, limit: int = 0):
        """Set up the task.

        Args:
            param: A link or a raw ID; surrounding whitespace is stripped.
            limit: Maximum number of items to keep; ``0`` means no limit.
        """
        self.limit = limit
        self.http = requests.Session()
        # Attributes predefined up front so later reads never hit an
        # undefined attribute.
        self.url = ""
        self.id = ""
        self.type = "unknow"
        self.download_path = "暂未定义目录"

        self.param = param.strip()
        self.sign = "TG2uvBAbGAHzG19a.rniF0xtrq"  # the signature can be a fixed value
        self.__get_type()  # decide whether the task was given as a link or an ID
        # The download directory is not passed here; it is prepended to each
        # file name when download tasks are created.
        self.aria2 = Download()
        self.has_more = True
        self.finish = False
        self.videosL = []  # collected items, list form
        self.gids = {}  # maps a download gid to the item's index in videosL

    def __get_type(self):
        """Classify ``self.param`` as a link (resolved to ``self.url``) or an ID."""
        if "://" in self.param:
            self.__url2redirect()
        else:
            self.id = self.param

    def __url2redirect(self):
        """Resolve a short link to its redirect target and store it in ``self.url``.

        Falls back to the original link when the request fails or the response
        carries no ``Location`` header.
        """
        headers = {
            "User-Agent":
            "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1 Edg/89.0.4389.82"
        }
        try:
            r = self.http.head(self.param, headers=headers, allow_redirects=False)
            self.url = r.headers["Location"]
        except (requests.RequestException, KeyError):
            self.url = self.param

    def __url2id(self):
        """Set ``self.id`` to the fourth ``/``-separated piece of the URL path, or ``""``."""
        try:
            self.id = urlparse(self.url).path.split("/")[3]
        except (IndexError, ValueError):
            self.id = ""

    def __url2uid(self):
        """Set ``self.id`` to the ``sec_uid`` query parameter of the URL, or ``""``."""
        try:
            query = urlparse(self.url).query
            self.id = parse_qs(query)["sec_uid"][0]
        except (KeyError, IndexError, ValueError):
            self.id = ""

    def get_sign(self):
        """Return the request signature (a fixed value; no computation is needed)."""
        self.sign = "TG2uvBAbGAHzG19a.rniF0xtrq"
        return self.sign

    def __fetch_info(self, url, key):
        """GET ``url`` and return the dict stored under ``key`` in the JSON reply.

        Returns an empty dict when the request fails, the body is not JSON, or
        the key is missing/empty.
        """
        try:
            res = self.http.get(url).json()
            return res.get(key) or {}
        except (requests.RequestException, ValueError, AttributeError):
            return {}

    def __set_download_path(self, ident, title):
        """Build the download directory name from an identifier, a title and the task type."""
        name = "{}_{}_{}".format(ident, title, self.type)
        # NOTE(review): title2path presumably removes characters that are
        # illegal in file names — confirm against download.py.
        self.download_path = Download.title2path(name)

    def get_user_info(self):
        """Fetch user info into ``self.user_info`` and derive the download directory."""
        if self.url:
            self.__url2uid()
        url = "https://www.iesdouyin.com/web/api/v2/user/info/?sec_uid=" + self.id
        self.user_info = self.__fetch_info(url, "user_info")
        self.__set_download_path(self.user_info.get("short_id", "0"),
                                 self.user_info.get("nickname", "无昵称"))

    def get_challenge_info(self):
        """Fetch challenge info into ``self.challenge_info`` and derive the download directory."""
        if self.url:
            self.__url2id()
        url = "https://www.iesdouyin.com/web/api/v2/challenge/info/?ch_id=" + self.id
        self.challenge_info = self.__fetch_info(url, "ch_info")
        self.__set_download_path(self.challenge_info.get("cid", "0"),
                                 self.challenge_info.get("cha_name", "无标题"))

    def get_music_info(self):
        """Fetch music info into ``self.music_info`` and derive the download directory."""
        if self.url:
            self.__url2id()
        url = "https://www.iesdouyin.com/web/api/v2/music/info/?music_id=" + self.id
        self.music_info = self.__fetch_info(url, "music_info")
        self.__set_download_path(self.music_info.get("mid", "0"),
                                 self.music_info.get("title", "无标题"))

    def crawling_users_post(self):
        """Collect the items a user has posted."""
        self.type = "post"
        self.__crawling_user()

    def crawling_users_like(self):
        """Collect the items a user has liked."""
        self.type = "like"
        self.__crawling_user()

    def crawling_challenge(self):
        """Collect the items listed under a challenge."""
        self.type = "challenge"
        self.get_challenge_info()  # also sets the download directory
        url = "https://www.iesdouyin.com/web/api/v2/challenge/aweme/"
        params = {
            "ch_id": self.id,
            "count": "21",  # page size; the web page itself uses 9
            "cursor": "0",
            "aid": "1128",
            "screen_limit": "3",
            "download_click_limit": "0",
            "_signature": self.sign,
        }
        self.__crawl_pages(url, params, "cursor", "话题挑战")

    def crawling_music(self):
        """Collect the items that use a given music track."""
        self.type = "music"
        self.get_music_info()  # also sets the download directory
        url = "https://www.iesdouyin.com/web/api/v2/music/list/aweme/"
        params = {
            "music_id": self.id,
            "count": "21",  # page size; the web page itself uses 9
            "cursor": "0",
            "aid": "1128",
            "screen_limit": "3",
            "download_click_limit": "0",
            "_signature": self.sign,
        }
        self.__crawl_pages(url, params, "cursor", "音乐原声")

    def __crawling_user(self):
        """Collect a user's posts or likes, depending on ``self.type``."""
        self.get_user_info()  # also sets the download directory
        url = "https://www.iesdouyin.com/web/api/v2/aweme/{}/".format(self.type)
        params = {
            "sec_uid": self.id,
            "count": "21",
            "max_cursor": 0,
            "aid": "1128",
            "_signature": self.sign,
            "dytk": "",
        }
        self.__crawl_pages(url, params, "max_cursor", "作品")

    def __crawl_pages(self, url, params, cursor_key, label):
        """Page through a listing endpoint until it is exhausted or a request fails.

        Args:
            url: Listing endpoint.
            params: Query parameters, including the initial cursor value.
            cursor_key: Name of the cursor field, used both in the query and
                in the JSON reply.
            label: Prefix for the progress messages.
        """
        while self.has_more:
            try:
                res = self.http.get(url, params=params).json()
                next_cursor = res[cursor_key]
                more = res["has_more"]
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Repeating the same request without delay would loop forever
                # on a persistent failure, so stop paging instead.
                print("{}采集出错".format(label))
                break
            params[cursor_key] = next_cursor
            # has_more is stored before the page is processed so that the
            # limit check is able to switch it off afterwards.
            self.has_more = more
            self.__append_videos(res)
        print("{}采集完成".format(label))

    def __append_videos(self, res):
        """Append the items of one response page to ``self.videosL``.

        Items missing an expected field are skipped. After the page is
        processed the configured limit is applied.
        """
        items = res.get("aweme_list")
        if not items:
            # Reached when a page comes back without any items.
            print("未采集完成，但返回作品列表为空")
            return
        for item in items:
            try:
                info = dict(item["statistics"])
                info.pop("forward_count", None)
                info.pop("play_count", None)
                video = item["video"]
                play_addr = video["play_addr"]
                info["uri"] = play_addr["uri"]
                info["play_addr"] = play_addr["url_list"][0]
                info["dynamic_cover"] = video["dynamic_cover"]["url_list"][0]
                raw_desc = item["desc"]
            except (KeyError, IndexError, TypeError, ValueError):
                continue  # malformed item: skip it, keep the rest of the page
            info["desc"] = Download.title2path(raw_desc)
            # Download progress: 0 = waiting, 0.xx = in progress, 1 = finished.
            info["status"] = 0
            self.videosL.append(info)
        # Download tasks are deliberately not created here: downloading while
        # collecting would compete for bandwidth.
        self.__trim_to_limit()

    def __trim_to_limit(self):
        """Cut ``self.videosL`` down to ``self.limit`` items and stop paging once reached."""
        if self.limit > 0 and len(self.videosL) >= self.limit:
            self.has_more = False
            self.videosL = self.videosL[:self.limit]

    def download_all(self):
        """Create one download task per collected item.

        The gid returned for each task is recorded in ``self.gids`` together
        with the item's index in ``self.videosL``, so callers can relate task
        status updates back to items.
        """
        for index, video in enumerate(self.videosL):
            filename = "{}/{}_{}.mp4".format(self.download_path, video["aweme_id"],
                                             video["desc"])
            gid = self.aria2.download(url=video["play_addr"], filename=filename)
            # A caller-chosen gid would have to be exactly 16 characters, so
            # the generated one is kept and mapped to the index instead.
            self.gids[gid] = index
        print("下载任务投递完成")

0

2022-10-01

采集文章自动发布

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

解决方案:海南网站建设WordPress插件-AutoBlog自动采集插件V1.2

0 个评论

发起人

AI时代内容工厂

解决方案:海南网站建设WordPress插件-AutoBlog自动采集插件V1.2

0 个评论

发起人

相关问题