网页音频抓取(Mozilla/5.0(WindowsNT)#递归删除目录树)
优采云 发布时间: 2022-02-18 02:03网页音频抓取(Mozilla/5.0(WindowsNT)#递归删除目录树)
import asyncio
import os
import random
import shutil
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Root directory that every channel sub-directory is created under.
filePath = "D:\\temp_ximalaya_audio"
# Module-level placeholder; get_url() assigns its own local of the same name.
channelFilePath = ""

# Initialise the output directory: whatever currently occupies the path
# (a directory tree or a plain file) is removed so each run starts empty.
if os.path.isdir(filePath):
    shutil.rmtree(filePath)  # recursively delete the directory tree
elif os.path.isfile(filePath):
    os.remove(filePath)  # delete the file
os.makedirs(filePath)  # create the directory
#mongodb
#clients = pymongo.MongoClient('localhost')
#db = clients["XiMaLaYa"]
#col1 = db["album2"]
#col2 = db["detail2"]
# Pool of browser User-Agent strings; one is picked at random when the
# request header dictionaries are built.
# The entries must stay pure ASCII: HTTP header values that contain other
# characters cannot be encoded when the request is sent.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# Request headers used by get_url() for the listing pages.
# The User-Agent is drawn from UA_LIST once, when this dictionary is built,
# so every request made with these headers shares the same value.
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}
# Request headers used by another() and get_m4a() for channel pages.
# Same as the listing-page headers plus an (empty) Referer entry. The
# User-Agent is drawn from UA_LIST once, when this dictionary is built.
headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': '',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}
def get_url():
    """Crawl each listing page and hand every channel found to another().

    For each URL in ``start_urls`` the page is fetched with ``headers1`` and
    parsed with BeautifulSoup. Every element with class ``albumfaceOutter``
    yields a channel record (link, title, cover image). A directory named
    after the channel title is (re)created under ``filePath`` and the channel
    link is passed to ``another`` together with that directory.

    Intermediate values are printed for debugging.
    """
    # NOTE(review): the listing URL is empty here, so requests.get() will
    # reject it. An earlier version built the list with
    # '{}'.format(num) for num in range(1, 85); the URL template itself is
    # not present in this file and must be supplied before running.
    start_urls = [""]
    print(start_urls)
    for start_url in start_urls:
        print(start_url)
        print("==================开始 html===============")
        html = requests.get(start_url, headers=headers1).text
        print("html = {}".format(html))
        print("================end html===============")
        print("================开始汤=============")
        soup = BeautifulSoup(html, 'lxml')
        print(soup)
        print("================结束汤============")
        for item in soup.find_all(class_="albumfaceOutter"):
            print("================开始项目================")
            print(item)
            print("================结束项目========================= =")
            print("================开始内容================")
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            print(content)
            print("==================结束内容======================= ===")
            # col1.insert(content)  -- MongoDB persistence is disabled
            print('写一个频道' + item.a['href'])
            subchannel = item.a['href']
            print("============开始子频道=====================")
            print(subchannel)
            # Only printed for inspection; the directory name comes from
            # the title, not from the URL segments.
            subchannelArr = subchannel.split("/")
            print(subchannelArr)
            # This assignment creates a local name; the module-level
            # channelFilePath is not modified.
            channelFilePath = content['title']
            print(channelFilePath)
            channelFilePath = filePath + os.sep + channelFilePath
            print(channelFilePath)
            # Start from an empty directory for this channel.
            if os.path.isdir(channelFilePath):
                shutil.rmtree(channelFilePath)  # recursively delete the directory tree
            elif os.path.isfile(channelFilePath):
                os.remove(channelFilePath)  # delete the file
            os.makedirs(channelFilePath)  # create the directory
            print("============结束子频道=====================")
            print(content)
            another(channelFilePath, item.a['href'])
        # Pause between listing pages.
        time.sleep(1)
def another(channelFilePath, url):
    """Walk the pages of one channel and pass each to get_m4a().

    The channel page at ``url`` is fetched with ``headers2``. The
    ``data-page`` attribute of the second-to-last link inside the
    ``pagingBar_wrapper`` div is read as the page count; when present,
    ``url?page=n`` is processed for each ``n`` in ``range(1, count)``.
    The bare ``url`` is always processed afterwards.

    Args:
        channelFilePath: directory for this channel, forwarded to get_m4a.
        url: address of the channel page.
    """
    print("========================开始另一个html================= === =====")
    html = requests.get(url, headers=headers2).text
    print(html)
    print("========================结束另一个html================= === =====")
    print("=========================开始另一个 ifanother================ === =====")
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    print(ifanother)
    print("========================结束另一个 ifanother================= === =====")
    if ifanother:
        num = ifanother[0]
        print('这个频道资源存在' + num + 'pages')
        # NOTE(review): range(1, int(num)) stops before page `num`; if
        # data-page is the last page number, that page is never fetched,
        # while page 1 is likely fetched again by the call below. Confirm
        # against the site's paging before changing the bounds.
        for n in range(1, int(num)):
            print('开始解析{}中的第{}页'.format(num, n))
            url2 = url + '?page={}'.format(n)
            print(url)
            print(url2)
            get_m4a(channelFilePath, url2)
    # get_m4a takes (channelFilePath, url); both must be passed.
    get_m4a(channelFilePath, url)
def get_m4a(channelFilePath, url):
时间.sleep(1)
html = requests.get(url,headers=headers2).text
print("==============开始get_m4a======================")
numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
打印(数字列表)
print("==============end get_m4a======================")
对于iin numlist:
print("==============开始 get_m4a murl======================")