网页音频抓取(Mozilla/5.0(WindowsNT)#递归删除目录树)

优采云 发布时间: 2022-02-18 02:03

  网页音频抓取(Mozilla/5.0(WindowsNT)#递归删除目录树)

  # NOTE(review): reconstructed from a machine-translated line; asyncio is not
  # used by the visible code -- confirm against the original listing.
  import asyncio
  import os
  import random
  import shutil
  import time

  import requests
  from bs4 import BeautifulSoup
  from lxml import etree

  # Root directory that receives one sub-directory per scraped channel.
  filePath = "D:\\temp_ximalaya_audio"

  # Module-level placeholder; get_url() builds its own per-channel path.
  channelFilePath = ""

  # Initialise the download root. Whatever already exists at filePath is
  # removed first, so every run starts from an empty directory.
  if os.path.isdir(filePath):
      shutil.rmtree(filePath)  # recursively delete the directory tree
  elif os.path.isfile(filePath):
      os.remove(filePath)  # delete the file
  os.makedirs(filePath)  # create the directory

  #mongodb

  #clients = pymongo.MongoClient('localhost')

  #db = clients["XiMaLaYa"]

  #col1 = db["album2"]

  #col2 = db["detail2"]

  # Pool of browser User-Agent strings; the header dicts pick one entry with
  # random.choice(). Entries are repeated in the listing and are kept as-is so
  # the selection weights stay unchanged.
  UA_LIST = [
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
      "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
      "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
      "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
  ]

  # Request headers sent by get_url() when it fetches a start page.
  # The User-Agent is drawn from UA_LIST once, when this dict is built, and is
  # then reused for every request made with these headers.
  headers1 = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, sdch',
      'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
      'Cache-Control': 'max-age=0',
      'Proxy-Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': random.choice(UA_LIST)
  }

  # Request headers sent by another() and get_m4a() for channel pages.
  # Same fields as the start-page headers plus an (empty) Referer. The
  # User-Agent is drawn from UA_LIST once, when this dict is built.
  headers2 = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate, sdch',
      'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
      'Cache-Control': 'max-age=0',
      'Proxy-Connection': 'keep-alive',
      'Referer': '',
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': random.choice(UA_LIST)
  }

  def get_url():
      """Fetch each start page and process every album entry found on it.

      For every element with class ``albumfaceOutter`` the function collects
      the album link, title and cover URL, (re)creates a directory named after
      the title under ``filePath``, and passes that directory together with
      the album link to ``another()``. Progress is traced with ``print``.
      """
      # start_urls = ['{}'.format(num) for num in range(1, 85)]
      # NOTE(review): the URL literal is empty in this listing (it looks like
      # it was stripped when the page was published) -- fill in the real
      # start URL before running.
      start_urls = [""]
      print(start_urls)
      for start_url in start_urls:
          print(start_url)
          print("==================开始 html===============")
          html = requests.get(start_url, headers=headers1).text
          print("html = {}".format(html))
          print("================end html===============")
          print("================开始汤=============")
          soup = BeautifulSoup(html, 'lxml')
          print(soup)
          print("================结束汤============")
          for item in soup.find_all(class_="albumfaceOutter"):
              print("================开始项目================")
              print(item)
              print("================结束项目========================= =")
              print("================开始内容================")
              content = {
                  'href': item.a['href'],
                  'title': item.img['alt'],
                  'img_url': item.img['src']
              }
              print(content)
              print("==================结束内容======================= ===")
              # col1.insert(content)
              print('写一个频道' + item.a['href'])
              subchannel = item.a['href']
              print("============开始子频道=====================")
              print(subchannel)
              subchannelArr = subchannel.split("/")
              print(subchannelArr)
              # channelFilePath = subchannelArr[len(subchannelArr) - 2]
              # The channel directory is named after the album title.
              channelFilePath = content['title']
              print(channelFilePath)
              channelFilePath = filePath + os.sep + channelFilePath
              print(channelFilePath)
              # Start each channel from an empty directory.
              if os.path.isdir(channelFilePath):
                  shutil.rmtree(channelFilePath)  # recursively delete the directory tree
              elif os.path.isfile(channelFilePath):
                  os.remove(channelFilePath)  # delete the file
              os.makedirs(channelFilePath)  # create the directory
              print("============结束子频道=====================")
              print(content)
              another(channelFilePath, item.a['href'])
          # Pause between start pages.
          time.sleep(1)

  def another(channelFilePath, url):
      """Process one channel page and, when it is paginated, its numbered pages.

      Args:
          channelFilePath: directory associated with this channel; forwarded
              unchanged to ``get_m4a``.
          url: address of the channel page.
      """
      print("========================开始另一个html================= === =====")
      html = requests.get(url, headers=headers2).text
      print(html)
      print("========================结束另一个html================= === =====")
      print("=========================开始另一个 ifanother================ === =====")
      # data-page of the second-to-last link in the paging bar; empty list
      # when the page has no paging bar.
      ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
      print(ifanother)
      print("========================结束另一个 ifanother================= === =====")
      if len(ifanother):
          num = ifanother[0]
          print('这个频道资源存在' + num + 'pages')
          # NOTE(review): range(1, int(num)) stops at num - 1, so the page
          # numbered `num` is never requested here -- confirm this is intended.
          for n in range(1, int(num)):
              print('开始解析{}中的第{}页'.format(num, n))
              url2 = url + '?page={}'.format(n)
              print(url)
              print(url2)
              get_m4a(channelFilePath, url2)
      # The un-paged URL is always processed as well. get_m4a takes the
      # channel directory first, so it has to be passed along here too.
      get_m4a(channelFilePath, url)

  def get_m4a(channelFilePath, url):
      """Read the list of sound ids from a channel page.

      Args:
          channelFilePath: directory associated with the channel. Not used by
              the part of the function visible in this listing.
          url: address of the channel page to fetch.
      """
      time.sleep(1)  # pause before each page request
      html = requests.get(url, headers=headers2).text
      print("==============开始get_m4a======================")
      sound_ids = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')
      if not sound_ids:
          # No personal_body div with a sound_ids attribute: nothing to process.
          print("no sound_ids found on {}".format(url))
          return
      numlist = sound_ids[0].split(',')
      print(numlist)
      print("==============end get_m4a======================")
      # NOTE(review): the listing ends inside this loop; the per-id fetch and
      # save step appears to be missing and has not been reconstructed.
      for i in numlist:
          print("==============开始 get_m4a murl======================")

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线