网页视频抓取脚本(之前博客>博客博客博客岂止是换页博客)
优采云 发布时间: 2021-12-05 06:03网页视频抓取脚本(之前博客>博客博客博客岂止是换页博客)
由于51cto网站升级,课程列表页面改用javascript加载,张耀老师之前的网页抓取脚本已经失效。
笔者发现视频课程页面右侧的列表是静态的视频课程地址,于是修改了老师最初版本的脚本,改为抓取视频课程页面,结果正常。现将修改后的脚本加上一些注释发布出来,供大家研究。
<p>[root@m01 scripts]# cat html_to_table.sh
#!/bin/bash
# oldboy linux training
# 2016-11-13
# 基于老男孩linux21期学员张耀开发脚本
#
# Scratch files used while processing the downloaded page.
EduFile=/tmp/edu.html   # process temp file 1
EduFile2=/tmp/edu2.html # process temp file 2
# All command-line arguments joined into one string; used as the page URL.
Url="$*"
# Check for given parameters: at least one argument (the URL) is required.
if [ $# -eq 0 ]; then
  echo "USAGE: /bin/sh $0 http://...."
  exit 1
fi
# Judge url is ok? Probe it with a header-only request (curl -I) and discard
# all output; only curl's exit status matters here.
# "$Url" is quoted so that characters such as '?' or '*' in the URL are not
# subject to pathname expansion or word splitting.
if ! curl -I "$Url" &>/dev/null; then
  echo "Bad url,Please check it"
  exit 1
fi
# Define the functions that get pagenum and CourseId.
# (The original implementation is kept below, commented out.)
#function getnum(){
# curl -s $Url>$EduFile
# grep '"pagesGoEnd"' $EduFile &>/dev/null
# if [ $? -eq 0 ]
# then
# num=`sed -rn 's#.*page=([0-9].*)" class="pagesGoEnd".*$#\1#gp' $EduFile`
# else
# num=`sed -rn 's|.*page=([0-9].*)#" class="pagesNum".*$|\1|gp' $EduFile`
# fi
# pagenum=${num:-1}
# CourseId=`echo $Url|awk -F "[-.]" '{print $4}'`
#}
# Defined curl html Functions
#function Curl(){
# getnum
# for i in `seq $pagenum`
# do
# curl "http://edu.51cto.com/index.php?do=course&m=lessions&course_id=$CourseId&page=$i" 1>>$EduFile 2>/dev/null
# done
#}
#分段没了,原函数保留,视频页抓一遍就好
# Download a page and append its body to the file named by $EduFile.
# Arguments: all arguments are joined into a single string and passed to
#            curl as one URL.
# Outputs:   curl's stdout is appended to $EduFile; its stderr is discarded.
Curl() {
  local target="$*"
  curl "$target" 1>>$EduFile 2>/dev/null
}
# Defined Create table Functions
function table(){
sum=""
index=1
sed -rn '/lesson/ s#