网页视频抓取工具 知乎(哥不穿内裤:你看过哪些和以往认知大相径庭的科普视频?)

优采云 发布时间: 2022-01-04 20:03

  网页视频抓取工具 知乎(哥不穿内裤:你看过哪些和以往认知大相径庭的科普视频?)

  例子(这个id名是得到的...)

  不穿内裤的小哥:你看过哪些和之前认知大不相同的科普视频?

  Q:如何知道浏览器获取视频的全过程?

  答:打开火狐浏览器的调试工具(按 F12),选择【网络】面板,然后逐个查看浏览器发出的每一个 GET 和 POST 请求。

  过程:

  # Request the video metadata: the URL is $main with $pgcode appended, and
  # the trailing key/value pair is handed to LWP::UserAgent::get, which sends
  # such pairs as request header fields (here "authorization" => $oauth).
  my $res = $ua->get(

$main .$pgcode,

"authorization" => $oauth,

);

# Decode the response body from JSON into a Perl data structure.
my $data = decode_json( $res->content );

# Pick the "sd" entry of the playlist and take its play_url.
my $play_url = $data->{playlist}->{sd}->{play_url}; # URL of the m3u8 playlist

  完整代码:

=info

Author: 523066680

Date: 2018-05

=cut

use Modern::Perl;

use LWP::UserAgent;

use File::Slurp;

use JSON;

# Unbuffered STDOUT so the download progress counter appears immediately.
STDOUT->autoflush(1);

# All downloaded files are written relative to this working directory.
goto_dir("D:/temp");

# Shared configuration, declared with "our" so the subs below can reach it.
our $main = "https://lens.zhihu.com/api/videos/";   # video API base URL
our $ua = LWP::UserAgent->new( );                   # shared HTTP client
our $target = "https://www.zhihu.com/question/271736973/answer/391332001";

# Fetch the answer page. Stop right away when the request fails; otherwise
# the body of an error response would be scanned as if it were the page.
my $res = $ua->get( $target );
die "Failed to fetch $target: ", $res->status_line, "\n"
    unless $res->is_success();

my $html = $res->content();

my @video = $html=~/>https:.*?video\/(\d+)get(

$main .$pgcode,

"authorization" => $oauth,

);

die unless $res->is_success();

my $data = decode_json( $res->content );

my $play_url = $data->{playlist}->{sd}->{play_url}; # m3u8 url

my $pre_url;

# 获取网址共用部分

$play_url =~/(.*?\w{32})/;

$pre_url = $1 ."/";

$res = $ua->get( $play_url );

my @vlinks = $res->content =~/\n(.*?\d+\.ts.*?)\n/g;

grep { $_ = $pre_url . $_ } @vlinks;

return $pgcode, @vlinks;

}

# 获取视频切片,合并

# Download every video segment in order and merge them into "<name>.ts".
#
# Arguments:
#   $name  - output file name without the ".ts" extension
#   @links - segment URLs, in the order they must be concatenated
#
# Requests go through the package-level $ua. A countdown of the segments
# still pending is printed to STDOUT. Dies as soon as one segment request
# fails, so the body of an error response is never spliced into the video
# data and no partial file is written.
sub get_video
{
    our $ua;
    my ( $name, @links ) = @_;

    my $buff = "";

    # Iterate by index instead of "while (my $link = shift)": the latter
    # stops early on any link that evaluates to false.
    for my $idx ( 0 .. $#links )
    {
        my $link = $links[$idx];

        # Segments left after this one (same countdown as before).
        my $remaining = $#links - $idx;
        print "$remaining ";

        my $res = $ua->get( $link );
        die "Failed to fetch segment $link: ", $res->status_line, "\n"
            unless $res->is_success();

        $buff .= $res->content();
    }
    print "\n";

    # Raw mode: the segments are binary data and must not be translated.
    return write_file( "${name}.ts", {binmode=>":raw"}, $buff );
}

# Extract the "authorization" token embedded in the page's main.app script.
#
# Argument:
#   $html - HTML of the answer page
#
# Returns the token string (for example "oauth c3cef7c6..."), or undef when
# the script URL is not present in the page, the script cannot be downloaded,
# or the script contains no token. Requests go through the package-level $ua.
sub get_oauth
{
    our ( $ua );
    my $html = shift;

    my $oauth;
    return $oauth unless defined $html;

    # Locate the URL of the main.app JavaScript bundle. The previous pattern
    # had lost the contents of its negated character classes ("[^]+...[^]"),
    # which Perl reads as one large bracket class, so it could never match
    # this URL. Excluding quotes, angle brackets and whitespace keeps the
    # match inside a single attribute value.
    # NOTE(review): the exact characters of the original class are unknown.
    my ($js) = $html =~ m{(https:[^"'<>\s]+main\.app[^"'<>\s]+js)};

    if ( defined $js )
    {
        my $res = $ua->get( $js );

        # pattern: authorization:"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"}
        ($oauth) = $res->content =~/authorization:"([^"]{30,})"/
            if $res->is_success();
    }

    return $oauth;
}

# Make $dir the current working directory, creating it first when missing.
#
# Argument:
#   $dir - directory path; mkdir creates only the final path component
#
# Returns true on success. Dies when the directory cannot be created or
# entered, so that later output never silently lands in another directory.
sub goto_dir
{
    my $dir = shift;

    die "goto_dir: no directory given\n"
        unless defined $dir && length $dir;

    # Test with -d rather than -e: a plain file of the same name must not be
    # mistaken for an existing directory.
    unless ( -d $dir )
    {
        mkdir $dir or die "Cannot create directory '$dir': $!\n";
    }

    chdir $dir or die "Cannot change to directory '$dir': $!\n";

    return 1;
}

__DATA__

  2018-10-16 更新:现在更简单了,视频由多个 ts 切片变为单个 MP4 文件,直接下载即可。

=info

Author: 523066680

2018-07 知乎去掉了 oauth 授权方式

2018-10 从 ts 多文件,变更为 mp4 单文件下载

=cut

use JSON;

use Encode qw/from_to/;

use LWP::UserAgent;

use Mojo::DOM;

use File::Slurp;

# Unbuffered STDOUT so progress output appears immediately.
STDOUT->autoflush(1);

# Shared configuration, declared with "our" so the subs below can reach it.
our $wdir = "D:/temp";                              # working directory
our $main = "https://lens.zhihu.com/api/videos/";   # video API base URL
our $ua = LWP::UserAgent->new();                    # shared HTTP client

# Answer page to scan; the commented-out lines are alternative targets.
our $target = "https://www.zhihu.com/question/271736973/answer/389377346";

#our $target = "https://www.zhihu.com/question/285103979/answer/492401516";

#our $target = "https://www.zhihu.com/question/278030511/answer/452274063";

# Fetch the answer page. Stop right away when the request fails; otherwise
# the body of an error response would be scanned as if it were the page.
my $res = $ua->get( $target );
die "Failed to fetch $target: ", $res->status_line, "\n"
    unless $res->is_success();

my $html = $res->content();

my @video = $html=~/>https:.*?video\/(\d+)get( $main .$pgcode );

die unless $res->is_success();

my $data = decode_json( $res->content );

my $play_url = $data->{playlist}->{sd}->{play_url};

$res = $ua->get( $play_url );

write_file( $fname, {binmode=>":raw"}, $res->content );

}

# Derive a title string from the page HTML.
#
# Argument:
#   $html - HTML of the answer page
#
# Returns the text of the <title> element with the site-name suffix removed
# and converted in place from UTF-8 to GBK by Encode::from_to.
# Dies with a descriptive message when the page has no <title> element,
# instead of failing on a method call against undef.
sub get_title_name
{
    my $html = shift;

    my $dom  = Mojo::DOM->new($html);
    my $node = $dom->at("title")
        or die "get_title_name: no <title> element found in page\n";

    my $title = $node->text;

    # Drop the site-name suffix that follows the question title.
    $title =~s/ - 知乎//;

    # from_to converts the string in place.
    from_to( $title, "utf8", "gbk" );

    return $title;
}

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线