全网邮箱email地址采集api接口及实现剖析

优采云 发布时间: 2020-08-09 18:14

  先上一个在线邮箱采集demo样例:

  

  这类在线工具的原理与普通的客户端采集工具(例如八爪鱼采集器等)是一样的,所以这里以这个在线工具作为参考进行实现剖析。

  邮箱采集原理:

  1、根据要采集的url地址,获取页面html内容,然后采用正则匹配出页面的url列表、邮箱地址列表。

  2、分两个进程:

  ①保存邮箱地址;

  ②分析采集子页面url的邮箱地址;

  基本源码(golang):

<p>

// CollectEmail is the collection entry point. It downloads the page at
// hosturl, extracts the e-mail addresses and phone numbers found in it, and
// gathers the links that point back into the same host so the caller can
// crawl them next.
//
// Returns:
//   - the EmailObj filled with Surl, Emails (de-duplicated) and Phones;
//   - the de-duplicated list of absolute same-host page URLs found on the page;
//   - a non-nil error when hosturl cannot be parsed, the page cannot be
//     fetched, or the GB-encoded body cannot be converted to UTF-8.
func CollectEmail(hosturl string) (EmailObj, []string, error) {
	var emailObj EmailObj
	var inhost []string

	// Parse the entry URL first: its scheme/host decide which links are "internal".
	uparse, err := url.Parse(hosturl)
	if err != nil {
		return emailObj, inhost, err
	}
	emailObj.Surl = hosturl

	bodystr, err := HttpGetV2(hosturl)
	if err != nil {
		// Keep the historical message as a prefix but do not hide the cause.
		return emailObj, inhost, errors.New("get request error: " + err.Error())
	}

	// Detect GB-family encodings (GBK / GB2312 / GB18030). The check is
	// case-insensitive because pages commonly declare "charset=GB2312", and it
	// looks for "gb2312" on its own to cover the quoted form charset="gb2312".
	lowerBody := strings.ToLower(bodystr)
	if strings.Contains(lowerBody, "charset=gb") || strings.Contains(lowerBody, "gb2312") {
		decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes([]byte(bodystr))
		if err != nil {
			return emailObj, inhost, errors.New("simplifiedchinese coding change error: " + err.Error())
		}
		bodystr = string(decodeBytes)
	}

	// E-mail addresses found on the page.
	emailObj.Emails = append(emailObj.Emails, matchEmail(bodystr)...)

	// Contact phone numbers found on the page.
	emailObj.Phones = append(emailObj.Phones, matchPhone(bodystr)...)

	// Links to inner pages of the same site.
	for _, item := range matchUrls(bodystr) {
		if link, ok := resolveInternalLink(uparse, item); ok {
			inhost = append(inhost, link)
		}
	}

	inhost = RemoveRepeatedElement(inhost)
	emailObj.Emails = RemoveRepeatedElement(emailObj.Emails)

	return emailObj, inhost, nil
}

// resolveInternalLink turns one raw link found on the page into an absolute
// URL on the same host as base. ok is false when the link should not be
// crawled: unparsable, a script/stylesheet asset, a non-HTTP scheme
// (mailto:, javascript:, tel:, ...), a fragment-only link, or another host.
func resolveInternalLink(base *url.URL, raw string) (link string, ok bool) {
	ref, err := url.Parse(raw)
	if err != nil {
		return "", false
	}

	// Match on the extension only, so pages such as "list.jsp" or "data.json"
	// are not mistaken for ".js" assets.
	lowerPath := strings.ToLower(ref.Path)
	if strings.HasSuffix(lowerPath, ".js") || strings.HasSuffix(lowerPath, ".css") {
		return "", false
	}

	switch ref.Scheme {
	case "http", "https":
		// Already absolute: keep the link exactly as written when it stays on this host.
		if !strings.EqualFold(ref.Host, base.Host) {
			return "", false
		}
		return raw, true

	case "":
		// An empty or fragment-only reference points at the current page.
		if ref.Host == "" && ref.Path == "" && ref.RawQuery == "" {
			return "", false
		}
		// Resolve relative and protocol-relative references against the page
		// URL; this keeps the query string and honours the page's directory.
		abs := base.ResolveReference(ref)
		abs.Fragment = ""
		if !strings.EqualFold(abs.Host, base.Host) {
			return "", false
		}
		return abs.String(), true

	default:
		// mailto:, javascript:, tel:, ftp:, ... are not crawlable pages.
		return "", false
	}
}

func matchEmail(str string) (email []string) {

var emailList []string

//re, _ := regexp.Compile("\\

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线