Python article-scraping example (crawling http://infoq.com)


  I wrote a small program to collect articles. The idea: read the RSS feed that InfoQ provides, then download each article through the links listed in the feed.

  RSS feed URL: http://www.infoq.com/cn/rss/rss.action?token=v4OEYqEXG7ltwOPp5IpH34Ky6WdtpXqz

  <?xml version="1.0" encoding="UTF-8"?>
  <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
    <channel>
      <title>Personalized InfoQ RSS feed for unregistered users - please register to upgrade!</title>
      <link>http://www.infoq.com/cn/</link>
      <description>This RSS feed is personalized and unique to your account on InfoQ.com (whether registered or not). You can pick the communities you are interested in from the "Your Community" box in the left-hand column of the InfoQ site, and you can filter out content you do not care about by switching off sub-topics and tags. Your choices determine the news this feed shows: it will match what you see in the news column in the middle of the home page. If your feed does not reflect this personalization, the feed link you are using is probably not associated with your InfoQ account. To make sure you are using the right feed, register on InfoQ first and then get a new RSS feed URL from the "Personalized RSS" link in the site's left-hand menu. Enjoy!</description>
      ...
      <item>
        <title>PetaPoco: a micro-ORM for .NET</title>
        <link>http://www.infoq.com/cn/news/2011/06/petapoco</link>
        <description>PetaPoco is an object-relational mapper (ORM) for .NET applications. Unlike full-featured ORMs such as NHibernate or Entity Framework, PetaPoco favors ease of use and performance over breadth of features. Using it only requires dropping in a single C# file; it works with strongly typed POCOs and supports ...</description>
      </item>
      .........
    </channel>
  </rss>
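  Before the full program, a minimal sketch of pulling the item titles and links out of a document like the one above with xml.dom.minidom (the FEED string and the text() helper are my stand-ins, not part of the original program):

  # -*- coding: utf-8 -*-
  # Minimal sketch: extract item titles/links from an RSS document with minidom.
  # FEED is a trimmed, hypothetical stand-in for the real InfoQ feed.
  from xml.dom.minidom import parseString

  FEED = '''<?xml version="1.0" encoding="UTF-8"?>
  <rss version="2.0">
    <channel>
      <item>
        <title>PetaPoco: a micro-ORM for .NET</title>
        <link>http://www.infoq.com/cn/news/2011/06/petapoco</link>
      </item>
    </channel>
  </rss>'''

  def text(nodes):
      # concatenate the text children of a node list
      return "".join(n.data for n in nodes if n.nodeType == n.TEXT_NODE)

  doc = parseString(FEED)
  for item in doc.getElementsByTagName("item"):
      title = text(item.getElementsByTagName("title")[0].childNodes)
      link = text(item.getElementsByTagName("link")[0].childNodes)
      print title, link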

  Fetching the feed returns a standard RSS XML document. The program parses the XML for each article's metadata, then fetches and parses each article page, downloads the images it references, and finally saves the article to a MySQL database.
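  One detail worth noting before the listing: image URLs on the site can carry a ";..." suffix (something like ";jsessionid=...", judging by the split in the code), which is stripped when deriving the local file name. A quick sketch of that mapping, with a made-up URL:

  # Hypothetical image URL; the ";" suffix is dropped from the local file name.
  img = "http://www.infoq.com/resource/petapoco/diagram.png;jsessionid=ABC123"
  local = "E:/image/" + img.split("/")[-1].split(";")[0]
  print local   # -> E:/image/diagram.png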

  Here is the code:

  #! /usr/bin/env python
  # -*- coding: utf-8 -*-
  # Python 2 script: read the InfoQ RSS feed, download every article it lists,
  # save the images locally and store the article in MySQL.

  import urllib
  import re, sys
  import string
  from xml.dom.minidom import parseString
  from sgmllib import SGMLParser
  import MySQLdb

  reload(sys)
  sys.setdefaultencoding('utf8')


  class Constants():
      # site root
      HTML_SITE = "http://www.infoq.com"
      # RSS feed (the aggregated resource)
      HTML_RESOURCE = HTML_SITE + "/cn/rss/rss.action?token=v4OEYqEXG7ltwOPp5IpH34Ky6WdtpXqz"
      # database host
      DB_HOST = "localhost"
      # database user
      DB_USER = "root"
      # database password
      DB_PASSWORD = "root"
      # database name
      DB_DATABASE = "test"
      # connection character set
      CHARSET = "utf8"
      # proxy server
      PROXY_ADRESS = ""
      # proxy user
      PROXY_USERNAME = ""
      # proxy password
      PROXY_PASSWORD = ""
      # local directory for downloaded images
      IMG_LOCALDSTDIR = "E:/image/"


  # collects the src attribute of every <img> tag in a page
  class ListUrls(SGMLParser):
      def reset(self):
          self.imgs = []
          SGMLParser.reset(self)

      def start_img(self, attrs):
          src = [v for k, v in attrs if k == 'src']
          if src:
              self.imgs.extend(src)


  # database helper
  class DBUTIL():
      def getConnectionDB(self):
          try:
              conn = MySQLdb.connect(host=Constants.DB_HOST, user=Constants.DB_USER,
                                     passwd=Constants.DB_PASSWORD, db=Constants.DB_DATABASE,
                                     charset=Constants.CHARSET)
              return conn
          except:
              print "ERROR: getConnectionDB failed"


  # article object: holds what was scraped from the site until it is stored in the DB
  class actrict():
      title = ''
      link = ''
      description = ''
      creator = ''
      createdate = ''
      identifier = ''
      content = ''


  class webcrawlerhttpurl():
      # fetch the raw content of a URL
      def getUrlInfo(self, weburl):
          inforMation = None
          try:
              #proxyConfig = 'http://%s:%s@%s' % (Constants.PROXY_USERNAME, Constants.PROXY_PASSWORD, Constants.PROXY_ADRESS)
              #inforMation = urllib.urlopen(weburl, proxies={'http': proxyConfig})
              inforMation = urllib.urlopen(weburl)
              status = inforMation.getcode()
              if status == 200:
                  html = inforMation.readlines()
                  return html
              else:
                  return 'ERROR: get web %s failed, status=%s' % (weburl, status)
          except:
              print 'ERROR: get web %s failed' % (weburl)
          finally:
              if inforMation:
                  inforMation.close()

      # extract title and body from an article page
      def parseHtml(self, html, link):
          content = ''
          try:
              # html is a list of lines; join it into one string
              document = ""
              for line in html:
                  if line.split():
                      document = document + line

              # title: the text between <title> and </title>
              title = document[re.search("title>", document).end():]
              title = title[:re.search("title>", title).end() - 8]

              # body: the block between the "box-content-5" and "bottom-corners" markers
              content = document[re.search("box-content-5", document).end():]
              content = content[:re.search("bottom-corners", content).end()]
              # escape single quotes so the text survives the INSERT statement
              content = content.replace("'", "\\'")
          except:
              print 'ERROR: parseHtml failed %s' % (link)
          return content

      # parse the RSS feed and visit every item it lists
      def parseRessXml(self, xml_file):
          # xml_file is a list of lines; join it into one string
          document = ""
          for line in xml_file:
              document = document + line
          doc = parseString(document)
          pkgs = doc.getElementsByTagName("item")
          # walk over every item in the feed
          i = 0
          for pkg in pkgs:
              try:
                  i = i + 1
                  print '-------------------PARSE HTML (%s)-----------------' % (i)
                  title = pkg.getElementsByTagName("title")
                  title = self.getText(title[0].childNodes)
                  link = pkg.getElementsByTagName("link")
                  link = self.getText(link[0].childNodes)
                  description = pkg.getElementsByTagName("description")
                  description = self.getText(description[0].childNodes)
                  creator = pkg.getElementsByTagName("dc:creator")
                  creator = self.getText(creator[0].childNodes)
                  createdate = pkg.getElementsByTagName("dc:date")
                  createdate = self.getText(createdate[0].childNodes)
                  identifier = pkg.getElementsByTagName("dc:identifier")
                  identifier = self.getText(identifier[0].childNodes)

                  # skip the article if it is already in the database
                  conn = DBUTIL().getConnectionDB()
                  cur = conn.cursor()
                  SQL = "SELECT COUNT(1) FROM ARTICLES WHERE identifier='%s'" % (identifier)
                  cur.execute(SQL)
                  alldata = cur.fetchall()
                  if alldata[0][0] != 0:
                      print "WARNING: article already exists in the DB"
                      continue

                  # fetch the article page and extract its body
                  content = self.parseHtml(self.getUrlInfo(link), link)
                  # download the images the article references
                  lister = ListUrls()
                  lister.feed(content)
                  self.saveimg(lister.imgs)
                  # rewrite the image links to point at the local copies
                  for img in lister.imgs:
                      content = content.replace(img, Constants.IMG_LOCALDSTDIR + "/" + img.split("/")[-1].split(";")[0])

                  # pack everything into the article object
                  actrict.title = title
                  actrict.link = link
                  actrict.identifier = identifier
                  actrict.description = description
                  actrict.createdate = createdate
                  actrict.creator = creator
                  actrict.content = content
                  # and store it in the local database
                  self.putDB(actrict)
              except:
                  print "ERROR: parseRessXml failed %s" % (link)

      # concatenate the text nodes of an XML node list
      def getText(self, nodelist):
          rc = ""
          for node in nodelist:
              if node.nodeType == node.TEXT_NODE:
                  rc = rc + node.data
          return rc

      # save image files to the local directory
      def saveimg(self, imgs):
          for img in imgs:
              inforMation = None
              File = None
              try:
                  # turn relative links into absolute ones
                  if string.find(img, 'http') != 0:
                      img = Constants.HTML_SITE + img
                  DstDir = Constants.IMG_LOCALDSTDIR
                  # drop any ";..." suffix from the file name
                  imgPath = DstDir + img.split("/")[-1].split(";")[0]
                  print imgPath
                  File = open(imgPath, "wb")
                  #proxyConfig = 'http://%s:%s@%s' % (Constants.PROXY_USERNAME, Constants.PROXY_PASSWORD, Constants.PROXY_ADRESS)
                  #inforMation = urllib.urlopen(img, proxies={'http': proxyConfig})
                  inforMation = urllib.urlopen(img)
                  jpg = inforMation.read()
                  File.write(jpg)
                  print "INFO: saved image " + imgPath
              except:
                  print "ERROR: saving image failed: %s" % (img)
              finally:
                  if inforMation:
                      inforMation.close()
                  if File:
                      File.close()

      # store an article in the database
      def putDB(self, actrict):
          title = actrict.title
          link = actrict.link
          identifier = actrict.identifier
          description = actrict.description
          createdate = actrict.createdate
          creator = actrict.creator
          content = actrict.content
          print title
          conn = None
          cur = None
          try:
              conn = DBUTIL().getConnectionDB()
              cur = conn.cursor()
              SQL = "INSERT INTO ARTICLES(title,link,identifier,description,createdate,creator,content) VALUES \
  ('%s','%s','%s','%s','%s','%s','%s')" % (title, link, identifier, description, createdate, creator, content)
              cur.execute(SQL)
              conn.commit()
              print "INFO: article saved successfully"
          except:
              print "ERROR: saving article failed"
          finally:
              if cur:
                  cur.close()
              if conn:
                  conn.close()


  if __name__ == "__main__":
      webcrawler = webcrawlerhttpurl()
      xml_file = webcrawler.getUrlInfo(Constants.HTML_RESOURCE)
      webcrawler.parseRessXml(xml_file)


  '''
  Table definition:

  CREATE TABLE `ARTICLES` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `title` varchar(500) DEFAULT NULL COMMENT 'article title',
    `link` varchar(500) DEFAULT NULL COMMENT 'full article link',
    `description` varchar(5000) DEFAULT NULL COMMENT 'description',
    `creator` varchar(200) DEFAULT NULL COMMENT 'author',
    `createdate` varchar(200) DEFAULT NULL COMMENT 'publish date',
    `identifier` varchar(500) DEFAULT NULL COMMENT 'key used to decide whether an article already exists',
    `content` longtext COMMENT 'body',
    PRIMARY KEY (`id`)
  ) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8
  '''
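  The listing above is Python 2 (urllib.urlopen, print statements, MySQLdb), and sgmllib was removed in Python 3. For readers on Python 3, here is a rough equivalent of the ListUrls image collector using html.parser; this is my sketch, not part of the original program:

  # Python 3 sketch of the ListUrls image collector (sgmllib is Python 2 only).
  from html.parser import HTMLParser

  class ListUrls(HTMLParser):
      def __init__(self):
          super().__init__()
          self.imgs = []

      def handle_starttag(self, tag, attrs):
          # collect the src attribute of every <img> tag
          if tag == "img":
              self.imgs.extend(v for k, v in attrs if k == "src" and v)

  lister = ListUrls()
  lister.feed('<p><img src="/resource/a.png"><img src="http://www.infoq.com/b.jpg"></p>')
  print(lister.imgs)   # ['/resource/a.png', 'http://www.infoq.com/b.jpg']

  Remember to create the ARTICLES table (see the CREATE TABLE statement at the end of the listing) before running the script.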

  ------------------------------

  

  Source code download:
