A Roundup of Handy Python Scripts for SEO Work
Life is short, I use Python. Python is like the runes or gear in Honor of Kings: a powerful weapon that helps you clear jungle camps more efficiently, level up faster, and use the level advantage to find a breakthrough.
Use case 1: Filter sensitive, irrelevant, and other miscellaneous keywords
# -*- coding: utf-8 -*-
op_txt = open('done.txt', 'a', encoding='utf-8')

class NaiveFilter():
    def __init__(self):
        self.keywords = set()

    def parse(self, path):
        # Load one blocked word per line from the blocklist file
        for keyword in open(path, encoding='utf-8'):
            self.keywords.add(keyword.strip().lower())

    def filter(self, message, replss='*'):
        message = message.lower()
        hit = False
        for k in self.keywords:
            if k and k in message:
                hit = True
                message = message.replace(k, replss)
        if not hit:
            # Only keywords containing no blocked word are written out
            op_txt.write('%s\n' % message)
        print(message)

if __name__ == '__main__':
    f = NaiveFilter()
    f.parse('keywords')  # the "keywords" file holds the sensitive/unwanted words
    for word in [i.strip() for i in open('hotword.txt', encoding='utf-8')]:  # hotword.txt is the keyword list to filter
        f.filter(word)
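In other words, done.txt ends up as the cleaned keyword list, while any keyword that did match a blocked term is only printed with the match masked by *.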
Use case 2: Use jieba segmentation to compute high-frequency words and TF-IDF
# coding=utf-8
import jieba
import jieba.analyse  # jieba segmentation and keyword-extraction modules

output = open('words.csv', 'a', encoding='utf-8')
output.write('word,frequency,weight\n')
stopkeyword = [line.strip() for line in open('stop.txt', encoding='utf-8')]  # load the stopword file into a list
text = open('new.txt', encoding='utf-8').read()  # the text to analyse

# Count term frequency for every token from search-engine-style segmentation
zidian = {}
for fc in jieba.cut_for_search(text):
    zidian[fc] = zidian.get(fc, 0) + 1

# Compute TF-IDF and keep the top 30 terms with their weights
tfidf = jieba.analyse.extract_tags(text, topK=30, withWeight=True)

# Write to CSV, skipping stopwords
for word, weight in tfidf:
    if word in stopkeyword:
        continue
    freq = zidian.get(word, 'not found')
    print(word, freq, str(int(weight * 100)) + '%')
    output.write('%s,%s,%s\n' % (word, freq, str(int(weight * 100)) + '%'))
Use case 3: Targeted, scheduled collection of updates
# coding:utf-8
import re, time, requests
from bs4 import BeautifulSoup

str_time = time.strftime('%Y-%m-%d', time.localtime())
op_txt = open('url.txt', 'a')
url = 'http://www.xxx.com/sitemap/group.htm'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")

# Load the URLs collected on earlier runs so they are not written twice
zidian = {}
c = 0
with open('url.txt') as f:
    for i in f.readlines():
        i = i.strip()
        zidian[i] = c
        c += 1

for urllist in re.findall(re.compile(r'.*?href="(.*?)" target="_blank">(.*?)</a>'), str(soup)):
    url_data = urllist[0].strip()
    title = urllist[1]
    if '2019' in title:
        print(title, url_data)
        if url_data in zidian:
            print('No update ' + str_time)
            continue
        else:
            print('Updated successfully ' + str_time)
            op_txt.writelines('%s\n' % url_data)
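The script above only does a single pass over the sitemap page; the "scheduled" part of this use case has to come from outside the script. A minimal sketch of one way to do it is below, assuming the scraping code is wrapped in a collect() function (the function name and the 24-hour interval are illustrative, not from the original article); a cron job or Windows Task Scheduler entry calling the script works just as well.

# coding:utf-8
import time

def collect():
    # Wrap the requests/BeautifulSoup collection logic from above here
    pass

if __name__ == '__main__':
    while True:
        collect()                 # run one collection pass
        time.sleep(24 * 60 * 60)  # wait 24 hours before the next pass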
Use case 4: Generate a sitemap file for millions of URLs in one click
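The code for this use case is not included in the source article. As a minimal sketch of the idea, assuming the collected URLs sit in a plain-text file with one URL per line (the urls.txt and sitemap.xml file names are illustrative), the list can be streamed into standard sitemap XML as shown below. Note that the sitemap protocol caps a single file at 50,000 URLs, so a million-URL list would additionally need to be split into several files plus a sitemap index, which is omitted here.

# coding=utf-8
# Minimal sitemap sketch: urls.txt (one URL per line) -> sitemap.xml.
# File names are illustrative; splitting into 50,000-URL chunks and
# writing a sitemap index is left out for brevity.
header = ('<?xml version="1.0" encoding="UTF-8"?>\n'
          '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
footer = '</urlset>\n'

with open('urls.txt', encoding='utf-8') as f, open('sitemap.xml', 'w', encoding='utf-8') as out:
    out.write(header)
    for line in f:
        url = line.strip()
        if url:
            out.write('  <url><loc>%s</loc></url>\n' % url)
    out.write(footer)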
Use case 5: Merge all log files in a directory
# coding=utf-8
import os
import sys
import glob

def dirTxtToLargeTxt(dir, outputFileName):
    '''Read every .txt file under dir and append them all to outputFileName.'''
    # Return an error if dir is not a directory
    if not os.path.isdir(dir):
        print("Bad argument: %s is not a directory" % dir)
        return False
    # List all txt files in dir and append their lines to the output file
    outputFile = open(outputFileName, "a", encoding='utf-8')
    for txtFile in glob.glob(os.path.join(dir, "*.txt")):
        print(txtFile)
        inputFile = open(txtFile, "r", encoding='utf-8')
        for line in inputFile:
            outputFile.write(line)
    return True

if __name__ == "__main__":
    # The original snippet is cut off after "if len(sys.argv)"; the argument
    # check below is a reasonable reconstruction, not the author's original code.
    if len(sys.argv) < 3:
        print("Usage: python merge_logs.py <directory> <output_file>")
    else:
        dirTxtToLargeTxt(sys.argv[1], sys.argv[2])