重磅----东方财富研报自动采集系统，附程序代码

优采云发布时间: 2022-06-13 12:00

　　我们直接看数据，打开东方财富研报界面。

　　我们以东方财富个股研报为例进行讲解。首先要获取研报链接，这一步比较复杂：页面没有直接提供下载链接，需要用接口返回的字段（如 infoCode、encodeUrl）拼接出来。具体需要解析的数据，请参考下面的代码。

　　我们看程序运行的效果。这个获取过程比较复杂，涉及的内容很多。

　　程序运行的图形界面

　　比如我们点击个股研报采集，输入要采集的文件数量，比如30

　　我们看采集的效果，不会重复采集。

　　我们看保存下来的文件，自动分类。

　　我们随便打开一个文件。

　　程序代码

"""Eastmoney research-report downloader with a small Tk menu front end.

Each menu entry asks for a page size, queries the Eastmoney report-list API
for reports dated today, and saves every report PDF that is not already on
disk into a per-category folder under ``<Desktop>/东方财富``.
"""
import json
import os
import time
import tkinter as tk
from xml import etree

import jsonpath
import matplotlib.pyplot as plt
import pandas as pd
import PySimpleGUI as sg
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from openpyxl import load_workbook

# Original hard-coded location; still preferred when it exists so that
# existing installations keep writing to the same place.
LEGACY_DESKTOP_DIR = r'C:\Users\Administrator\Desktop'
DESKTOP_DIR = (
    LEGACY_DESKTOP_DIR
    if os.path.isdir(LEGACY_DESKTOP_DIR)
    else os.path.join(os.path.expanduser('~'), 'Desktop')
)

main_name = '东方财富'
MAIN_DIR = os.path.join(DESKTOP_DIR, main_name)

LIST_URL = 'https://reportapi.eastmoney.com/report/list?'
NEW_STOCK_URL = 'https://reportapi.eastmoney.com/report/newStockList?'
JG_URL = 'https://reportapi.eastmoney.com/report/jg?'

# Seconds before a stalled HTTP request is abandoned.
REQUEST_TIMEOUT = 30

# Characters that cannot appear in a Windows file name are mapped to '_'.
_INVALID_FILENAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|\r\n\t'}
)


def _ensure_dir(path, label):
    """Create *path* if missing; otherwise report that *label* exists."""
    if os.path.isdir(path):
        print('{}文件夹已经存在'.format(label))
    else:
        os.makedirs(path, exist_ok=True)


def _safe_filename(name):
    """Return *name* with characters illegal in Windows file names replaced."""
    return name.translate(_INVALID_FILENAME_CHARS) or '_'


def _strip_jsonp(text):
    """Return the JSON payload inside a ``callback(...)`` JSONP wrapper.

    The payload is located by its enclosing parentheses, so the result does
    not depend on the length of the callback name. Text without a wrapper is
    returned unchanged.
    """
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end <= start:
        return text
    return text[start + 1:end]


def _today_string():
    """Return today's local date as ``year-month-day`` without zero padding."""
    now = time.localtime()
    return '{}-{}-{}'.format(now.tm_year, now.tm_mon, now.tm_mday)


def _ask_page_size():
    """Prompt for the page size; return ``None`` if cancelled or left blank."""
    page = sg.popup_get_text('输入要下载数据大小比如30')
    if page is None or not page.strip():
        return None
    return page.strip()


def _fetch_report_list(url, params):
    """Query the report-list API and return the records under ``data``.

    Returns an empty list when the response carries no records.

    Raises:
        requests.RequestException: on network failure or HTTP error status.
        ValueError: when the response body is not valid JSON/JSONP.
    """
    response = requests.get(url=url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    payload = json.loads(_strip_jsonp(response.text))
    data = payload.get('data') if isinstance(payload, dict) else None
    return data or []


def _stock_entry(record):
    """Build ``(file name, PDF URL)`` for stock/industry/new-stock records."""
    title = record.get('title') or ''
    stock_name = record.get('stockName') or ''
    token = (record.get('encodeUrl') or '').split('=')[0]
    pdf_url = 'https://pdf.dfcfw.com/pdf/H3_{}_1.pdf?{}.pdf'.format(
        record.get('infoCode'), token
    )
    return stock_name + title + '.pdf', pdf_url


def _jg_entry(record):
    """Build ``(file name, PDF URL)`` for records of the ``report/jg`` API."""
    title = record.get('title') or ''
    token = (record.get('encodeUrl') or '').split('=')[0]
    # NOTE(review): unlike _stock_entry, the original code puts the encodeUrl
    # prefix (not infoCode) into the path here; kept as-is, confirm with API.
    pdf_url = 'https://pdf.dfcfw.com/pdf/H3_{}_1.pdf?.pdf'.format(token)
    return title + '.pdf', pdf_url


def _download_reports(folder_name, entries):
    """Save each ``(file name, PDF URL)`` entry under the category folder.

    Files that already exist are skipped before any request is made. A
    failed download is reported and does not stop the remaining entries.
    """
    target_dir = os.path.join(MAIN_DIR, folder_name)
    _ensure_dir(target_dir, folder_name)
    if not entries:
        print('{}暂无数据'.format(folder_name))
        return
    for raw_name, pdf_url in entries:
        pdf_name = _safe_filename(raw_name)
        pdf_path = os.path.join(target_dir, pdf_name)
        if os.path.exists(pdf_path):
            print('{}文件已经存在，不采集'.format(pdf_name))
            continue
        # Write to a temporary name first so an interrupted download is not
        # mistaken for a finished file on the next run.
        partial_path = pdf_path + '.part'
        try:
            response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, pdf_path)
        except (requests.RequestException, OSError) as exc:
            print('采集失败{}'.format(pdf_name), exc)
            continue
        print(pdf_name, '下载完成')


def _collect(url, make_params, folder_name, build_entry):
    """Run one collection: prompt, query today's list, download the PDFs.

    Args:
        url: report-list endpoint to query.
        make_params: callable ``(page, today) -> dict`` of query parameters.
        folder_name: category folder created under the main folder.
        build_entry: callable mapping one record to ``(file name, PDF URL)``.
    """
    page = _ask_page_size()
    if page is None:
        return
    today = _today_string()
    try:
        records = _fetch_report_list(url, make_params(page, today))
    except (requests.RequestException, ValueError) as exc:
        print('采集失败{}'.format(folder_name), exc)
        return
    _download_reports(folder_name, [build_entry(r) for r in records])


def stock_now_report_pdf():
    """Download today's individual-stock reports."""
    _collect(
        LIST_URL,
        lambda page, today: {
            'cb': 'datatable1335389',
            'industryCode': '*',
            'pageSize': page,
            'industry': '*',
            'rating': '*',
            'ratingChange': '*',
            'beginTime': today,
            'endTime': today,
            'pageNo': '1',
            'fields': '',
            'qType': '0',
            'orgCode': '',
            'code': '*',
            'rcode': '',
            '_': '1653745465030',
        },
        '个股研报',
        _stock_entry,
    )


def industry_report_down_pdf():
    """Download today's industry reports."""
    _collect(
        LIST_URL,
        lambda page, today: {
            'cb': 'datatable1451407',
            'industryCode': '*',
            'pageSize': page,
            'industry': '*',
            'rating': '*',
            'ratingChange': '*',
            'beginTime': today,
            'endTime': today,
            'pageNo': '1',
            'fields': '',
            'qType': '1',
            'orgCode': '',
            'rcode': '',
            '_': '1654947298302',
        },
        '行业研报',
        _stock_entry,
    )


def new_stock_report_pdf():
    """Download today's new-stock reports."""
    _collect(
        NEW_STOCK_URL,
        # NOTE(review): this query asks for page 2 while every other one asks
        # for page 1; value kept from the original, confirm it is intended.
        lambda page, today: {
            'cb': 'datatable3277848',
            'pageSize': page,
            'beginTime': today,
            'endTime': today,
            'pageNo': '2',
            'fields': '',
            'qType': '4',
            'p': '2',
            'pageNum': '2',
            'pageNumber': '2',
            '_': '1654947808283',
        },
        '新股研报',
        _stock_entry,
    )


def cl_report_pdf_down():
    """Download today's strategy reports."""
    _collect(
        JG_URL,
        lambda page, today: {
            'cb': 'datatable6714376',
            'pageSize': page,
            'beginTime': today,
            'endTime': today,
            'pageNo': '1',
            'fields': '',
            'qType': '2',
            'orgCode': '',
            'author': '',
            'p': '1',
            'pageNum': '1',
            'pageNumber': '1',
            '_': '1654948156323',
        },
        '策略研报',
        _jg_entry,
    )


def hg_report_pdf_down():
    """Download today's macro-economy reports."""
    _collect(
        JG_URL,
        lambda page, today: {
            'cb': 'datatable7655083',
            'pageSize': page,
            'beginTime': today,
            'endTime': today,
            'pageNo': '1',
            'fields': '',
            'qType': '3',
            'orgCode': '',
            'author': '',
            'p': '1',
            'pageNum': '1',
            'pageNumber': '1',
            '_': '1654947750723',
        },
        '宏观经济研报',
        _jg_entry,
    )


def jscb_report_pdf_down():
    """Download today's brokerage morning reports."""
    _collect(
        JG_URL,
        lambda page, today: {
            'cb': 'datatable2280662',
            'pageSize': page,
            'beginTime': today,
            'endTime': today,
            'pageNo': '1',
            'fields': '',
            'qType': '4',
            'orgCode': '',
            'author': '',
            'p': '1',
            'pageNum': '1',
            'pageNumber': '1',
            '_': '1654949598388',
        },
        '券商晨报研报',
        _jg_entry,
    )


# --- GUI: a window whose single menu exposes the six collectors ---
root = tk.Tk()
root.wm_title('东方财富研报采集')
root.geometry('600x500')
menmenu = tk.Menu(root)
report_down = tk.Menu(menmenu)
menmenu.add_cascade(label='东方财富研报自动采集', menu=report_down)

# Make sure the main output folder exists before any collector runs.
_ensure_dir(MAIN_DIR, main_name)

report_down.add_command(label='东方财富个股研报采集', command=stock_now_report_pdf)
report_down.add_command(label='东方财富行业研报采集', command=industry_report_down_pdf)
report_down.add_command(label='东方财富新股研报采集', command=new_stock_report_pdf)
report_down.add_command(label='东方财富策略研报采集', command=cl_report_pdf_down)
report_down.add_command(label='东方财富宏观研报采集', command=hg_report_pdf_down)
report_down.add_command(label='东方财富券商晨报采集', command=jscb_report_pdf_down)
root['menu'] = menmenu
root.mainloop()

0

2022-06-13

采集自动组合

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

重磅----东方财富研报自动采集系统，附程序代码

0 个评论

发起人

AI时代内容工厂

重磅----东方财富研报自动采集系统，附程序代码

0 个评论

发起人

相关问题