办公自动化24-爬取CMB网站理财产品的基本信息、投资报告并格式化输出

# -*- coding: utf-8 -*-
"""
Created on Aug 5 2020

@author: lizitingxue
"""

#基础包
import numpy as np
import pandas as pd
import os
import time
##爬虫
import re
import requests
from bs4 import BeautifulSoup
import urllib.request
from selenium import webdriver
##pdf处理
import winerror
from win32com import client as wc
from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT
##word信息处理
from docx import Document
from openpyxl import Workbook

#定义函数
# 定义函数
def get_code(key_word):
    url = 'http://www.cmbchina.com/cfweb/Personal/Default.aspx'
    js_bottom = "var q=document.documentElement.scrollTop=10000"
    
    browser = webdriver.Chrome()
    browser.get(url)
    time.sleep(3)
    browser.find_element_by_xpath('//*[@id="tbCondition"]/table[1]/tbody/tr[9]/td[2]/ul/li[2]/input[1]').click()
    browser.find_element_by_xpath('//*[@id="inputPrdKey"]').clear()
    browser.find_element_by_xpath('//*[@id="inputPrdKey"]').send_keys(key_word)
    browser.find_element_by_xpath('//*[@id="column_content"]/div[8]/input[1]').click()
    time.sleep(2)
    browser.execute_script(js_bottom)  
    time.sleep(2)
    browser.execute_script(js_bottom)   
    time.sleep(2) 
    ##产品名称和编号
    temp_for_code = browser.find_elements_by_css_selector("[class='leftArea']>[class='row']>[class='code']")
    print(key_word+'系列产品数量为'+str(len(temp_for_code)))
    temp_for_name = browser.find_elements_by_css_selector('[class="inlineBgImage"]>[class="inlineBriefName"]')
    temp_code = []
    temp_name = []
    for j in temp_for_code:
        temp_code.append(j.text)
    for i in temp_for_name:
        temp_name.append(i.text)
    
    ###产品pdf的下载链接
    url_lst = []
    counts = len(browser.find_elements_by_css_selector("[class='viewMore']"))
    # temp = browser.find_elements_by_css_selector("[class='viewMore']")
    for i in range(counts):
        #需要返回上一级
        browser.find_elements_by_css_selector("[class='viewMore']")[i].click()
        time.sleep(3)
        browser.switch_to.window(browser.window_handles[-1])
        browser.find_element_by_id('fmenu_7').click()
        temp_for_pdf = browser.find_elements_by_css_selector("[class='c_title']>[target = '_blank']")

        if len(temp_for_pdf)!= 0:
            pdf = temp_for_pdf[0].get_attribute('href')
            url_lst.append(pdf)
        else:
            url_lst.append("未披露")
        browser.close() 
        browser.switch_to.window(browser.window_handles[0]) 
        
    ######保存为excel
    tarDir = folder_path+"//"+key_word #希望生成code列表文件的位置",
    if not os.path.exists(tarDir):
        os.mkdir(tarDir)
    df = pd.DataFrame({"产品名称":temp_name,"产品代码":temp_code,"投资报告下载链接":url_lst})
    print(df)
    df.to_excel(tarDir+"\"+key_word+"产品代码和链接.xlsx",index = False)
    return url_lst



def get_pdf(link,key_word):
    #保存pdf
    try:
        r = requests.get(link)
        r.encoding = r.apparent_encoding
        html = r.text
        soup = BeautifulSoup(html,'html.parser')
        pro_name = soup.find(name='head',attrs={"id":"Head1"}).text.replace('
', '').replace('
', '').replace('	', '').replace(' ', '').split("--")[-1]
        link = soup.find_all(name='a',attrs={"href":re.compile(r'.pdf$')})
        if len(link) != 0:
            mylink = link[0]
#             name = mylink.string
#             print(name)
            url = mylink.get('href')
#             print(url)
            filename = pro_name
            filepath = folder_path + '/' +key_word+'/'+ pro_name +".pdf"
            urllib.request.urlretrieve(url, filename=filepath)
            print(pro_name+"报告下载完成~~~~~O(∩_∩)O~")
        else:
            print(pro_name+'报告下载失败01')
    except:
        print(pro_name+"报告下载失败02")

def pdf_to_word(my_dir):
    ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
    os.chdir(my_dir)
    file_lst = [x for x in os.listdir('.') if os.path.isfile(x) and os.path.splitext(x)[1]=='.pdf']
    for f in file_lst:
        src = os.path.abspath(f)#原来的地址
#         my_pdf = "招商银行零售青葵系列一年定开1号理财计划(产品代码:301025301025B301025C301025D)2020年第2季度报告.pdf"
        try:
            AvDoc = Dispatch("AcroExch.AVDoc")    

            if AvDoc.Open(src, ""):            
                pdDoc = AvDoc.GetPDDoc()
                jsObject = pdDoc.GetJSObject()
                new = os.path.join(my_dir, f.split(".")[0]+'.docx')
                jsObject.SaveAs(new, "com.adobe.acrobat.docx")#新的地址
                #word格式转换
                word = wc.Dispatch('Word.Application') 
                doc = word.Documents.Open(new) 
                doc.SaveAs(new.split(".")[0]+'T'+".docx",16) 
                #16对应于下表中的默认docx文件文件
                doc.Close() 
                word.Quit()
                #把原来的doc文件删除
                if os.path.exists(new):
                    os.remove(new)

        except Exception as e:
            print(str(e))

        finally:        
            AvDoc.Close(True)

            jsObject = None
            pdDoc = None
            AvDoc = None     
            
def date_trans(start):
    publish_Time = start.replace(' ','')
    array = time.strptime(publish_Time, u"%Y年%m月%d日")
    publishTime = time.strftime("%Y-%m-%d", array)
    return publishTime



def basic_info(file):
    wb = Workbook()
    sheet = wb.active
    header = ['产品名称', '业绩比较基准', '产品风险评级', '杠杆上限', '理财产品份额总额(亿元)', '理财产品成立日','理财产品到期日',
              '直接_债券','直接_一般债权','直接_资产支持证券','直接_现金及货币工具','直接_含权债权','直接_权益类','直接_商品及金融衍生品类','直接_合计',
              '间接_债券','间接_一般债权','间接_资产支持证券','间接_现金及货币工具','间接_含权债权','间接_权益类','间接_商品及金融衍生品类','间接_合计',]
    sheet.append(header)

    for doc in file:
        document = Document(folder_path + "\"+key_word + "\"+doc)
        # 读取word中的所有表格
        tables = document.tables
        #基本信息
        table0= tables[0]
        name = table0.cell(1, 1).text.strip().replace(' ','')[4:][:-4]
        compe = table0.cell(6, 1).text.strip().replace(' ','')[2:]
        risk = table0.cell(7,1).text.strip().replace(' ','')
        leverage = table0.cell(8,1).text.strip().replace(' ','')
        total = table0.cell(9,1).text.strip().replace(' ','')
        start = date_trans(table0.cell(10,1).text.strip().replace(' ',''))
        end = date_trans(table0.cell(11,1).text.strip().replace(' ','')[0:11])
        #投资标的
        invest = tables[1]
        print(doc+"投资表格行数"+str(len(invest.rows))+"-"*20)
        temp_direct = []
        temp_indirect = []
        for i in range(8): 
                try:
                    # 直接投资
                     # name = invest.cell(i+3,1)
                    amount = invest.cell(i+1,2).text.replace("
","")
                    temp_direct.append(amount)
                 # 间接投资
                    # name = invest.cell(i+3,1)
                    amount = invest.cell(i+1,4).text.replace("
","")
                    temp_indirect.append(amount)
                except Exception as error:
                    # 捕获异常,也可以用log写到日志里方便查看和管理
                    print(error)
                    continue
        #格式化输出基本信息
        row = [name,compe,risk,leverage,total,start,end]+temp_direct+temp_indirect
        sheet.append(row)    
    wb.save(file_path+'\'+key_word+'基本信息.xlsx')    

    
def nonstan_info(file):
    res = pd.DataFrame(columns = ['产品名称','项目名称','融资客户名称','交易日','项目剩余融资期限(单位:天)','报告期内到期收益率预估(%)','交易结构']) 
    for doc in file:
        document = Document(folder_path + "\"+key_word + "\"+doc)
        # 读取word中的所有表格
        tables = document.tables
        #非标信息
        name = tables[0].cell(1, 1).text.strip().replace(' ','')[4:][:-4]
        nonstan = tables[-2]
    #     print(len(nonstan.rows))#带表头
        nonstan_all = [[] for _ in range(len(nonstan.rows)-1)]
        if nonstan.cell(1,1).text != '-':
            for l in range(len(nonstan.rows)-1):
                for num in range(6):
                    nonstan_all[l].append(nonstan.cell(l+1,num).text.replace("
",'').replace(" ",""))
    #     print(nonstan_all)#不带表头
            print(name + '非标项目数量为{0}'.format(len(nonstan_all)))
    #     print("-"*20)

        if nonstan_all != [[]]:
            for x in range(len(nonstan_all)):
                temp = {'产品名称':name,'项目名称':nonstan_all[x][0],'融资客户名称':nonstan_all[x][1],'交易日':nonstan_all[x][2],'项目剩余融资期限(单位:天)':nonstan_all[x][3],'报告期内到期收益率预估(%)':nonstan_all[x][4],'交易结构':nonstan_all[x][5]}
                res = res.append(temp,ignore_index=True)

    res.to_excel (file_path+'\'+key_word +'非标投资情况.xlsx', index = False, header=True)

##爬取过程

#定义路径和爬取产品名称
folder_path = 'F:\PY\爬虫\投资报告备份' 
key_word = "卓远"

#爬取数据
url_lst = get_code(key_word)

#下载报告
for link in url_lst:
    if link != "未披露":
        get_pdf(link,key_word)

#pdf解析
my_dir = folder_path+'/'+key_word 
pdf_to_word(my_dir)

#提取投资报告信息
file_path = folder_path+"\"+key_word
file = [x for x in os.listdir(file_path) if os.path.isfile(x) and os.path.splitext(x)[1]=='.docx']
file

#保存信息
basic_info(file)
nonstan_info(file)

有问题欢迎留言哦~~~O(∩_∩)O

原文地址:https://www.cnblogs.com/lizitingxue/p/13442092.html