Baidu Baike - Person Data Collection

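The following script looks up a person's page on Baidu Baike, checks whether the name is ambiguous (refers to several people), and otherwise saves the summary, infobox fields, relationships, and biography sections to a JSON file.
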
import json
import os
import re

import requests
from urllib.parse import quote

from bs4 import BeautifulSoup
from pyquery import PyQuery as pq


class BaiDuPerson:
    def __init__(self, name):
        self.temp_url = 'https://baike.baidu.com/search/word?word='
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        self.response = ''
        self.save_path = r'E:\百度json文件'  # output directory for the JSON files
        self.name = name
        self.run()

    def get_response(self):
        url = self.temp_url + quote(self.name)
        response = requests.get(url=url, headers=self.headers)
        self.response = response.content.decode('utf8')

    def check_ambiguity(self):
        """Check whether the name is ambiguous, i.e. refers to multiple people."""
        doc = pq(self.response)
        # Baidu Baike renders a polysemant list when a term has multiple senses
        ul = doc('.polysemantList-wrapper.cmn-clearfix')
        return bool(ul)

    def get_introduction(self):
        """
        Get the lemma summary (introduction).
        """
        soup = BeautifulSoup(self.response, "lxml")
        try:
            result = soup.select(".lemma-summary")[0].get_text().strip()
        except IndexError:
            result = ''
        return result

    def get_person_lifetime(self):
        """
        Get the biography data, organized by <h2>/<h3> section titles.
        """
        res = self.response.split('<h2 class="title-text"')
        h2_dict = {}
        if len(res) == 1:
            # no <h2> sections: collect all paragraphs under '生平' (biography)
            doc = pq(self.response)
            content = doc('.para').text()
            h2_dict['生平'] = content
        else:
            for h2 in res[1:]:
                tmp2 = {}
                # truncate trailing page chrome (albums, references, footer widgets)
                for marker in ('<div class="album-list">',
                               '<dt class="reference-title"',
                               '<div class="rs-container-foot"',
                               '<div class="tashuo-bottom"',
                               '<div class="go-auth-box"',
                               '<div class="side-content">'):
                    if marker in h2:
                        h2 = h2.split(marker)[0]
                h2 = '<h2 class="title-text"' + h2
                soup = BeautifulSoup(h2, "lxml")
                h2_key = soup.find("h2").get_text().replace(self.name, '').strip()
                h3_dict = {}
                if "<h3" in h2:
                    for h3 in h2.split("<h3")[1:]:
                        tmp3 = {}
                        h3 = "<h3" + h3
                        soup = BeautifulSoup(h3, "lxml")
                        replace = soup.find("h3").get_text()
                        h3_title = replace.replace(self.name, '').strip()
                        if "<ul" in h3:
                            res = h3.split("<ul")
                            ul_dict = {}
                            for ul in res[1:]:
                                ul = "<ul" + ul
                                soup = BeautifulSoup(ul, "lxml")
                                ul_title = soup.find("ul").get_text().replace(self.name, '').strip()
                                tmp1 = {}

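                                # chunks after each </ul> hold that heading's value entries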
                                for item in ul.split("</ul>")[1:]:
                                    v_list = []  # store multiple relations
                                    soup = BeautifulSoup(item, "lxml")
                                    ul_vlist = soup.find_all("div")
                                    for i in ul_vlist:
                                        ul_v = i.get_text().replace("\xa0", '')
                                        # strip citation superscripts like [1]
                                        for shangbiao in re.findall(r"\[\d+\]", ul_v):
                                            ul_v = ul_v.replace(shangbiao, "")
                                        if ul_v:
                                            v_list.append(ul_v)
                                    tmp1[ul_title] = v_list
                                ul_dict.update(tmp1)
                            h3_dict.update(ul_dict)
                        else:
                            h3_v = soup.get_text().replace(replace, "").replace("\xa0", '')
                            # strip citation superscripts like [1]
                            for shangbiao in re.findall(r"\[\d+\]", h3_v):
                                h3_v = h3_v.replace(shangbiao, "")
                            tmp3[h3_title] = [h3_v]
                            h3_dict.update(tmp3)
                        tmp2 = {h2_key: h3_dict}
                    h2_dict.update(tmp2)
                else:
                    h2_v = soup.get_text().replace(soup.find("h2").get_text(), "").replace("\xa0", '')
                    # strip citation superscripts like [1]
                    for shangbiao in re.findall(r"\[\d+\]", h2_v):
                        h2_v = h2_v.replace(shangbiao, "")

                    h2_v_list = []
                    for item in h2_v.split("\n"):
                        # drop empty lines and the "编辑" (edit) link text
                        if item and item != '编辑':
                            h2_v_list.append(item)

                    tmp = {h2_key: h2_v_list}
                    h2_dict.update(tmp)
        return h2_dict

    def get_relationship(self):
        """
        Get the person's relationships.
        """
        relationship = []
        soup = BeautifulSoup(self.response, "lxml")
        res_ship = soup.select(".info .name")
        res_value = soup.select(".info .title")
        for ship, value in zip(res_ship, res_value):
            relationship.append([self.name, ship.string, value.string])
        return relationship

    def get_person_details(self):
        """获取人物标签栏数据"""
        doc = pq(self.response)
        person_detail_key_doc_list = doc('.basic-info.cmn-clearfix dt').items()
        person_detail_key_list = []
        for key_doc in person_detail_key_doc_list:
            person_detail_key = key_doc.text().replace(' ','')
            person_detail_key_list.append(person_detail_key)
        person_detail_value_doc_list = doc('.basic-info.cmn-clearfix dd').items()
        person_detail_value_list = []
        for value_doc in person_detail_value_doc_list:
            person_detail_value = value_doc.text().replace(' ','')
            person_detail_value_list.append(person_detail_value)
        person_detail_dict = dict(zip(person_detail_key_list, person_detail_value_list))
        return person_detail_dict

    def get_name(self):
        """Get the person's name from the page's <h1>."""
        soup = BeautifulSoup(self.response, "lxml")
        try:
            name = soup.find("h1").text
        except AttributeError:
            name = ''
        return name

    def run(self):
        self.get_response()
        check_ambiguity_result = self.check_ambiguity()
        if check_ambiguity_result:
            # log ambiguous names to 有歧义.txt ("ambiguous") instead of scraping
            with open('有歧义.txt', 'a', encoding='utf8') as f:
                f.write(self.name + '\n')
        else:
            introduction = self.get_introduction()
            person_name = self.get_name()
            relationship = self.get_relationship()
            person_lifetime = self.get_person_lifetime()
            person_detail = self.get_person_details()
            person_information = dict()
            person_information['Introduction'] = introduction
            person_information['Rel'] = relationship
            person_information['Details'] = person_detail
            person_information.update(person_lifetime)
            with open(os.path.join(self.save_path, person_name + '.json'), 'w', encoding='utf8') as f:
                f.write(json.dumps(person_information, ensure_ascii=False))


if __name__ == '__main__':
    name = '裴寂'
    BaiDuPerson(name)
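
To collect many people in one run, a minimal sketch (names.txt is hypothetical, one name per line):

with open('names.txt', encoding='utf8') as f:
    for line in f:
        name = line.strip()
        if name:  # skip blank lines
            BaiDuPerson(name)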
Original post: https://www.cnblogs.com/lqn404/p/13827435.html