搜韵网诗词采集

采集搜韵网诗词时,网站会封 IP,建议使用代理进行采集。

import pymongo
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin


class Poetry:
    """Crawler that stores poems from sou-yun.cn in a MongoDB collection.

    The crawl walks three levels: the dynasty index page, each dynasty's
    author list, and each author's paginated poem list. Every poem is
    inserted as one document into the ``Poetry`` collection of the ``Poetry``
    database on ``localhost:27017``.

    Note: constructing an instance starts the whole crawl immediately,
    because ``__init__`` calls :meth:`run`.
    """

    def __init__(self, proxies=None, timeout=30):
        """Set up endpoints, headers and the MongoDB collection, then crawl.

        Args:
            proxies: Optional ``requests``-style proxy mapping, e.g.
                ``{"https": "http://host:port"}``. ``None`` (the default)
                sends requests without an explicit proxy setting.
            timeout: Seconds passed to ``requests`` as the per-request
                timeout, so a stalled connection cannot hang the crawl.
        """
        self.start_url = 'https://sou-yun.cn/PoemIndex.aspx'
        self.comment_url = 'https://api.sou-yun.cn/api/Poem?jsonType=true&includeLinks=true&key={}'
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        self.proxies = proxies
        self.timeout = timeout
        # Despite its name, this attribute is the target collection
        # (database ``Poetry``, collection ``Poetry``), not the MongoClient.
        self.client = pymongo.MongoClient(host='localhost', port=27017).Poetry['Poetry']
        # Maps dynasty name -> absolute URL of that dynasty's author list.
        self.dynasty_dict = dict()
        self.run()

    def get_response(self, url):
        """Fetch ``url`` with the crawler's headers, proxies and timeout.

        All HTTP traffic goes through this single method so that request
        options such as proxies only have to be configured in one place.

        Returns:
            The ``requests`` response object, unmodified.
        """
        return requests.get(
            url=url,
            headers=self.headers,
            proxies=self.proxies,
            timeout=self.timeout,
        )

    def _get_doc(self, url):
        """Download ``url`` and parse the UTF-8 decoded body with PyQuery."""
        return pq(self.get_response(url).content.decode('utf8'))

    def _iter_links(self, doc):
        """Yield ``(link_text, absolute_url)`` for each ``.inline1`` entry."""
        for entry in doc('.inline1').items():
            anchor = entry('a')
            href = anchor.attr('href')
            if not href:
                # urljoin() returns the base URL when given an empty target,
                # which would make the index page look like a real link.
                continue
            yield anchor.text(), urljoin(self.start_url, href)

    def get_dynasty_content(self):
        """Fill ``self.dynasty_dict`` with the dynasty links of the index page."""
        doc = self._get_doc(self.start_url)
        for dynasty_name, dynasty_url in self._iter_links(doc):
            self.dynasty_dict[dynasty_name] = dynasty_url

    @staticmethod
    def _format_comments(comments):
        """Render comment records as text, one ``Book``/``Content`` pair each.

        Each record contributes its ``Book`` value, a newline, its
        ``Content`` with ``<br />`` tags turned into newlines, and a
        trailing newline. An empty sequence yields ``''``.
        """
        return ''.join(
            comment['Book'] + '\n' + comment['Content'].replace('<br />', '\n') + '\n'
            for comment in comments
        )

    def _fetch_comment(self, poetry_id):
        """Return the formatted commentary for ``poetry_id``, or ``''``.

        The payload layout (``ShiData[0]['Comments']``) is taken from the
        keys this crawler has always read; missing or empty parts are
        treated as "no commentary" instead of aborting the crawl.
        """
        payload = self.get_response(self.comment_url.format(poetry_id)).json()
        shi_data = payload.get('ShiData') or []
        if not shi_data:
            return ''
        return self._format_comments(shi_data[0].get('Comments') or [])

    def _parse_poem(self, dynasty, person_name, poetry_doc):
        """Build the MongoDB document for one ``._poem`` element."""
        # The element id looks like ``poem_<key>``; the key addresses the
        # commentary API. A missing id simply means no commentary lookup.
        poetry_id = (poetry_doc.attr('id') or '').replace('poem_', '')
        poetry_comment = ''
        if poetry_id and poetry_doc('.poemComment'):
            poetry_comment = self._fetch_comment(poetry_id)
        return {
            '朝代': dynasty,
            '作者': person_name,
            '标题': poetry_doc('.poemCommentLink').text(),
            '标题注释': poetry_doc('.titleComment').text(),
            '类型': poetry_doc('.titleIndent').text(),
            '内容': poetry_doc('.poemSentence').text().strip(),
            '评注': poetry_comment,
            '注释': poetry_doc('.poemNote').text(),
        }

    def _next_page_url(self, doc):
        """Return the absolute URL of the "next page" link, or ``''``.

        If several links in the pager contain the marker text, the last
        one wins.
        """
        next_page_url = ''
        for link in doc('#content>div:last-child a').items():
            href = link.attr('href')
            if href and '下一页' in link.text():
                next_page_url = urljoin(self.start_url, href)
        return next_page_url

    def get_poetry(self, dynasty, person_name, person_url):
        """Store every poem of one author, following pagination.

        Args:
            dynasty: Dynasty name recorded on each stored poem.
            person_name: Author name recorded on each stored poem.
            person_url: URL of the first page of the author's poem list.
        """
        # Pages are walked in a loop rather than by recursion so that long
        # listings cannot exhaust the call stack; ``visited`` stops the walk
        # if a "next page" link ever points back to a page already seen.
        visited = set()
        page_url = person_url
        while page_url and page_url not in visited:
            visited.add(page_url)
            doc = self._get_doc(page_url)
            for poetry_doc in doc('._poem').items():
                self.client.insert_one(
                    self._parse_poem(dynasty, person_name, poetry_doc)
                )
            page_url = self._next_page_url(doc)

    def get_person_content(self):
        """Crawl the author list of every known dynasty and store the poems."""
        for dynasty, dynasty_url in self.dynasty_dict.items():
            doc = self._get_doc(dynasty_url)
            for person_name, person_url in self._iter_links(doc):
                self.get_poetry(dynasty, person_name, person_url)

    def run(self):
        """Run the full crawl: collect dynasties, then authors and poems."""
        self.get_dynasty_content()
        self.get_person_content()


# Script entry point: instantiating Poetry is enough to start the crawl,
# because its constructor calls run().
if __name__ == "__main__":
    Poetry()

注:以上代码未配置代理,采集时请自行添加。

原文地址:https://www.cnblogs.com/lqn404/p/13840385.html