python 爬虫小说

思路:

1.获取第一章内容

2.判断请求方式

3.对URL存在回车进行处理

4.正则匹配

5.写入文件中

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/27 20:34
# @Author : Lhtester
# @Site : 
# @File : book.py
# @Software: PyCharm
import requests
import re
import time
import random
import sys

sys.setrecursionlimit(16000)#设置递归深度
class Book_deaill():
    def __init__(self):
        self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55418253.html'
        # self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55728792_2.html'
        self.headers ={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}


    def data_get(self,url=None):
        if url==None:
            url=self.url
        result = requests.get(url=url, headers=self.headers).text
        # print(result)
        title= re.findall(r'<div class="nr_title" id="nr_title">(.*?)</div>',result)#匹配章节名称
        print(title)
        with open('../image/book2.txt','a+',encoding='utf-8')as f:
            for i in title:
                f.write(i)
                f.write('
')
        print('write titie complete')
        text = re.findall(r"<p class='c_detail'>(.*?)</p>",result)#匹配正文
        with open('../image/book2.txt','a+',encoding='utf-8')as f:
            for n in text:
                n = n.replace("&nbsp;","")
                n= n.replace("阅书阁『wWw.xyshuge.Com』,全文免费阅读.","")#删除网站的自定义文字
                f.write('
')
                f.write(n)
        time.sleep(random.randint(1 , 5) )#随机休眠,避免被对方反爬检测到
        print('write text complete')

        self.start_analysis(result)


    def start_analysis(self,result):

        new_url = 'https://m.xyshuge.com/k3nl5'
        get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5
(.*?)">↓一页</a>',result)
        if len(get_next_page)==0:
            get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5
(.*?)">↓一章</a>',result)
            print('下一章:',get_next_page)
        if len(get_next_page) == 0:#最后一章再次判断
            print('爬虫结束')
        else:
            new_url = new_url+get_next_page[0]
            print(new_url)
            self.data_get(new_url)#地址拼接

    def start_get_data(self):
        print('start get data ')
        self.data_get()



if __name__=='__main__':
    data =Book_deaill()
    data.start_get_data()
原文地址:https://www.cnblogs.com/anhao-world/p/15196630.html