A simple Python crawler

"""
Request URL: https://maoyan.com/board/4
Second page: https://maoyan.com/board/4?offset=10

"""
import requests
import re


class MySpider:

    def __init__(self, base_url, headers):
        self.base_url = base_url
        self.headers = headers
    
    # Fetch one page of HTML
    def get_data(self, start_num):
        url = self.base_url.format(start_num)
        response = requests.get(url=url, headers=self.headers)
        # Check the status code before returning the body
        if response.status_code == 200:
            return response.content.decode('utf-8')
        else:
            return None
    # Parse one page of HTML, returning a list of (rank, title) tuples
    def parse_onepage(self, html):
        pattern = re.compile(r'<dd>.*?board-index.*?>(\d+).*?movie-item-info.*?>.*?<a.*?title="(.*?)".*?>.*?</dd>', re.S)
        result = re.findall(pattern, html)
        return result
    # Save the parsed data, one "rank title" line per movie
    def save_data(self, data):
        with open('./movestr.txt', 'a', encoding='utf-8') as f:
            for value in data:
                # Join the tuple fields into a single space-separated line
                movestr = " ".join(value) + '\n'
                f.write(movestr)
    
    


if __name__ == "__main__":
    
    # URL template; the offset query parameter selects the page
    base_url = "https://maoyan.com/board/4?offset={}"

    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    
    my_spider = MySpider(base_url, headers)
    html = my_spider.get_data(0)
    if html:
        value = my_spider.parse_onepage(html)
        my_spider.save_data(value)
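
As the docstring notes, each page of the board is selected with the offset query parameter (10 movies per page). A minimal sketch of crawling all ten pages of the Top 100, assuming every offset page has the same layout and with import time added next to the other imports, would replace the single-page calls in the __main__ block with a loop like this:

    my_spider = MySpider(base_url, headers)
    # 100 movies, 10 per page, so the offsets are 0, 10, ..., 90
    for offset in range(0, 100, 10):
        html = my_spider.get_data(offset)
        if html:
            my_spider.save_data(my_spider.parse_onepage(html))
        time.sleep(1)  # pause between requests to avoid hammering the site

Note that maoyan.com may serve a verification page instead of the board when it detects automated requests; in that case the response is not useful HTML, so checking the result before parsing, as above, keeps the script from crashing.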
Original post: https://www.cnblogs.com/luweilehei/p/11342200.html