# 正则匹配的爬虫 (a web scraper based on regular-expression matching)

import requests
import re
class Anjuke(object):
    """Fetch an Anjuke listing page and extract its house list with regexes.

    The workflow is: download the page (``send_request``), pull out the
    contents of the ``<ul id="houselist-mod-new" ...>`` element
    (``analysis_data``), strip markup from each fragment and append the
    remaining text to ``anjuke.text`` (``save_data``). ``run`` chains the
    three steps.
    """

    DEFAULT_URL = "https://beijing.anjuke.com/sale/huairou/o5/"

    def __init__(self, url=DEFAULT_URL, timeout=10):
        """Set up the request parameters and compile the regexes.

        Args:
            url: Page to download; defaults to the original hard-coded URL.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.timeout = timeout
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"}
        # re.S lets "." span newlines, so the whole multi-line <ul> body is captured.
        self.pattern = re.compile(
            r'<ul id="houselist-mod-new" class="houselist-mod houselist-mod-new">(.*?)</ul>',
            re.S,
        )
        # Removes tags, "&...;" entities and whitespace. The last alternative
        # must be "\s": a bare "s" would delete every letter "s" from the text.
        self.second_pattern = re.compile(r'<(.*?)>|&(.*?);|\s')

    def send_request(self):
        """Download ``self.url`` and return the body decoded as UTF-8 text.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # Fail loudly instead of parsing an error page as if it were a listing.
        response.raise_for_status()
        data = response.content.decode()
        print(data)
        return data

    def save_data(self, result_data):
        """Append the cleaned text of each fragment to ``anjuke.text``.

        Args:
            result_data: Iterable of HTML fragments (strings). Each one is
                stripped with ``self.second_pattern`` and written followed
                by a single space.
        """
        # Explicit UTF-8 so the output does not depend on the platform's
        # default locale encoding.
        with open('anjuke.text', 'a', encoding='utf-8') as f:
            f.writelines(
                self.second_pattern.sub('', fragment) + ' '
                for fragment in result_data
            )

    def analysis_data(self, data):
        """Return the inner HTML of every house-list ``<ul>`` found in *data*.

        Args:
            data: Page source as a string.

        Returns:
            A list of captured strings; empty when nothing matches.
        """
        return self.pattern.findall(data)

    def run(self):
        """Download the page, extract the house list, print it and save it."""
        data = self.send_request()
        result_list = self.analysis_data(data)
        print(result_list)
        self.save_data(result_list)

# Script entry point: build the scraper and execute one full
# download -> extract -> save cycle when the file is run directly.
if __name__ == "__main__":
    scraper = Anjuke()
    scraper.run()

























# Original post (原文地址): https://www.cnblogs.com/hanjian200ok/p/9463165.html