# Babbitt (巴比特, 8btc.com) website scraper

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import json
class BtcSpider(object):
    """Scrape thread titles and links from the 8btc.com forum listing pages.

    Typical use is ``BtcSpider().run()``, which downloads a range of listing
    pages, collects one record per thread link and writes all records to
    ``05btc.json`` in the current working directory.
    """

    def __init__(self):
        # Common prefix of every listing-page URL; ``run`` appends the page
        # number and ``.html``.
        self.base_url = 'https://8btc.com/forum-61-'
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/63.0.3239.26 Safari/537.36 '
                'Core/1.63.6823.400 QQBrowser/10.3.3117.400'
            )
        }
        # Accumulates one ``{'name': title, 'url': href}`` dict per thread.
        self.data_list = []

    def get_response(self, url, timeout=10):
        """Download ``url`` and return its body decoded as GBK text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds ``requests`` waits on the server. Without a
                timeout a stalled connection would block the crawl forever.

        Returns:
            The response body as ``str``.

        Raises:
            requests.RequestException: Propagated from ``requests.get`` on
                network failure or timeout.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # The original author noted that the page's <head> meta charset is
        # gbk. Undecodable bytes are replaced instead of raising so that one
        # malformed byte does not abort the whole crawl.
        return response.content.decode('gbk', errors='replace')

    def parse_data(self, data):
        """Extract thread titles and links from one listing page.

        Each ``<a class="s xst">`` element contributes one
        ``{'name': title, 'url': href}`` dict appended to ``self.data_list``.

        Args:
            data: HTML text of a listing page.
        """
        x_data = etree.HTML(data)
        if x_data is None:
            # NOTE(review): lxml appears to return None for some empty
            # documents -- confirm; there is nothing to extract either way.
            return

        # Read the title and the href from the SAME anchor element. Running
        # two independent XPath queries and pairing results by index
        # misaligns (or raises IndexError) as soon as one anchor has no text
        # node, several text nodes, or no href.
        for anchor in x_data.xpath('//a[@class="s xst"]'):
            self.data_list.append({
                'name': ''.join(anchor.xpath('text()')),
                'url': anchor.get('href'),
            })

    def save_data(self):
        """Write all collected records to ``05btc.json`` as a JSON array."""
        # Serialise before opening the file so a serialisation error cannot
        # leave a truncated file behind. ``ensure_ascii=False`` keeps Chinese
        # titles human-readable, which requires an explicit UTF-8 encoding
        # rather than the platform default.
        data_str = json.dumps(self.data_list, ensure_ascii=False)
        with open('05btc.json', 'w', encoding='utf-8') as f:
            f.write(data_str)

    def run(self, first_page=1, last_page=9):
        """Crawl listing pages ``first_page``..``last_page`` and save them.

        Args:
            first_page: Number of the first listing page to fetch.
            last_page: Number of the last listing page to fetch (inclusive).
                The defaults reproduce the original fixed range of pages 1-9.
        """
        for page in range(first_page, last_page + 1):
            # Build the full page URL, e.g. ``.../forum-61-3.html``.
            url = self.base_url + str(page) + '.html'
            self.parse_data(self.get_response(url))
        self.save_data()

# Script entry point: build a spider and run the whole crawl.
spider = BtcSpider()
spider.run()
# Original article: https://www.cnblogs.com/HomeG/p/10527164.html