19、练习:使用 Selenium 与 PhantomJS 爬取天天基金部分数据

selenium 练习

# -*- coding: utf-8 -*-

from selenium import webdriver
from  lxml import etree
import json
import time

class Tiantian_spider():
    """Scrape the fund ranking table of fund.eastmoney.com page by page.

    Each page is loaded in a Selenium-driven PhantomJS session, parsed with
    lxml, and its rows are appended as JSON to ``天天基金.txt``.
    """

    def __init__(self):
        # Browser session used to load the site and click through its pager.
        self.driver = webdriver.PhantomJS()
        # HTML source of the page that parser_data() will parse next.
        self.html = None
        # True while run() should keep fetching pages.
        self.next_page = True

    # 1. Request the next page
    def parser_url(self):
        """Click the last pager label and store the refreshed page source.

        Does nothing when ``self.next_page`` is false.
        """
        if self.next_page:
            # Click the pager control to move to the next page.
            self.driver.find_element_by_xpath("//div[@id ='pagebar']/label[last()]").click()
            time.sleep(4)  # the page needs time to return the new data
            self.html = self.driver.page_source

    # 2. Parse the data
    def parser_data(self):
        """Extract the table rows and the pager label from ``self.html``.

        Rows with fewer than the 12 cells read below are skipped instead of
        raising ``IndexError``; a missing fund link yields ``None`` for the
        corresponding field.

        Returns:
            A ``(rows, next_page)`` tuple. ``rows`` is a list holding one
            dict per table row; ``next_page`` is the list of elements
            matched by the pager XPath, to be handed to ``over_page``.
        """
        rel = []
        html = etree.HTML(self.html)
        tr_list = html.xpath("//table[@id ='dbtable']//tbody/tr")
        next_page = html.xpath("//div[@id ='pagebar']//label[last()]")
        for tr in tr_list:
            tds = tr.xpath("./td")
            if len(tds) < 12:
                # Not a full data row (cells 1..11 are read below).
                continue
            code = tds[2].xpath("./a/text()")
            title = tds[3].xpath("./a/@title")
            dic = {}
            dic['序号'] = tds[1].text
            print(dic["序号"])
            dic["基金代码"] = code[0] if code else None
            dic["基金简称"] = title[0] if title else None
            dic["日期"] = tds[4].text
            # NOTE(review): this key was probably meant to be "单位净值";
            # kept unchanged so existing output files stay compatible.
            dic["单位净增"] = tds[5].text
            dic["累计净值"] = tds[6].text
            dic["日增长率"] = tds[7].text
            dic["近一周"] = tds[8].text
            dic["近1月"] = tds[9].text
            dic["近3月"] = tds[10].text
            dic["近6月"] = tds[11].text
            rel.append(dic)
        return rel, next_page

    # Save the data
    def save_data(self, data):
        """Append ``data`` as indented JSON to ``天天基金.txt``.

        The file is opened in append mode, so it ends up holding one JSON
        array per scraped page. A trailing newline keeps consecutive arrays
        from running together.
        """
        with open("天天基金.txt", "a", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
            f.write("\n")

    # Paging controller
    def over_page(self, next_page):
        """Tell whether another page should be fetched.

        Args:
            next_page: list of pager label elements from ``parser_data``.

        Returns:
            False when no pager label was found or when the first label's
            ``class`` attribute contains ``"end"``; True otherwise.
        """
        if not next_page:
            # No pager on the page: nothing left to click.
            return False
        # The previous XPath "./label[...]" searched the label's children,
        # which never matched, so paging never stopped. Inspect the label's
        # own class attribute instead.
        # NOTE(review): assumes the site marks the exhausted pager control
        # with a class containing "end" (taken from the original XPath).
        css_class = next_page[0].get("class") or ""
        return "end" not in css_class

    def run(self, url):
        """Load ``url`` and scrape every page until the pager is exhausted.

        The browser is closed even when a step raises.
        """
        try:
            # 1. Send the request and grab the first page.
            self.driver.get(url)
            # NOTE(review): unlike parser_url(), no wait happens here before
            # reading the source; confirm the table is present by then.
            self.html = self.driver.page_source
            while self.next_page:
                # 2. Parse the current page.
                data, next_page = self.parser_data()
                # 3. Save the parsed rows.
                self.save_data(data)
                # 4. Decide whether to continue, then move to the next page.
                self.next_page = self.over_page(next_page)
                self.parser_url()
        finally:
            self.driver.quit()


if __name__ == "__main__":
    # Script entry point: crawl the fund ranking page addressed below.
    start_url = "http://fund.eastmoney.com/data/fundranking.html#tall;c0;r;szzf;pn50;ddesc;qsd20200106;qed20210106;qdii;zq;gg;gzbd;gzfs;bbzt;sfbb"
    Tiantian_spider().run(start_url)




原文地址:https://www.cnblogs.com/hefany/p/14245212.html