python3 selenium模拟登录斗鱼提取数据保存数据库

# coding=utf-8
from selenium import webdriver
import json
import time
import pymongo


class Douyu:
  """Scrape the Douyu live-room directory with Selenium and save it to MongoDB.

  A Chrome session is opened on the directory page, every listed room is
  turned into a dict, and each page of dicts is inserted into a MongoDB
  collection. The "next page" button is clicked until it can no longer
  be found.

  Attributes:
    driver: the Selenium Chrome WebDriver used for all page access.
    host, port: address of the MongoDB server.
    DBname: name used for both the database and the collection.
    page_wait: seconds to sleep before reading a page.
    client: the shared ``pymongo.MongoClient`` (``None`` until first save).
    post: the target collection (``None`` until first save).
  """

  # Locator strategy strings. These are the values of selenium's
  # ``By.XPATH`` and ``By.CLASS_NAME``; they are spelled out here so the
  # class needs no import beyond what the file already has.
  _BY_XPATH = "xpath"
  _BY_CLASS_NAME = "class name"

  def __init__(self, url="https://www.douyu.com/directory/all",
               host='127.0.0.1', port=27017, db_name='douyu', page_wait=3):
    """Start Chrome and request the directory page.

    Args:
      url: page to open first.
      host: MongoDB host.
      port: MongoDB port.
      db_name: name of the MongoDB database (also used as collection name).
      page_wait: seconds to sleep before each page is read.
    """
    self.host = host
    self.port = port
    self.DBname = db_name
    self.page_wait = page_wait
    # The MongoDB connection is opened lazily by save_content and reused.
    self.client = None
    self.post = None
    self.driver = webdriver.Chrome()
    # Request the directory (home) page.
    self.driver.get(url)

  def get_content(self):
    """Return the rooms listed on the current page.

    Returns:
      A list of dicts with the keys ``img``, ``title``, ``category``,
      ``name`` and ``watch_num``; empty when no room element is found.
    """
    # Fixed pause before reading the page.
    # NOTE(review): presumably this gives the room list time to render
    # after navigation or a "next page" click - confirm.
    time.sleep(self.page_wait)
    rooms = self.driver.find_elements(
      self._BY_XPATH, '//ul[@id="live-list-contentbox"]/li')

    contents = []
    # Walk the room list; every XPath below is relative to one <li>.
    for room in rooms:
      contents.append({
        # Room cover image
        'img': room.find_element(
          self._BY_XPATH, './a//img').get_attribute("src"),
        # Room title
        'title': room.find_element(
          self._BY_XPATH, './a').get_attribute("title"),
        # Room category
        'category': room.find_element(
          self._BY_XPATH, './a/div[@class="mes"]/div/span').text,
        # Streamer name
        'name': room.find_element(
          self._BY_XPATH, "./a/div[@class='mes']/p/span[1]").text,
        # Viewer count
        'watch_num': room.find_element(
          self._BY_XPATH, "./a/div[@class='mes']/p/span[2]").text,
      })
    return contents

  # Save to MongoDB
  def save_content(self, contents):
    """Insert one page of room dicts into MongoDB.

    The client and collection are created on the first call and reused
    afterwards instead of opening a new connection per page.

    Args:
      contents: list of room dicts as returned by ``get_content``.
    """
    # An empty page has nothing to store; skip the insert entirely.
    if not contents:
      return
    if self.post is None:
      # Create the MongoDB connection and select database / collection.
      self.client = pymongo.MongoClient(host=self.host, port=self.port)
      self.post = self.client[self.DBname][self.DBname]
    # insert_many replaces Collection.insert, which newer PyMongo dropped.
    self.post.insert_many(contents)

  # Alternative kept from the original for reference: save to a local file.
  # def save_content(self, contents):
    # with open("douyu.json", "a") as f:
      # for content in contents:
      # json.dump(content, f, ensure_ascii=False, indent=2)
      # f.write(', ')

  def close(self):
    """Quit the browser and close the MongoDB connection, if one is open."""
    try:
      self.driver.quit()
    finally:
      if self.client is not None:
        self.client.close()
        self.client = None
        self.post = None

  def run(self, max_pages=None):
    """Scrape and save page after page, then release all resources.

    Args:
      max_pages: optional upper bound on the number of pages to scrape;
        ``None`` (the default) means "until there is no next-page button".
    """
    try:
      pages_done = 0
      while max_pages is None or pages_done < max_pages:
        # Read the current page and store it.
        self.save_content(self.get_content())
        pages_done += 1
        # find_elements returns an empty list instead of raising when the
        # button is missing, so the loop ends cleanly on the last page.
        # NOTE(review): if the site keeps a disabled button that still
        # carries this class on the last page, pass max_pages - confirm.
        next_buttons = self.driver.find_elements(
          self._BY_CLASS_NAME, "shark-pager-next")
        if not next_buttons:
          break
        # Click the "next page" button.
        next_buttons[0].click()
    finally:
      self.close()

if __name__ == '__main__':
  # Script entry point: open the browser, then scrape and store every page.
  douyu = Douyu()
  douyu.run()

原文地址:https://www.cnblogs.com/x-pyue/p/7798899.html