Web Crawling - Part 3

XPath contains()
- `//div[contains(@class,'i')]`

The general recipe for writing a crawler
  - Prepare the URLs (see the sketch after this list)
    - Prepare a start_url
      - The URL pattern is not obvious, or the total number of pages is unknown
      - Extract the next page's URL in code
        - xpath
        - Look for a URL whose parameters appear in the current response (e.g. the current page number and the total page count are in the current response)
    - Prepare a url_list
      - The total number of pages is known
      - The URL pattern is obvious
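
A minimal sketch of both URL-preparation patterns; the template and the XPath follow the Qiushibaike and Tieba examples later in this post.

# Sketch: the two ways of preparing URLs
from lxml import etree

url_temp = "https://www.qiushibaike.com/8hr/page/{}/"

# page count known and URL pattern obvious: build url_list up front
url_list = [url_temp.format(i) for i in range(1, 14)]

# page count unknown: extract the next page's URL from the current response
def get_next_url(html_str, part_url=""):
    html = etree.HTML(html_str)
    hrefs = html.xpath("//a[text()='下一页']/@href")
    return part_url + hrefs[0] if len(hrefs) > 0 else None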

  - Send requests and get responses (see the sketch after this list)
    - Add a random User-Agent to counter anti-crawler measures
    - Add random proxy IPs to counter anti-crawler measures
    - Once the target site has decided we are a crawler, add more header fields, including cookies
    - Cookie handling can be done with a session
    - Prepare a pool of working cookies
      - Without logging in
        - Collect cookies from the requests that initially succeed against the site, i.e. the cookies the site sets in the response
        - On the next requests, use cookies from this list
      - With logging in
        - Prepare several accounts
        - Use code to obtain each account's cookies
        - When requesting pages that require login, pick one of those cookies at random
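
A minimal sketch of the ideas above: random User-Agent, random proxy, a session that carries cookies, and a cookie pool. Every User-Agent string, proxy address and cookie value here is a placeholder.

# Sketch: random User-Agent, random proxy, session cookies and a cookie pool (all values are placeholders)
import random
import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",              # placeholder UA strings
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) ...",
]
PROXIES = [{"http": "http://1.2.3.4:8888"}, {"http": "http://5.6.7.8:8888"}]   # placeholder proxies
COOKIE_POOL = [{"SESSIONID": "aaa"}, {"SESSIONID": "bbb"}]   # e.g. collected by logging in with several accounts

session = requests.session()  # a session stores the cookies set in responses and sends them back automatically

def fetch(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return session.get(url, headers=headers, proxies=random.choice(PROXIES), timeout=10).content.decode()

def fetch_logged_in(url):
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, cookies=random.choice(COOKIE_POOL), timeout=10).content.decode()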

  - Extract the data
    - Locate the data
      - If the data is in the response of the current URL
        - Extracting list-page data
          - Request the list page's URL directly; there is no need to enter the detail pages
        - Extracting detail-page data
          - 1. Determine the URL
          - 2. Send the request
          - 3. Extract the data
          - 4. Return

      - If the data is not in the response of the current URL
        - Look for it in the other responses
          - 1. Go through the Network panel from top to bottom
          - 2. Use Chrome's filter buttons to select the request types other than js, css and img
          - 3. Use Chrome's "search all files" to search for the numbers and English strings you are after
  - Extracting the data itself (see the sketch after this list)
    - xpath: extract whole blocks from the HTML; group first, then extract from each group
    - re: extract things like max_time, price, or a JSON string embedded in the HTML
    - json
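
A minimal sketch of pulling an embedded JSON string out of an HTML response with re and parsing it with json; the sample response and the pattern are made up for illustration.

# Sketch: extract a JSON string embedded in the HTML with re, then parse it with json (made-up response)
import re
import json

html_str = '<script>var data = {"max_time": "20180315", "price": 12.5};</script>'
json_str = re.findall(r"var data = (.*?);", html_str)[0]
data = json.loads(json_str)
print(data["max_time"], data["price"])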

  - Save (see the sketch after this list)
    - Save locally: text, json, csv
    - Save to a database
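
A minimal sketch of the local-saving options, assuming a content_list of dicts with illustrative field names.

# Sketch: save the extracted items locally as JSON lines and as CSV (field names are illustrative)
import csv
import json

content_list = [{"title": "t1", "href": "http://example.com/1"}]

with open("result.jsonl", "a", encoding="utf-8") as f:
    for item in content_list:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("result.csv", "a", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "href"])
    writer.writeheader()
    writer.writerows(content_list)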

# coding=utf-8
import requests
from lxml import etree
import json

class TiebaSpider:
    def __init__(self,tieba_name):
        self.tieba_name = tieba_name
        self.start_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw="+tieba_name+"&pn=0"
        self.part_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/"
        self.headers= {"User-Agent":"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}

    def parse_url(self,url):  # send a request and get the response
        print(url)
        response = requests.get(url,headers=self.headers)
        return response.content

    def get_content_list(self,html_str):  # extract the data
        html = etree.HTML(html_str)

        div_list = html.xpath("//div[contains(@class,'i')]") # group by div
        content_list = []
        for div in div_list:
            item = {}
            item["title"] = div.xpath("./a/text()")[0] if len(div.xpath("./a/text()"))>0 else None
            item["href"] = self.part_url+div.xpath("./a/@href")[0] if len(div.xpath("./a/@href"))>0 else None
            item["img_list"] = self.get_img_list(item["href"],[])
            itemp["img_list"] = [requests.utils.unquote(i).split("src=")[-1] for i in item["img_list"]]
            content_list.append(item)
        # extract the next page's URL
        next_url = self.part_url+html.xpath("//a[text()='下一页']/@href")[0] if len(html.xpath("//a[text()='下一页']/@href"))>0 else None
        return content_list,next_url

    def get_img_list(self,detail_url,total_img_list): # get all the images in a thread
        #3.2 request the thread URL to get the first detail page
        detail_html_str = self.parse_url(detail_url)
        detail_html = etree.HTML(detail_html_str)
        #3.3 extract the images on the first detail page and the next-page URL
        img_list = detail_html.xpath("//img[@class='BDE_Image']/@src")
        total_img_list.extend(img_list)
        #3.4 request the detail page's next-page URL and loop through 3.2-3.4
        detail_next_url = detail_html.xpath("//a[text()='下一页']/@href")

        if len(detail_next_url)>0:
            detail_next_url =  self.part_url + detail_next_url[0]
            return self.get_img_list(detail_next_url,total_img_list)
        # else:
        #     return total_img_list
        return total_img_list

    def save_content_list(self,content_list): # save the data
        file_path = self.tieba_name+".txt"
        with open(file_path,"a",encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content,ensure_ascii=False,indent=2))
                f.write("
")
        print("保存成功")

    def run(self):  # main logic
        next_url = self.start_url
        while next_url is not None:
            #1. start_url
            #2. send the request and get the response
            html_str = self.parse_url(next_url)
            #3. extract the data and the next page's URL
                #3.1 extract the thread URLs and titles from the list page
                #3.2 request the thread URL to get the first detail page
                #3.3 extract the images on the first detail page and the next-page URL
                #3.4 request the detail page's next-page URL and loop through 3.2-3.4
            content_list,next_url = self.get_content_list(html_str)
            #4. save the data
            self.save_content_list(content_list)
            #5. request the next page's URL and loop back through steps 2-5

if __name__ == '__main__':
    tieba_spider = TiebaSpider("做头发")
    tieba_spider.run()
# coding=utf-8
import requests
from lxml import etree

class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"

        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    def get_url_list(self):
        return [self.url_temp.format(i) for i in range(1,14)]

    def parse_url(self,url):
        print(url)
        response = requests.get(url,headers=self.headers)
        return response.content.decode()

    def get_content_list(self,html_str): # extract the data
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # group by div
        content_list = []
        for div in div_list:
            item= {}
            item["content"] = div.xpath(".//div[@class='content']/span/text()")
            item["content"] = [i.replace("
","") for i in item["content"]]
            item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
            item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
            item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
            item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
            item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
            item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
            item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
            item["author_img"] = "https:"+item["author_img"][0] if len(item["author_img"])>0 else None
            item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
            content_list.append(item)
        return content_list

    def save_content_list(self,content_list): # save
        for i in content_list:
            print(i)

    def run(self): # main logic
        #1. url_list
        url_list = self.get_url_list()
        #2. iterate: send requests and get responses
        for url in url_list:
            html_str = self.parse_url(url)
            #3. extract the data
            content_list = self.get_content_list(html_str)
            #4. save
            self.save_content_list(content_list)

if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue

class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

        self.url_queue = Queue()
        self.html_queue  = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        # return [self.url_temp.format(i) for i in range(1,14)]
        for i in range(1,4):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        while True:
            url = self.url_queue.get()

            print(url)
            response = requests.get(url,headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self): # extract the data
        while True:
            html_str = self.html_queue.get()

            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # group by div
            content_list = []
            for div in div_list:
                item= {}
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = [i.replace("
","") for i in item["content"]]
                item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
                item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
                item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
                item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:"+item["author_img"][0] if len(item["author_img"])>0 else None
                item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self): # save
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                # print(i)
                pass
            self.content_queue.task_done()

    def run(self): # main logic
        thread_list = []
        #1.url_list
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        #2. iterate: send requests and get responses
        for i in range(20):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        #3. extract the data
        for i in range(2):
            t_html = threading.Thread(target=self.get_content_list)
            thread_list.append(t_html)
        #4. save
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True) # make each worker a daemon thread: it is not essential, so it ends when the main thread ends
            t.start()

        for q in [self.url_queue,self.html_queue,self.content_queue]:
            q.join() # block the main thread until every task in this queue is done

        print("主线程结束")
'''
The essence of the multithreaded version:
    1. Make the worker threads depend on the main thread by setting t.setDaemon(True) on each of them;
       then make the main thread depend on the tasks in the queues via q.join()
    2. Pass the data between the stages through the queues
'''

if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
import os
import re

from lxml import etree

from retrying import retry
import requests

class Tieba():
    def __init__(self):
        self.start_url = "https://tieba.baidu.com/f?ie=utf-8&kw=DOTA2&fr=search"
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"}

    @retry(stop_max_attempt_number = 3)
    def _parse_url(self, url):
        response = requests.get(url, headers = self.headers, timeout=8)
        assert response.status_code == 200
        try:
            html_str = response.content.decode()
        except:
            html_str =  response.text
        return html_str

    def parse_url(self, url):
        try:
            html_str = self._parse_url(url)
        except Exception as e:
            print(e)
            html_str = None
        return html_str

    def analysis(self, html_str):
        html_str = re.sub(r'<!--|-->', "", html_str)
        nodes = etree.HTML(html_str)
        title = nodes.xpath('//ul//a[@rel="noreferrer" and @class="j_th_tit "]')
        ret = []  # this list holds dicts of the form { title: href }
        for t in title:
            temp_dict = dict()
            title_text = t.xpath("@title")[0]
            href = t.xpath("@href")[0] if len(t.xpath("@href")) > 0 else None
            temp_dict[title_text] = href  # get_pic() expects {title: href}; the images are fetched there
            ret.append(temp_dict)
        return ret

    def get_pic(self, item_list):
        for item in item_list:
            if list(item.values())[0] is None:
                continue
            url = "https://tieba.baidu.com{}".format(list(item.values())[0])
            try:
                html_str = self.parse_url(url)
                html_str = re.sub(r'<!--|-->', "", html_str)
                ret = etree.HTML(html_str)
                pic_list = ret.xpath("//img[@class='BDE_Image']/@src")
            except Exception as e:
                print(e)
                continue  # skip this thread if the request or parsing fails
            os.mkdir(list(item.keys())[0])  # one directory per thread, named after its title
            for pic in pic_list:
                if pic.startswith("//"):  # some tieba image URLs start with //, so prefix them with http:
                    pic_url = "http:" + pic
                else:
                    pic_url = pic
                pic_str = requests.get(pic_url, headers=self.headers).content
                if pic_str is not None:
                    pic_rex = re.search(r"(.jpg|.png|.gif)", pic_url)  # get the image format: jpg, png or gif
                    if pic_rex is not None:
                        pic_style = pic_rex.group(0)
                    else:
                        pic_style = ".jpg"
                    with open(list(item.keys())[0] + "/" + str(pic_list.index(pic)) + pic_style, "wb") as f:
                        f.write(pic_str)

    def get_next(self, html_str):
        ret = etree.HTML(html_str)
        next_page = ret.xpath('//a[text()="下一页>"]/@href')
        print(next_page)
        return next_page

    def run(self):
        html_str = self.parse_url(self.start_url)
        if html_str is not None:
            # html_str = re.sub(r'<!--|-->', "", html_str)
            item_list = self.analysis(html_str)
            self.get_pic(item_list)
            ret = self.get_next(html_str)
            while len(ret)>0:
                ret[0] = "http:"+ ret[0]
                html_str = self.parse_url(ret[0])
                # html_str = re.sub(r'<!--|-->', "", html_str)
                item_list = self.analysis(html_str)
                self.get_pic(item_list)
                ret = self.get_next(html_str)



if __name__ == '__main__':
    tieba = Tieba()
    tieba.run()

Captcha recognition
  - URL unchanged, captcha unchanged
    - Request the captcha's URL, get the response, recognize it

  - URL unchanged, captcha changes
    - Idea: when the server returns a captcha, it associates that captcha with the user's information; later, when the user sends the post request, the server checks whether the captcha sent in the post data matches the one it stored server-side for that user

    - 1. Instantiate a session
    - 2. Use the session to request the login page and get the captcha's URL
    - 3. Use the session to request the captcha and recognize it
    - 4. Use the session to send the post request (a minimal sketch follows)
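
A minimal sketch of steps 1-4 with a session; the login URL, form field names and captcha XPath are placeholders, and indetify is the yundama helper used in the selenium example further down.

# Sketch of steps 1-4: one session ties the captcha to our cookies (URL, field names and XPath are placeholders)
import requests
from lxml import etree
from yundama.dama import indetify  # the captcha-recognition helper used in the selenium example below

session = requests.session()                                               # 1. instantiate a session
login_html = session.get("https://example.com/login").content.decode()    # 2. request the login page with the session
captcha_url = etree.HTML(login_html).xpath("//img[@id='captcha_image']/@src")[0]
captcha_code = indetify(session.get(captcha_url).content)                 # 3. request the captcha with the session and recognize it
post_data = {"username": "xxx", "password": "xxx", "captcha": captcha_code}
session.post("https://example.com/login", data=post_data)                 # 4. send the post request with the same session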

  - Logging in with selenium and hitting a captcha
    - URL unchanged, captcha unchanged: same as above
    - URL unchanged, captcha changes
      - 1. Request the login page with selenium, which also gives the captcha's URL
      - 2. Take the cookies from the driver on the login page, hand them to the requests module to fetch the captcha, and recognize it
      - 3. Enter the captcha and click login

  Notes on using selenium
    - Getting text and attributes
      - Locate the element first, then call `.text` or `get_attribute` to read it
    - The page data selenium returns is the content of the browser's Elements panel
    - The difference between find_element and find_elements
      - find_element returns a single element and raises an error if there is none
      - find_elements returns a list, which is empty if there is none
      - To check whether there is a next page, use find_elements and test the length of the returned list (see the sketch after these notes)
    - If the page contains an iframe or frame, call driver.switch_to.frame to switch into it before locating elements inside it

    - When selenium requests the first page it waits until the page has loaded before the data is grabbed, but after clicking to the next page the data is grabbed immediately; this can raise errors because the data has not loaded yet, so add time.sleep(3)
    - selenium's find_element_by_class_name accepts only a single class value, not several
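
A minimal sketch of the find_elements check for a next page, reusing the bilibili URL and button XPath from the example at the end of this post.

# Sketch: use find_elements to decide whether a next page exists (URL and XPath reuse the bilibili example below)
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.bilibili.com/v/kichiku/mad/#/all/stow")

while True:
    # ... extract the data on the current page here ...
    next_btn = driver.find_elements_by_xpath("//button[@class='nav-btn iconfont icon-arrowdown3']")
    if len(next_btn) == 0:   # empty list: no next page
        break
    next_btn[0].click()
    time.sleep(3)            # wait for the next page's data to load before extracting again

driver.quit()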

# coding=utf-8
from selenium import webdriver
import time

# instantiate a browser
driver = webdriver.Chrome()
# driver = webdriver.PhantomJS()

# set the window size
# driver.set_window_size(1920,1080)

# maximize the window
driver.maximize_window()

# send a request
driver.get("http://www.baidu.com")

# take a screenshot of the page
driver.save_screenshot("./baidu.png")

# element-locating methods
driver.find_element_by_id("kw").send_keys("python")
driver.find_element_by_id("su").click()

# get the HTML string from the driver
# print(driver.page_source) # the content of the browser's Elements panel

print(driver.current_url)

# get cookies from the driver
# cookies = driver.get_cookies()
# print(cookies)
# print("*"*100)
# cookies = {i["name"]:i["value"] for i in cookies}
# print(cookies)

# quit the browser
time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver
import time
import requests
from yundama.dama import indetify

# instantiate the driver
driver = webdriver.Chrome()
driver.get("https://www.douban.com/")

driver.find_element_by_id("form_email").send_keys("784542623@qq.com")
driver.find_element_by_id("form_password").send_keys("zhoudawei123")

# recognize the captcha
captcha_image_url = driver.find_element_by_id("captcha_image").get_attribute("src")
captcha_content = requests.get(captcha_image_url).content
captcha_code = indetify(captcha_content)
print("验证码的识别结果为:",captcha_code)

# enter the captcha
driver.find_element_by_id("captcha_field").send_keys(captcha_code)

driver.find_element_by_class_name("bn-submit").click()


# get the cookies
cookies = {i["name"]:i["value"] for i in driver.get_cookies()}
print(cookies)

time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver

driver = webdriver.Chrome()

# driver.get("http://neihanshequ.com/")
driver.get("https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python&rn=&oq=&rsv_pq=87739988000939bf&rsv_t=b194dxdCny6hrJFXQrh4D6bavkKZwfpeT4s7j7V6AvGfiiAvTgxqGAvWbCM&rqlang=cn")

# ret1 = driver.find_elements_by_xpath("//ul[@id='detail-list']/li")
# # print(ret1)
# # print(ret1)
# for li in ret1:
#     print(li.find_element_by_xpath(".//h1/p").text)
#     print(li.find_element_by_xpath(".//a[@class='image share_url1']").get_attribute("href"))

#find_element_by_link_text
print(driver.find_element_by_link_text("下一页>").get_attribute("href"))
#find_element_by_partial_link_text: matches the <a> tag whose text contains "下一页"
print(driver.find_element_by_partial_link_text("下一页").get_attribute("href"))


driver.quit()
# coding=utf-8
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")

# switch into the iframe
driver.switch_to.frame("login_frame")

driver.find_element_by_id("u").send_keys("12312312312")


time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver
import time
driver = webdriver.Chrome()

driver.get("https://www.bilibili.com/v/kichiku/mad/#/all/stow")

print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)

# go to the next page
driver.find_element_by_xpath("//button[@class='nav-btn iconfont icon-arrowdown3']").click()

time.sleep(3)
print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)

driver.quit()
Source: https://www.cnblogs.com/MR-allen/p/10584222.html