爬虫模块之selenium模块

一 模块的介绍

 selenium模块最开始是一个自动化测试的工具,驱动浏览器完全模拟浏览器自动测试。

# Each line below is an alternative: it starts one browser type and rebinds
# `browser` to it, so normally only one of them is used.
from selenium import webdriver  # WebDriver bindings used to drive a browser
browser=webdriver.Chrome()  # Google Chrome
browser=webdriver.Firefox()   # Mozilla Firefox
browser=webdriver.PhantomJS()  # PhantomJS, a browser without a visible window
browser=webdriver.Safari()  # Apple Safari
browser=webdriver.Edge()  # Microsoft Edge

二 下载安装

#安装:selenium+chromedriver
pip3 install selenium
下载chromedriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.29,并非2.9
国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.29/
最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads

#验证安装
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.Chrome() #弹出浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source

#注意:
selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
下载链接:https://github.com/mozilla/geckodriver/releases
View Code
#安装:selenium+phantomjs
pip3 install selenium
下载phantomjs,解压后把phantomjs.exe所在的bin目录放到环境变量
下载链接:http://phantomjs.org/download.html

#验证安装
C:\Users\Administrator>phantomjs
phantomjs> console.log('egon gaga')
egon gaga
undefined
phantomjs> ^C
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.PhantomJS() #无界面浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source
View Code

三 基本使用

 ActionChains:动作链,用于实现拖动等鼠标操作。

 expected_conditions:等待条件,配合WebDriverWait设置等待某个元素加载的条件

 find_element_by_id:id查找的方式。

 send_keys:发送查找的关键字

 click:点击事件

 current_url:获取正在驱动的url

 get_cookies:获取cookies信息

 page_source:页面源代码

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

# Basic usage: search Baidu for a keyword, wait for the result list and dump
# what the session sees (page source, current URL, cookies).
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # Type the query into the search box (id "kw") and submit it with Enter.
    # On Python 2 a non-ASCII literal needs the u'' prefix to be sent correctly.
    search_box = browser.find_element_by_id('kw')
    search_box.send_keys('美女')
    search_box.send_keys(Keys.ENTER)

    # Block for at most 10 seconds until the element with id "content_left"
    # is present in the page.
    results_present = EC.presence_of_element_located((By.ID, 'content_left'))
    WebDriverWait(browser, 10).until(results_present)

    page_html = browser.page_source
    print(page_html)
    address = browser.current_url
    print(address)
    cookie_jar = browser.get_cookies()
    print(cookie_jar)

finally:
    # Always close the window, even when one of the steps above failed.
    browser.close()
View Code

 四 选择器

 基本选择器查找: 

 find_element_by_id:根据ID查找

 find_element_by_link_text:通过链接(a标签)的文本查找

 find_element_by_partial_link_text:根据某些文本模糊查找到第一个内容

 find_element_by_class_name:通过class查找

 find_element_by_name:通过name属性查找

  补充:

  presence_of_all_elements_located:相对应的所有元素加载完毕过后

  presence_of_element_located:查找到第一个加载完毕后

  element_to_be_clickable:等待可以点击过后。

  By.CLASS_NAME:class查找的方式

  get_attribute:访问标签的属性

  text:访问文本

  tag_name:访问标签名

# from selenium import webdriver
# from selenium.webdriver import ActionChains
# from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys #键盘按键操作
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
# import time
#
# try:
#     '''
#     find_element_by_id
#     find_element_by_name
#     find_element_by_link_text
#     find_element_by_partial_link_text
#     find_element_by_tag_name
#     find_element_by_class_name
#
#     find_element_by_css_selector
#     find_element_by_xpath
#     '''
#     driver = webdriver.Chrome()
#     wait=WebDriverWait(driver,3)
#     driver.get('https://www.baidu.com/')
#
#     # 1、find_element_by_id
#     # input_tag=driver.find_element_by_id('kw')
#     # print(input_tag.tag_name)
#     # print(input_tag.get_attribute('name'))
#     # print(input_tag.text)
#
#     # 2、find_element_by_link_text
#     # login=driver.find_element_by_link_text('登录')
#     # login.click()
#
#     # 3、find_element_by_partial_link_text
#     login=driver.find_element_by_partial_link_text('登')
#     login.click()
#
#     # 4、find_element_by_class_name
#     # login_for_user=driver.find_element_by_class_name('tang-pass-footerBarULogin')
#     # login_for_user=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tang-pass-footerBarULogin')))
#     login_for_user=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin')))
#     # print(login_for_user)
#     login_for_user.click()
#
#
#     #4、find_element_by_name
#     # input_user=driver.find_element_by_name('userName')
#     # input_pwd=driver.find_element_by_name('password')
#     # button=driver.find_element_by_id('TANGRAM__PSP_10__submit')
#     #
#     # input_user.send_keys('17094322519')
#     # input_pwd.send_keys('11111111111')
#     # button.click()
#
#
#
#     time.sleep(5)
# finally:
#     driver.close()
View Code

 以上这些方法只能查找出一个元素,如果想要查找所有匹配的元素,将这些查找方式中的element改成elements就可以了。如下:

    find_elements_by_name
    find_elements_by_xpath
    find_elements_by_link_text
    find_elements_by_partial_link_text
    find_elements_by_tag_name
    find_elements_by_class_name
    find_elements_by_css_selector

  find_element(s)_by_xpath:如果在没有一个合适定位的方式的时候就可以使用这个

  /:单斜杠,查找一个标签,可以从根标签一层一层的向内部查找。

  //:双斜杠,从当前页面查找出相对应的所有的标签。

  [数字]:确定查找到哪一个标签。

  [@属性=“属性值”]:属性的查找方式

  [contains(@属性,“属性值的部分内容”)]:属性模糊查找

  //*:所有的标签

  [标签/@属性=“属性值”]:查找含有满足该属性条件的子标签的标签

  ..:两个点,代表的是上一级

#Xpath选择器
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

# XPath selector demo against the Scrapy sample page.
# The driver is created BEFORE the try block: if Chrome fails to start there
# is nothing to close, and `finally` must not hit an unbound `driver` name
# (which would hide the real start-up error behind a NameError).
driver = webdriver.Chrome()
try:
    # wait = WebDriverWait(driver, 3)
    driver.implicitly_wait(3)

    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # 1. // versus /
    # tag=driver.find_element_by_xpath('/html/body/div/a')
    # print(tag.tag_name)
    # print(tag.text)
    # print(tag.get_attribute('href'))

    # tag=driver.find_elements_by_xpath('//a')
    # print(tag)

    # tag=driver.find_elements_by_xpath('//div//a')
    # tag=driver.find_elements_by_css_selector('div a')
    # print(len(tag))

    # 2. Select the n-th match
    # tag=driver.find_elements_by_xpath('//div//a[5]')
    # print(tag[0].text)

    # 3. Match by attribute
    # tag1=driver.find_element_by_xpath('//a[@href="image4.html"]')
    # tag2=driver.find_element_by_xpath('//a[4]')
    # tag3=driver.find_element_by_xpath('//a[contains(@href,"image4")]')
    #
    # print(tag1.text)
    # print(tag2.text)
    # print(tag3.text)

    # 4. Miscellaneous
    # driver.find_elements_by_xpath('//*[@class="xxxxx"]')
    # driver.find_elements_by_xpath('//div[@class="xxxxx"][@class="yyyyy"]')

    # print(driver.find_element_by_xpath('//a[img/@src="image2_thumb.jpg"]').text)
    # print(driver.find_element_by_xpath('//a/..').tag_name)

    # print([tag.tag_name for tag in driver.find_elements_by_xpath('//img//..')])

    # Print the position and the size of the first <img> on the page.
    img = driver.find_element_by_xpath('//img')
    print(img.location)
    print(img.size)

    # Keep the window open for a moment so the result can be inspected.
    time.sleep(5)
finally:
    driver.close()
View Code

五 交互操作

 location:坐标,横:x;竖:y

 size:大小,也就是内容的长宽

 implicitly_wait:隐式等待。

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

# Implicit-wait demo. The steps run inside try/finally so the browser window
# is closed even when a lookup fails.
browser = webdriver.Chrome()
try:
    # Implicit wait: whenever an element being looked up has not been loaded
    # yet, keep waiting for up to 10 seconds.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # No explicit wait step here: the element is looked up directly and an
    # error is raised if it cannot be found.
    contents = browser.find_element_by_id('content_left')
    print(contents)
finally:
    browser.close()
View Code

 显式等待:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素

# Explicit-wait demo. The steps run inside try/finally so the browser window
# is closed even when the wait times out or a lookup fails.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Explicit wait: wait (at most 10 seconds) for one specific element to be
    # loaded before going on.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    contents = browser.find_element(By.CSS_SELECTOR, '#content_left')
    print(contents)
finally:
    browser.close()
View Code

 execute_script:直接写js代码

 clear:清空输入框

 iframe:在一个页面中嵌套一个页面

 switch_to.frame:切换到子页面

 switch_to.parent_frame:切换到父页面

 Action chains(浏览器对象):拖动

 drag_and_drop(源,目标):从源拖动到目标

 perform():开始执行

 click_and_hold:点击不松手

 move_by_offset:偏移量

 release:松开鼠标

# from selenium import webdriver
# from selenium.webdriver import ActionChains
# from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys #键盘按键操作
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
# import time
#
#
# try:
#     driver=webdriver.Chrome()
#     driver.get('https://www.jd.com/')
#     driver.implicitly_wait(3)
#
#     input_tag=driver.find_element_by_id('key')
#     input_tag.send_keys('iphoneX')
#     input_tag.send_keys(Keys.ENTER)
#
#     time.sleep(3)
#     input_tag = driver.find_element_by_id('key')
#     input_tag.clear()
#     input_tag.send_keys('mac pro')
#     input_tag.send_keys(Keys.ENTER)
#
#
#     time.sleep(5)
# finally:
#     driver.close()



#ActionChains
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time


# ActionChains demo page (drag-and-drop sample inside an iframe).
# The driver is created BEFORE the try block: if Chrome fails to start there
# is nothing to close, and `finally` must not hit an unbound `driver` name
# (which would hide the real start-up error behind a NameError).
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.implicitly_wait(3)
    # Run JavaScript directly in the page.
    driver.execute_script('alert("hahha")')

    # driver.switch_to.frame('iframeResult')
    # driver.switch_to.parent_frame()



    # Approach 1: one chain that drags the source onto the target in one go
    # source = driver.find_element_by_id('draggable')
    # target = driver.find_element_by_id('droppable')
    # actions=ActionChains(driver)
    # actions.drag_and_drop(source,target)
    # actions.perform()

    # Approach 2: separate ActionChains, so the displacement can be controlled
    # source = driver.find_element_by_id('draggable')
    # target = driver.find_element_by_id('droppable')

    # distance=target.location['x'] - source.location['x']
    #
    # ActionChains(driver).click_and_hold(source).perform()
    # ActionChains(driver).move_by_offset(xoffset=distance,yoffset=0).perform()
    # ActionChains(driver).release().perform()
    #
    # res=0
    # while res < distance:
    #     ActionChains(driver).move_by_offset(xoffset=1,yoffset=0).perform()
    #     res+=1
    # ActionChains(driver).release().perform()
    #


    # Keep the window open for a moment so the result can be inspected.
    time.sleep(5)
finally:
    driver.close()
View Code

六 浏览器的前进和后退:

 back:后退

 forward:前进

#浏览器的前进后退
# import time
# from selenium import webdriver
#
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.get('https://www.taobao.com')
# browser.get('http://www.python.org/')
#
# time.sleep(3)
# browser.back()
# time.sleep(3)
# browser.forward()
# browser.close()
View Code

七 cookies

 get_cookies:获取cookies里面的信息。

#cookies
from selenium import webdriver

# Cookie demo: print the cookies, add two of our own, print them again.
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
# add_cookie() takes ONE cookie per call, described by the keys 'name' and
# 'value'; a dict of arbitrary key/value pairs is rejected.
browser.add_cookie({'name': 'k1', 'value': 'xxx'})
browser.add_cookie({'name': 'k2', 'value': 'yyy'})
print(browser.get_cookies())

# browser.delete_all_cookies()
View Code

八 异常处理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

# Exception-handling demo.
# The browser is created BEFORE the try block: if Chrome fails to start there
# is nothing to close, and `finally` must not hit an unbound `browser` name
# (which would hide the real start-up error behind a NameError).
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame name looks misspelled on purpose so that the
    # NoSuchFrameException handler below is exercised — confirm.
    browser.switch_to.frame('iframssseResult')

except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.close()
View Code

九 选项卡管理

#选项卡管理:切换选项卡,有js的方式window.open,有windows快捷键:ctrl+t等,最通用的就是js的方式
import time
from selenium import webdriver

# Tab management demo: open a second tab with JavaScript and switch between
# the two tabs through their window handles.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of all open tabs
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # quit() instead of close(): close() only closes the tab that currently
    # has focus, which would leave the second tab and the driver session
    # behind. quit() ends the whole session, also when a step above failed.
    browser.quit()
View Code

十 练习

#注意:网站的策略都是在不断变化的,精髓在于学习流程。下述代码生效于2017-11-7,不能保证永久有效
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Exercise: log in to 163 mail, compose a message and send it.
browser = webdriver.Chrome()

try:
    browser.get('http://mail.163.com/')

    wait = WebDriverWait(browser, 5)

    # The login form lives inside an iframe: wait for it, then step into it.
    login_frame = wait.until(
        EC.presence_of_element_located((By.ID, 'x-URS-iframe')))
    browser.switch_to.frame(login_frame)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-container')))

    # Locate all three login controls first, then fill them in and submit.
    user_box = browser.find_element_by_name('email')
    password_box = browser.find_element_by_name('password')
    login_button = browser.find_element_by_id('dologin')
    user_box.send_keys('18611453110')
    password_box.send_keys('xxxx')
    login_button.click()

    # If a captcha shows up, uncomment the short section below.
    # import time
    # time.sleep(10)
    # button = browser.find_element_by_id('dologin')
    # button.click()

    # The second <li> under #dvNavTop is the "compose" entry.
    wait.until(EC.presence_of_element_located((By.ID, 'dvNavTop')))
    browser.find_elements_by_css_selector('#dvNavTop li')[1].click()

    # Fill in recipient and subject once the compose form is present.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tH0')))
    recipient_box = browser.find_element_by_class_name('nui-editableAddr-ipt')
    subject_box = browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
    recipient_box.send_keys('378533872@qq.com')
    subject_box.send_keys('圣旨')
    print(subject_box.tag_name)

    # The message body is edited inside another iframe.
    editor_frame = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'APP-editor-iframe')))
    browser.switch_to.frame(editor_frame)
    browser.find_element(By.CSS_SELECTOR, 'body').send_keys('egon很帅,可以加工资了')

    # Back to the parent frame, where the send button is.
    browser.switch_to.parent_frame()
    browser.find_element_by_class_name('nui-toolbar-item').click()

    # Sleep for a long time so the browser stays open and the result of the
    # send can be checked by eye.
    import time
    time.sleep(10000)

except Exception as e:
    print(e)
finally:
    browser.close()
View Code
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys #键盘按键操作
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
import time


def _format_good(good):
    """Build the printable summary (name, link, price, comments) of one item."""
    detail_url = good.find_element_by_tag_name('a').get_attribute('href')

    # The product name can span several lines on the page; join them.
    p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
    price = good.find_element_by_css_selector('.p-price i').text
    p_commit = good.find_element_by_css_selector('.p-commit a').text

    return '''
            商品 : %s
            链接 : %s
            价钱 :%s
            评论 :%s
            ''' % (p_name, detail_url, price, p_commit)


def get_goods(driver):
    """Print every item of the current result page, then of all following pages.

    Each element with class ``gl-item`` is printed as a short summary. After a
    page is done the link whose text contains "下一页" (next page) is clicked
    and the next page is processed, one second later.

    Args:
        driver: WebDriver instance that already shows a search result page.

    Returns:
        None. The crawl is best-effort: it ends quietly as soon as any step
        fails, which is also how the last page (no next-page link) is detected.
    """
    # A loop instead of one recursive call per page, so a long result list
    # cannot run into the recursion limit.
    while True:
        try:
            for good in driver.find_elements_by_class_name('gl-item'):
                print(_format_good(good), end='\n\n')

            driver.find_element_by_partial_link_text('下一页').click()
        except Exception:
            return
        # Give the next page a moment to load.
        time.sleep(1)

def spider(url,keyword):
    """Search ``url`` for ``keyword`` in Chrome and print all result items.

    Args:
        url: Address of the page that holds the search box (element id "key").
        keyword: Text typed into the search box.

    The browser is started here and always shut down again, also when loading
    the page or the search itself fails.
    """
    driver = webdriver.Chrome()
    try:
        # Navigation happens inside the try block so that a failing page load
        # does not leave the browser running.
        driver.get(url)
        driver.implicitly_wait(3)  # use an implicit wait for element lookups

        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole session (all windows plus the driver process);
        # close() would only close the current window.
        driver.quit()


if __name__ == "__main__":
    # Script entry point: crawl the JD search results for one keyword.
    spider(url='https://www.jd.com/', keyword='iPhone8手机')
View Code
#coding=utf-8
from selenium import webdriver
# from selenium import fire
from selenium.webdriver.common.keys import Keys
import re
from selenium.webdriver.support.ui import Select

import time
from pyquery import PyQuery as pq
from xlwt import *
import calendar
from collections import OrderedDict


def _toggle_manufacturers(browser):
    """Click the arrow of the first select2 drop-down (manufacturer list)."""
    browser.find_element_by_class_name('select2-selection__arrow').click()


def _dismiss_message(browser):
    """Click the close button of the compatibility message."""
    browser.find_element_by_class_name('section-compatibility__message__close').click()


def _read_message(browser):
    """Read the compatibility message shown right now.

    Returns:
        Tuple ``(title, status, tag, info)``; every part that cannot be read
        is an empty string.
    """
    hdata = browser.page_source
    try:
        t = browser.find_element_by_class_name('section-compatibility__message__title').text
    except Exception:
        t = ''
    print(t)
    b1 = r'<div class="section-compatibility__message__tip"> <strong>(.*?): </strong><span>(.*?)</span> </div>'
    bk = re.findall(b1, hdata, re.S)
    print(bk)
    # Two groups in the pattern: every match is a (status, tag) pair.
    status, tag = bk[0] if bk else ('', '')
    try:
        info = browser.find_element_by_class_name(
            'section-compatibility__message__body').find_element_by_tag_name('p').text
    except Exception:
        info = ''
    print(info)
    return t, status, tag, info


def _scrape_compatibility(browser):
    """Walk through every manufacturer/model pair and collect its message."""
    browser.get("https://finsix.com/#section-compatibility")
    data = browser.page_source  # HTML source of the page

    lis = [['product','version','title','status','tags','info']]
    re_rule_1 = r'<select class="section-compatibility__manufacturers select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
    data_list = re.findall(re_rule_1,data, re.S)
    data = data_list[0]
    re_rule = r'<option value=".*?">(.*?)</option>'
    datalist = re.findall(re_rule, data, re.S)
    print(datalist)
    # Index 0 is skipped; NOTE(review): presumably the placeholder option — confirm.
    for i in range(1, len(datalist)):
        # Open the manufacturer list; if the click fails, close the message
        # first and try again.
        try:
            _toggle_manufacturers(browser)
        except Exception:
            _dismiss_message(browser)
            _toggle_manufacturers(browser)
        s1 = Select(browser.find_element_by_class_name('section-compatibility__manufacturers'))
        s2 = s1.options[i]
        # s1.select_by_index(i)
        s3 = s2.text
        print(s3)
        s2.click()
        try:
            _toggle_manufacturers(browser)
        except Exception:
            _dismiss_message(browser)
            _toggle_manufacturers(browser)
            s2.click()
            _toggle_manufacturers(browser)

        time.sleep(2)

        # The model <select> is rendered either enabled or with disabled="";
        # pick the pattern that matches the current page.
        rule1 = r'<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
        data = browser.page_source
        if '<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">' not in data:
            rule1 =  '<select class="section-compatibility__models select2-hidden-accessible" disabled="" tabindex="-1" aria-hidden="true">(.*?)</select>'
        bullish = re.findall(rule1,data , re.S)
        if not bullish:
            # No model list found: record the manufacturer alone.
            print(len(bullish), bullish)
            lis.append([s3,'','','','',''])
            continue
        bullish = bullish[0]
        re_rule = r'<option value=".*?">(.*?)</option>'
        bullish = re.findall(re_rule, bullish, re.S)
        print(bullish)
        for j in range(0,len(bullish)):
            # btn[1] is the second select2 box (model list).
            btn = browser.find_elements_by_class_name('select2-selection')
            try:
                btn[1].click()
            except Exception:
                try:
                    _dismiss_message(browser)
                    btn[1].click()
                except Exception:
                    # Second attempt of the same close-and-click sequence.
                    _dismiss_message(browser)
                    btn[1].click()
            s4=Select(browser.find_element_by_class_name('section-compatibility__models'))
            s5 = s4.options[j]
            s6=s5.text
            print(s6)
            s5.click()
            try:
                btn[1].click()
            except Exception:
                try:
                    _dismiss_message(browser)
                    btn[1].click()
                except Exception:
                    # The box cannot be clicked: record the message that is
                    # shown and go on with the next model.
                    t, status, tag, info = _read_message(browser)
                    lis.append([s3, s6, t, status, tag, info])
                    continue
            t, status, tag, info = _read_message(browser)
            lis.append([s3,s6,t,status, tag,info])

            # Close the message before the next model; fall back to
            # re-selecting the model when the close button cannot be clicked.
            try:
                _dismiss_message(browser)
            except Exception:
                try:
                    _dismiss_message(browser)
                except Exception:
                    try:
                        btn[1].click()
                        s5.click()
                        btn[1].click()
                        _dismiss_message(browser)
                    except Exception:
                        continue
    return lis


def openurl(num):
    """Scrape the FINsix compatibility widget and return the collected table.

    Starts Chrome, selects every manufacturer and every model of it in turn,
    reads the compatibility message shown for the pair, and always shuts the
    browser down again (also on failure).

    Args:
        num: Not used; kept so that existing callers keep working.

    Returns:
        List of rows. The first row is the header
        ``['product','version','title','status','tags','info']``; each later
        row describes one manufacturer/model pair, or only the manufacturer
        when no model list could be read.

    Raises:
        IndexError: If the manufacturer ``<select>`` is not found in the page.
    """
    # NOTE(review): the path separators were missing from this literal (they
    # appear to have been lost when the snippet was copied); restored assuming
    # the usual chromedriver_win32 folder layout — adjust to the local install.
    browser = webdriver.Chrome(executable_path=r"H:\chromedriver_win32\chromedriver.exe")
    try:
        return _scrape_compatibility(browser)
    finally:
        browser.quit()

def zhizuo(lis):
    """Store the table ``lis`` in sheet "data" of a new workbook and save it.

    Args:
        lis: Sequence of rows, each row a sequence of cell values.

    Returns:
        The string 'success'.
    """
    # NOTE(review): Workbook comes from xlwt, which writes spreadsheet files,
    # yet the target name ends in .csv — confirm the intended format.
    workbook = Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('data')
    for row_idx, row in enumerate(lis):
        for col_idx, cell in enumerate(row):
            sheet.write(row_idx, col_idx, cell)
    workbook.save( 'product_info.csv')
    return 'success'


# Script part: run the scrape, show the rows and write them to disk.
url = 'https://www.xuangubao.cn/'  # not used by any of the statements below
lis = openurl(3)  # openurl never reads its argument
print(lis)
zhizuo(lis)
# f=open("F:\text.txt","a")
# for key,values in  dict.items():
# f.write((key+"	"))
# print(key,values)
# f.close()
爬取finsix

破解滑动验证:

 http://www.cnblogs.com/fangjie0410/p/8269219.html

 

原文地址:https://www.cnblogs.com/fangjie0410/p/8259558.html