# 利用协程框架、无界面浏览器爬取上海高院开庭数据

# -*- coding: utf-8 -*-
"""
@author: Dell Created on Thu Jan  2 11:16:08 2020
"""
import gevent 
from gevent import monkey

# Apply gevent's monkey patches before the remaining imports below, so that
# modules imported afterwards see the patched (cooperative) standard library.
monkey.patch_all()

from lxml import etree
from selenium import webdriver
# NOTE(review): PhantomJS is not used by any live code in this file, and newer
# Selenium releases dropped PhantomJS support, so this import presumably fails
# there -- confirm against the installed Selenium version and consider removing.
from selenium.webdriver import PhantomJS 
from selenium.webdriver.chrome.options import Options


def _cell_text(row, path):
    """Return the stripped first text node matched by ``path`` under ``row``.

    An empty string is returned when the XPath matches nothing, so a blank
    table cell does not raise ``IndexError`` and abort the whole page range.
    """
    matches = row.xpath(path)
    return matches[0].strip() if matches else ""


# XPath (relative to one result row) of every exported column, in output order.
_COLUMN_PATHS = (
    "./td[1]/font/text()",  # court
    "./td[2]/font/text()",  # courtroom
    "./td[3]/text()",       # hearing date
    "./td[4]/text()",       # case number
    "./td[5]/text()",       # cause of action
    "./td[6]/div/text()",   # handling department
    "./td[7]/div/text()",   # presiding judge / judge in charge
    "./td[8]/text()",       # plaintiff
    "./td[9]/text()",       # defendant
)


def download(url, start_idx, end_idx, file):
    """Scrape result pages ``start_idx`` .. ``end_idx - 1`` into ``file``.

    Opens ``url`` in a headless Chrome session, switches pages by calling the
    page's ``goPage`` JavaScript function, waits for the page to load, and
    parses the rows of the ``report`` table from the rendered HTML.

    Args:
        url: Address of the search page to open.
        start_idx: First page number to fetch (inclusive).
        end_idx: Page number to stop before (exclusive).
        file: Writable binary file object. Each row is written as the
            ``str()`` of a 9-tuple of column texts, UTF-8 encoded, one row
            per line.

    Any ``Exception`` raised while scraping is reported on stdout and ends
    this worker's run; the browser is always closed afterwards.
    """
    # Headless Chrome is used because newer selenium versions deprecated
    # PhantomJS (which also needed a path to phantomjs.exe).
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        for i in range(start_idx, end_idx):
            # Ask the page's own pager to switch to page i.
            driver.execute_script("javascript:goPage('" + str(i) + "')")
            gevent.sleep(5)  # fixed wait for the page to finish loading
            print("开始解析第", i, "页")

            # driver.page_source is a str holding the rendered HTML.
            html = etree.HTML(driver.page_source)
            # position()>1 skips the first row of the table.
            trs = html.xpath("//table[@id='report']//tbody/tr[position()>1]")
            for tr in trs:
                line = tuple(_cell_text(tr, path) for path in _COLUMN_PATHS)
                print(*line)
                file.write((str(line) + "\n").encode("utf-8", errors="ignore"))
            print("共有数据:", len(trs), "条")
    except Exception as exc:
        # Worker boundary: report the cause instead of hiding it, while
        # letting KeyboardInterrupt / GreenletExit propagate.
        print("error", repr(exc))
    finally:
        driver.quit()  # always shut the browser down
    

def main():
    """Scrape the hearing announcements into ``court.txt`` with gevent workers.

    Spawns one ``download`` greenlet per page range, all writing to the same
    binary output file, and waits for every greenlet to finish. The file is
    closed even if spawning or joining raises.
    """
    url = "http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp"
    # Consecutive boundaries define half-open page ranges [start, end), one
    # per greenlet. Keeping them in a single list guarantees the ranges are
    # contiguous, so no pages (previously 300-399) are skipped.
    page_bounds = [1, 100, 200, 300, 400, 500, 600]
    with open("court.txt", "wb") as file:
        workers = [
            gevent.spawn(download, url, start, end, file)
            for start, end in zip(page_bounds, page_bounds[1:])
        ]
        gevent.joinall(workers)
     

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()









# 原文地址:https://www.cnblogs.com/zxfei/p/12132362.html