人民法院重大事件抓取

时间:2017-8-3 23:30

Url:http://www.court.gov.cn

运行环境:Python 3.4 + MySQL + Windows 7

import urllib.request
import re
import pymysql
from time import sleep
# --- Database setup -------------------------------------------------------
# Connect to the local MySQL server without selecting a schema, so that the
# schema itself can be created on a first run. Connection errors (server
# down, bad credentials) are deliberately left to propagate instead of being
# hidden behind a blanket ``except``.
con = pymysql.connect(host='127.0.0.1', user='root', passwd='root')

# ``IF NOT EXISTS`` makes both statements safe to repeat on later runs, so
# no exception handling is needed to detect an existing schema or table.
con.query('CREATE DATABASE IF NOT EXISTS PeopleCourt')

# Switch the already-open connection to the schema rather than opening (and
# leaking) a second connection.
con.select_db('PeopleCourt')

# One row per scraped article: its title, absolute URL and the date string
# shown on the listing page.
con.query('CREATE TABLE IF NOT EXISTS lawcase('
          'title char(100),url char(100),time char(50))')

# --- Crawl ----------------------------------------------------------------
# Walk every page of the listing, extract (title, link, date) for each
# article and store the rows in the ``lawcase`` table through ``con``.
site_root = 'http://www.court.gov.cn'
list_url = site_root + '/fabu-gengduo-15.html?page='
header = {'User-Agent': 'Mozilla/5.0'}

# The "last page" link carries the total page count. "." and "?" are regex
# metacharacters and must be escaped to match the literal URL text.
last_page_re = re.compile(
    r'<li class="last"><a href="/fabu-gengduo-15\.html\?page=(\d+)">')
# re.S lets ".*?" span line breaks between the link and its date element.
item_re = re.compile(
    '<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>'
    '.*?<i class="date">(.*?)</i>', re.S)
# Placeholders let the driver quote the scraped values, so a quote character
# in a title cannot break (or inject into) the statement.
insert_sql = 'INSERT INTO lawcase(title,url,time) VALUES (%s,%s,%s)'


def fetch_html(url):
    """Download ``url`` using ``header`` and return the decoded response body."""
    req = urllib.request.Request(url, headers=header)
    with urllib.request.urlopen(req) as res:
        return res.read().decode()


first_html = fetch_html(list_url + '1')
found = last_page_re.findall(first_html)
if not found:
    # Fail with a clear message instead of an IndexError on found[0].
    raise RuntimeError('Could not find the last-page link on ' + list_url + '1')
page_count = int(found[0])
print('page:' + found[0])

cur = con.cursor()
try:
    for page in range(1, page_count + 1):
        print('Grab page:' + str(page))
        # Page 1 was already downloaded to read the page count; reuse it.
        if page == 1:
            html = first_html
        else:
            html = fetch_html(list_url + str(page))
        rows = [
            # Titles may contain line breaks; strip them before storing.
            (title.replace('\n', ''), site_root + href, date)
            for title, href, date in item_re.findall(html)
        ]
        if rows:
            cur.executemany(insert_sql, rows)
            # Commit explicitly after each page so that rows already scraped
            # are kept if a later page fails.
            con.commit()
        # Pause between requests to avoid hammering the site.
        sleep(2)
finally:
    cur.close()
    con.close()
print('Ok')

数据库截图:

天下飞羽,花落凡尘
原文地址:https://www.cnblogs.com/AngelYuFan/p/7282821.html