python requests包爬网页数据demo

通过对python语法的简短熟悉,结合百度搜索到的部分博主文章,拼凑了个抓取页面内容的demo

学习记录下!

from requests_html import HTMLSession
import requests
import pymysql.cursors


def getJsonText(url):
    """Fetch *url* and return its decoded JSON payload.

    Args:
        url: Target API endpoint (the Toutiao feed API in this script).

    Returns:
        The parsed JSON object on success, or the string '请求失败!'
        when the request or the JSON decoding fails (errors are
        reported in-band; callers must check the return type).
    """
    headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection':'keep-alive',
            'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'Cookie':'csrftoken=6ac95edd2e4866f1c5d2873d6295c5ce; tt_webid=6564523141883692558; uuid="w:1f2180a58a6048ab96b7dac4c8dbab81"; UM_distinctid=163dd0f3890548-04ef14e2aeee38-77256752-1fa400-163dd0f3891694; CNZZDATA1259612802=1435136700-1528418761-null%7C1531360188; tt_webid=6564523141883692558; WEATHER_CITY=%E5%8C%97%E4%BA%AC; _ga=GA1.2.1854637036.1528798614; login_flag=f5b3b0ab7f662248c014dc175aaab576; sessionid=1a2269ab6f9602fa1359cf507705e8b3; uid_tt=e5c3d73d536ad7d832d37328ce7ab08e; sid_tt=1a2269ab6f9602fa1359cf507705e8b3; sid_guard="1a2269ab6f9602fa1359cf507705e8b3|1530692681|15552000|Mon54 31-Dec-2018 08:24:41 GMT"; __tasessionId=zj77s5mu41531359387030; _gid=GA1.2.973835841.1531360238',                
        }

    try:
        # Only the lines that can actually raise live inside the try.
        r = requests.get(url, headers=headers)
        return r.json()
    except (requests.RequestException, ValueError):
        # Narrowed from a bare ``except:``: network errors raise
        # requests.RequestException; a non-JSON body raises ValueError
        # (json.JSONDecodeError subclasses it).
        return '请求失败!'
    
def getHtml(url):
    """Fetch *url* with requests_html and return the parsed page.

    Args:
        url: Page URL to fetch; None is tolerated.

    Returns:
        None when *url* is None, the ``r.html`` object on success, or
        the string '抓取失败' when the request raises (errors are
        reported in-band; callers must check the return type).
    """
    # Guard first — the original built the headers dict before noticing
    # a None url.
    if url is None:
        return
    headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
        }
    try:
        session = HTMLSession()
        r = session.get(url, headers=headers)
        return r.html
    except Exception:
        # Broad catch kept deliberately (best-effort scrape), but the
        # unused ``as e`` binding is dropped.
        return '抓取失败'

def jsonParser(url, html, path):
    """Append every post in ``html['data']`` to the text file at *path*.

    Args:
        url: Source URL (unused; kept for signature parity with HtmlParser).
        html: Decoded JSON object from getJsonText(); must contain a
            'data' list of post dicts with the four keys copied below.
        path: Destination text file; one ``str(dict)`` record per line.
    """
    # Open the file once for the whole batch instead of reopening it
    # for every post, as the original loop did.
    with open(path, 'a', encoding='utf-8') as f:
        for post in html['data']:
            data = {
                    'source':post['source'],
                    'title':post['title'],
                    'source_url': post['source_url'],
                    'image_url':post['image_url'],                
                }
            f.write(str(data))
            # Fixed: the original newline literal was broken across two
            # physical lines, which is a syntax error.
            f.write('\n')

#HTML页面
def HtmlParser(url,html,path):
    data = {}
    postList = html.find('div.entlist')
    for rs in postList:
        
        data['title'] = rs.find('a', first=True).text
        data['desc'] = rs.find('div.entinfonews > p',first=True).text
        data['time'] = rs.find('div.time',first=True).text
        detial_url = 'http://cbngold.com/' + rs.find('h2',first=True).find('a',first=True).attrs['href']
        
        data['content'] = HtmlDetailedParser(detial_url)
        #数据入库处理
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='111111',
                                     db='wikiurl',
                                     charset='utf8mb4')
        
        try:
            
            with connection.cursor() as cursor:
                sql = "insert into `urls`(`urlname`,`urlhref`,`content`) values(%s,%s,%s)"
                cursor.execute(sql, (data['title'], detial_url, data['content']))
                connection.commit()
        finally:
            connection.close()
    
#详细内容 url
def HtmlDetailedParser(url):
    """Fetch a post's detail page and return its main text content.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        The text of the ``div#contentPanel`` element, or '' when the
        page could not be fetched.
    """
    html = getHtml(url)
    # getHtml() reports failure in-band (None for a None url, an error
    # string on exception); the original would raise here in that case.
    if html is None or isinstance(html, str):
        return ''
    return html.find('div#contentPanel', first=True).text

# Entry point for scraping an HTML listing page
def HtmlMain(url):
    """Scrape the listing page at *url* and persist the posts."""
    output_file = '/home/wwwroot/python_pro/text.txt'
    page = getHtml(url)
    HtmlParser(url, page, output_file)

# Entry point for scraping a JSON feed endpoint
def JsonMain(url):
    """Fetch the JSON feed at *url* and append its posts to a text file."""
    output_file = '/home/wwwroot/python_pro/toutiao.txt'
    payload = getJsonText(url)
    jsonParser(url, payload, output_file)
    
# Entry point — earlier sample invocations kept for reference
#HtmlMain('https://www.toutiao.com/ch/news_travel/')
#HtmlMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=1531348824&max_behot_time_tmp=1531348824&tadrequire=true&as=A1358BD4F6AB71A&cp=5B46EB87D10ABE1&_signature=VhWM3gAADVR0cakAFkjT4lYVjM')

#JsonMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A115CB74368C533&cp=5B46AC55C3C37E1&_signature=ReOiVAAAHqdnh4eKksi3R0Xjok')


if __name__ == '__main__':
    # Guarded so importing this module does not kick off a network
    # scrape and a database write as a side effect.
    HtmlMain('http://cbngold.com/newslist.aspx?id=25&p=0')

感谢网络提供的方便,特别是度娘 ~~

原文地址:https://www.cnblogs.com/murenhui/p/9300848.html