Scraping search result titles from dogedoge

# coding:utf-8
import datetime
import hashlib

import pymysql
import requests
from lxml import etree

# Placeholder connection settings -- replace with your own MySQL configuration.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''

def search_data(kw, n):
    """Fetch page n of dogedoge results for keyword kw, parse them and save them."""
    ll = []
    if n > 1:
        res = requests.get('https://www.dogedoge.com/results?q={}&p={}'.format(kw, n))
    else:
        res = requests.get('https://www.dogedoge.com/results?q={}'.format(kw))
    con = etree.HTML(res.text)
    result_divs = con.xpath('//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')
    for u in result_divs:
        title = ''.join(u.xpath('./div/h2/a//text()'))
        url = ''.join(u.xpath('./div/div/div/a/span//text()'))
        # For absolute URLs the domain is the part after "scheme://";
        # otherwise the displayed URL already starts with the domain.
        if url.find('http') != -1:
            domain = url.split('/')[2]
        else:
            domain = url.split('/')[0]
        md5 = hashlib.md5(url.encode('utf-8')).hexdigest()
        item = {
            'keywd': kw,
            'domain': domain,
            'title': title,
            'md5': md5,
            'url': url,
            'searcher': 'dogedoge',
        }
        ll.append(item)
    save(ll)
    # xpath() returns an empty list rather than raising when the
    # "next page" container is absent, so a truthiness check is enough.
    next_page = con.xpath('//div[@id="rld-2"]')
    if not next_page:
        print('No more pages')
        return ''
    return next_page


def main(kw):
    n = 1
    while True:
        next_page = search_data(kw, n)
        if not next_page:
            break
        n += 1


def save(ll):
    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    cursor = db.cursor()
    for item in ll:
        try:
            # Insert one result row into the database.
            cursor.execute(
                "insert into weixintb(md5, keyword, title, url, `date`, `domain`, browser) "
                "values (%s, %s, %s, %s, %s, %s, %s)",
                (item['md5'],
                 item['keywd'],
                 item['title'],
                 item['url'],
                 datetime.datetime.now(),
                 item['domain'],
                 item['searcher']))
            # Commit this insert.
            db.commit()
        except Exception as error:
            # Print the error and roll back this row.
            print(error)
            db.rollback()
    cursor.close()
    db.close()

main('keyword to crawl')  # replace with the keyword you want to search for
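
The insert statement above assumes a weixintb table already exists with md5, keyword, title, url, date, domain and browser columns. Below is a minimal sketch of a compatible schema; the column types and the unique index on md5 (so that duplicate URLs fail the insert and get rolled back) are assumptions, not something spelled out in the original script. It reuses the placeholder MYSQL_* settings defined at the top.

def create_table():
    # Hypothetical schema for weixintb; types and the unique md5 index are assumptions.
    db = pymysql.connect(host=MYSQL_HOST, db=MYSQL_DBNAME, user=MYSQL_USER,
                         passwd=MYSQL_PASSWD, charset='utf8')
    cursor = db.cursor()
    cursor.execute("""
        create table if not exists weixintb (
            md5      char(32)      not null,
            keyword  varchar(255),
            title    varchar(512),
            url      varchar(1024),
            `date`   datetime,
            `domain` varchar(255),
            browser  varchar(32),
            unique key uk_md5 (md5)
        ) default charset = utf8
    """)
    db.commit()
    cursor.close()
    db.close()

Running create_table() once before main() is enough.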
Original post: https://www.cnblogs.com/qxh-beijing2016/p/12770181.html