Python 爬虫:爬取系列博客文章列表

用 Python 写个爬虫真是太简单了




import sys
import urllib.request

from pyquery import PyQuery as PQ

def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    The response is used as a context manager so that it is closed once the
    body has been read, including when reading or decoding raises.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url) as page:
        return page.read().decode('UTF-8')

def _parseSeriesIndex(title, keyword):
    """Return the integer between *keyword* and the next ':' in *title*.

    Returns ``None`` when the keyword or the colon is missing, or when the
    text between them is not an integer.
    """
    _, found, tail = title.partition(keyword)
    number, colon, _ = tail.partition(':')
    if not (found and colon):
        return None
    try:
        return int(number)
    except ValueError:
        return None


def getArtical(html, results):
    """Collect Markdown links for the series articles listed in *html*.

    Every anchor matched by ``.atc_title a`` whose text contains the series
    keyword becomes a ``[title](href)`` Markdown link and is appended to
    *results* as an ``(index, link)`` tuple, where ``index`` is the integer
    found between the keyword and the following ``':'`` in the title.

    Entries that cannot be turned into such a tuple (no ``href``, or no
    parseable index in the title) are reported on stderr and skipped, so a
    single malformed entry does not abort the whole crawl.

    Args:
        html: Markup of one article-list page.
        results: List extended in place with ``(int, str)`` tuples.
    """
    keyword = '教你炒股票'
    doc = PQ(html)
    # NOTE(review): the original kept '.searchAtcList .searchAtc_top a' as a
    # commented-out alternative selector -- presumably for the search-result
    # page; confirm before reusing it.
    for anchor in doc('.atc_title a').items():
        title = anchor.text()
        if keyword not in title:
            continue

        href = anchor.attr('href')
        if not href:
            # Nothing to link to, and concatenating None below would raise.
            print('skipped (no href): ' + title, file=sys.stderr)
            continue

        # A title truncated in the list carries an ellipsis; fetch the
        # article page to recover the complete title.
        if '…' in title:
            title = getArticalDetail(href)

        # Parse relative to where the keyword actually is instead of assuming
        # it occupies the first five characters of the title.
        index = _parseSeriesIndex(title, keyword)
        if index is None:
            print('skipped (no series index): ' + href, file=sys.stderr)
            continue

        results.append((index, '[' + title + '](' + href + ')'))

def getArticalDetail(url):
    """Return the article title found on the page at *url*.

    The page is downloaded with ``getHtml`` and the text of the elements
    matching ``.articalTitle h2`` is returned.
    """
    return PQ(getHtml(url))('.articalTitle h2').text()

# Common prefix of the article-list pages; page N lives at <prefix>N.html, e.g.
#   http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html
#   http://blog.sina.com.cn/s/articlelist_1215172700_0_15.html
# Unused alternative (keyword-search endpoint):
#   http://control.blog.sina.com.cn/search/search.php?uid=1215172700&keyword=%E8%82%A1%E7%A5%A8&page=
blog3 = 'http://blog.sina.com.cn/s/articlelist_1215172700_0_'

# Filled by getArtical with (index, markdown_link) tuples across all pages.
results = []

# The article list has 23 pages in total.
for page in range(1, 24):
    url = '{}{}.html'.format(blog3, page)
    print(url)
    getArtical(getHtml(url), results)

# Sort the entries, then print only the link part of each one.
results.sort()
for _, link in results:
    print(link)



原文地址:https://www.cnblogs.com/wancy86/p/6377971.html