Web Scraping: Crawling Qiushibaike

A short Python 2 crawler that pages through Qiushibaike's text posts and prints each post together with its comments, using urllib2 for the HTTP requests and BeautifulSoup for the parsing.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup

def getContentOrComment(argurl):
    # send a browser User-Agent so the site's anti-scraping check lets the request through
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url=argurl, headers=headers)
    try:
        response = urllib2.urlopen(req)  # open the URL
        content = response.read()        # read the page source
    except Exception:
        content = None  # signal failure to the caller
    return content

articleUrl = 'http://www.qiushibaike.com/textnew/page/%d'  # article list URL template
commentUrl = 'http://www.qiushibaike.com/article/%s'       # comment page URL template

page = 0

while True:
    raw = raw_input('Press Enter to view the next page, or type exit to quit: ')
    if raw == 'exit':
        break
    page += 1
    Url = articleUrl % page
    print Url

    articlePage = getContentOrComment(Url)
    if articlePage is None:  # skip this page if the request failed
        continue
    articleFloor = 1

    soup = BeautifulSoup(articlePage, 'html.parser')  # parse the page
    for string in soup.find_all(attrs='article block untagged mb15'):
        commentId = str(string.get('id')).strip().split('_')[2]
        print ' '
        print articleFloor, '.', string.find(attrs='content').get_text().strip()
        articleFloor += 1

        # fetch the comments for this post
        commentPage = getContentOrComment(commentUrl % commentId)
        if commentPage is None:
            continue
        soupComment = BeautifulSoup(commentPage, 'html.parser')
        commentFloor = 1
        for comment in soupComment.find_all(attrs='body'):
            print ' ', commentFloor, 'reply:', comment.get_text().strip()
            commentFloor += 1
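The listing above is Python 2 code (urllib2, raw_input, and print statements). As a minimal sketch of the same fetch helper on Python 3, assuming the site still answers plain GET requests with this User-Agent, the standard-library port could look like the following; get_content_or_comment is a hypothetical rename for the ported helper:

import urllib.request

def get_content_or_comment(arg_url):
    # same browser User-Agent trick as the Python 2 version above
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/57.0.2987.133 Safari/537.36')
    req = urllib.request.Request(url=arg_url, headers={'User-Agent': user_agent})
    try:
        with urllib.request.urlopen(req) as response:
            # decode bytes to text for BeautifulSoup; assumes the page is UTF-8
            return response.read().decode('utf-8')
    except Exception:
        return None  # signal failure to the caller, as in the original

One detail worth noting in the parsing code: passing a bare string as attrs to find_all is BeautifulSoup shorthand for matching on the class attribute, so soup.find_all(attrs='article block untagged mb15') behaves like soup.find_all(class_='article block untagged mb15').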

Original post: https://www.cnblogs.com/shanhua-fu/p/6934509.html