A crawler that scrapes a web novel page by page

Requirements:

Scrape novels from certain websites, one page at a time.

Every page has a "next" button; grab the href from that next button and the crawler can follow it page after page.

Pages are parsed with BeautifulSoup.
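For example, pulling the next-page href out of a parsed page can look like the minimal sketch below (the li.next / a[href] markup, the host prefix and the sample href are assumptions for illustration; the full script that follows does the same thing):

from bs4 import BeautifulSoup

# Hypothetical pager markup, only to illustrate the structure this crawler expects.
html = '<ul class="pager"><li class="next"><a href="/htm/next-page.html">next</a></li></ul>'

soup = BeautifulSoup(html, "html.parser")
link = soup.find("li", class_="next")      # the "next" button
if link is not None and link.a is not None:
    # hrefs on the site are relative, so prepend the host name
    next_url = 'http://www.vc.com' + link.a['href']
    print(next_url)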

from bs4 import BeautifulSoup
import urllib2
import socket
import time

import sys


#http://www.vc.com/htm/2016/12/24/t02/367246.html
host_name = 'http://www.vc.com'

def html_process(html_file,url):
	'''
	use BeautifulSoup to get the title, the content and the next-page link from html_file
	'''
	global host_name

	#soup = BeautifulSoup(open(html_file),"html_parser")
	soup = BeautifulSoup(html_file,"html.parser")

	#####################################################
	text = '/dev/shm/novel.txt'
	file = open(text,'a')
	file.write('######################################')
	file.write('\n' + url + '\n')

	#####################################################
	#get title
	title_ret = soup.title.string.split('-')[0].strip()
	file.write('\n@# ' + title_ret + '\n')
	#####################################################
	#get context
	file.write(soup.find("div", id='view2').get_text() + '\n')
	file.close()

	#####################################################
	#get next href
	link = soup.find("li", class_="next")
	if link is None or link.a is None:
		print 'next link is None'
		exit(0)
	next_href = host_name + link.a['href']

	return next_href


def html_get(url):
	user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
	headers = {'User-Agent':user_agent}
	req = urllib2.Request(url,headers = headers)
	try:
		page = urllib2.urlopen(req,timeout=20).read()
		return page
	except urllib2.URLError, e:
		print "error while loading " + url + ": " + str(e)
		exit(1)
	except socket.timeout:
		#do retry
		return html_get(url)

def test(url):
	while url is not None:
		html_file = html_get(url)
		if html_file is None:
			print 'ERROR OF READING ',url
			exit(1)
		url = html_process(html_file,url)
		time.sleep(5)

if __name__ == '__main__':
	reload(sys)
	sys.setdefaultencoding( "utf-8" )
	#start up url 
	test("http://www.vc.com/htm/2013/11/2/t02/316551.html")
Original post: https://www.cnblogs.com/shaivas/p/6218227.html