Crawling 养生之道网 (ys137.com) with Scrapy

(1)
Open the home page of 养生之道网 (ys137.com) and look at how it is laid out.
The top navigation splits into five major categories: 食疗养生 (dietary therapy), 养生人群 (health by group), 运动养生 (exercise), 医学健康 (medical health), and 糖尿病 (diabetes).
Following each of these category links, we crawl the sub-categories underneath it, store the category and sub-category names in MongoDB, and push each sub-category's id, parent id, and link into Redis. This is done in Health.py.

#-----------Health.py------------
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from urllib.request import urlopen
from bs4 import BeautifulSoup
from lxml import etree
from bson.objectid import ObjectId
import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth          # database: publicHealth
collection = db.healthClass       # collection for category / sub-category records

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class healthClassSpider(scrapy.Spider):
    name = "health"
    allowed_domains = ["ys137.com"]   # domains the spider is allowed to visit
    start_urls = [
        "https://www.ys137.com/lvyou/",
    ]

    # parse() is the callback that runs for every downloaded page
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="container-fluid top-nav"]/div[@class="container main clearfix"]/table[@class="pull-left"]/tr/th/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            print(className[0])
            print(classUrl[0])
            print('----------------------------------')
            # top-level category: no parent, so pid is None
            classid = self.insertMongo(className[0], None)
            # bind the new ObjectId into the callback so the sub-category page knows its parent
            request = Request(classUrl[0],
                              callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
            yield request
        print("======================")

    def parse_subClass(self, response, pid):
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="channel-sons pull-left"]/a')
        for secItem in hxsObj:
            className2 = secItem.select('text()').extract()
            classUrl2 = secItem.select('@href').extract()
            print(className2)
            print('----------------------------')
            print(classUrl2)
            # sub-category: pid points at the parent category document
            classid = self.insertMongo(className2[0], ObjectId(pid))
            self.pushRedis(classid, pid, classUrl2[0])

    def insertMongo(self, classname, pid):
        classid = collection.insert({'classname': classname, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, url):
        # store "classid,pid,url" so Health2.py can pick the sub-category pages up later
        healthurl = '%s,%s,%s' % (classid, pid, url)
        r.lpush('healthurl', healthurl)
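The spider is run in the usual Scrapy way (scrapy crawl health from inside the project). To see what it stored, a quick read-back of the Redis list and a few MongoDB documents is enough; the snippet below is only a minimal sketch, assuming the local Redis/MongoDB instances, the 'healthurl' key, and the publicHealth.healthClass collection used above.

#-----------check of stored data (sketch, not part of the original post)------------
import pymongo
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
db = pymongo.MongoClient(host="127.0.0.1").publicHealth

# each Redis entry is the "classid,pid,url" string built by pushRedis()
for raw in r.lrange('healthurl', 0, 4):
    classid, pid, url = str(raw, encoding="utf-8").split(',')
    print(classid, pid, url)

# each MongoDB document holds a category or sub-category name and its parent id (pid)
for doc in db.healthClass.find().limit(5):
    print(doc['classname'], doc['pid'])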

(2)
Pop the sub-category links from Redis one by one, open each sub-category page, crawl the article titles and links on that page, and store those titles and links into Redis and MongoDB again. This is done in Health2.py.

#-----------Health2.py------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from time import sleep
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth          # database: publicHealth
collection = db.healthTitle       # collection for article-title records

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

ii = 0


class healthClassSpider(scrapy.Spider):
    name = "health2"
    allowed_domains = ["ys137.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # read the sub-category URLs that Health.py pushed into the Redis list 'healthurl'
        start_urls = []
        urlList = r.lrange('healthurl', 0, 1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')          # "classid,pid,url"
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "urls": url, "num": 0}
            # ii += 1
            # if ii > 3:
            #     break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        pid = classInfo['pid']
        headurl = classInfo['urls']
        num = classInfo['num']
        if num > 3:                 # only follow the first few pages of each sub-category
            return None
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="arc-infos clearfix"]/h2/a')
        for secItem in hxsObj:
            className = secItem.select('text()').extract()
            classUrl = secItem.select('@href').extract()
            print(className[0])
            print(classUrl)
            # classid = self.insertMongo(className[0], ObjectId(objectid))
            # self.pushRedis(classid, objectid, classUrl[0])
        # -------- follow the next page directly instead of going through another callback --------
        nextPages = hxs.select('//ul[@class="pagination"]/li/a/@href')
        print(len(nextPages))
        nextPage = nextPages.extract()[len(nextPages) - 1]
        nextPage = headurl + nextPage
        classInfo['num'] += 1
        self.dict[nextPage] = classInfo
        request = Request(nextPage, callback=self.parse)
        yield request
        print('--------end--------------')

    '''
    def insertMongo(self, className, pid):
        classid = collection.insert({'classname': className, 'pid': pid})
        return classid

    def pushRedis(self, classid, pid, classUrl):
        titlename = '%s,%s,%s,' % (classid, pid, classUrl)
        r.lpush('titlenameurl', titlename)
    '''
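One fragile spot above is nextPage = headurl + nextPage, which only yields a valid URL when the pagination hrefs are relative paths that can be appended to the sub-category URL. urllib.parse.urljoin is a safer way to combine them; below is a small sketch, and the example URL and href are made up for illustration only.

#-----------building the next-page URL with urljoin (sketch)------------
from urllib.parse import urljoin

head_url = "https://www.ys137.com/lvyou/aomendaoyou/"   # hypothetical sub-category URL
href = "index_2.html"                                    # hypothetical pagination href
print(urljoin(head_url, href))
# -> https://www.ys137.com/lvyou/aomendaoyou/index_2.html
# urljoin also handles absolute hrefs, which plain string concatenation would mangle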

  

(3)
Take each article title link out of the database, crawl the body text behind that title, and use the stored id to update the corresponding record in the title collection. This is done in Health3.py.

#--------------Health3.py---------------------
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from time import sleep
from lxml import etree
import pymongo
from bson.objectid import ObjectId
import re

client = pymongo.MongoClient(host="127.0.0.1")
db = client.publicHealth          # database: publicHealth
collection = db.healthTitle       # collection holding the title records to update

import redis
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

ii = 0


class healthClassSpider(scrapy.Spider):
    name = "health3"
    allowed_domains = ["ys137.com"]   # domains the spider is allowed to visit

    def __init__(self):
        # read the title URLs that Health2.py pushed into the Redis list 'titlenameurl'
        start_urls = []
        urlList = r.lrange('titlenameurl', 0, 1)
        ii = 0
        self.dict = {}
        for item in urlList:
            itemStr = str(item, encoding="utf-8")
            arr = itemStr.split(',')          # "classid,pid,url,"
            classid = arr[0]
            pid = arr[1]
            url = arr[2]
            print(arr[2])
            start_urls.append(url)
            self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
            ii += 1
            if ii > 1:
                break
        print(start_urls)
        self.start_urls = start_urls

    def parse(self, response):
        classInfo = self.dict[response.url]
        objectid = classInfo['classid']
        objectid2 = ObjectId(objectid)
        pid = classInfo['pid']
        num = classInfo['num']
        ii = ""
        # concatenate the paragraph texts of the article body into one string
        hxs = HtmlXPathSelector(response)
        hxsObj = hxs.select('//div[@class="article-content"]/table/tr/td/p')
        for secItem in hxsObj:
            healthTitleContent = secItem.select('text()').extract()
            if healthTitleContent == []:
                pass
            else:
                ii = ii + healthTitleContent[0]
        print(ii)
        # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitleContent': ii}})
        # sleep(0.3)
        print('------------------------------------------------------')
        '''
        html = response.body.decode('gbk')
        selector = etree.HTML(html)
        Name = selector.xpath('//div[@class="article-content"]/table/tr/td/h2/text()')
        arr = []
        for i in range(len(Name)):
            print(Name[i])
            arr[i] = Name[i] + '/n'
            # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitlechapter': Name}})
        classname = selector.xpath('//div[@class="article-content"]/table/tr/td/p/text()')
        for a in range(len(classname)):
            print(classname[a])
            # db.healthTitle.update({"_id": objectid2}, {"$set": {'healthTitleContent': classname}})
        '''
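The write-back that step (3) describes is the commented-out db.healthTitle.update(...) call above. Below is a minimal sketch of that step, assuming the healthTitle collection and the classid read from Redis; update_one is the current pymongo spelling of the same operation, and save_content is a helper name introduced here for illustration.

#-----------updating the title record with the crawled body (sketch)------------
from bson.objectid import ObjectId
import pymongo

collection = pymongo.MongoClient(host="127.0.0.1").publicHealth.healthTitle

def save_content(classid, content):
    # set healthTitleContent on the document whose _id was stored in Redis
    collection.update_one({"_id": ObjectId(classid)},
                          {"$set": {"healthTitleContent": content}})

Inside the spider this would be called as save_content(objectid, ii) once the paragraph texts have been concatenated at the end of parse().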

  

Original post: https://www.cnblogs.com/yongxinboy/p/8052725.html