A Weibo-Style Crawler

1. Create the crawler project

scrapy startproject demo1
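
The command generates the usual Scrapy project skeleton. Roughly (depending on the Scrapy version, middlewares.py may have to be created by hand), the layout looks like this:

demo1/
    scrapy.cfg
    demo1/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py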

2. Analyze and define the data to collect: create the database (chapter18.sql) and declare the scraped fields (items.py)

chapter18.sql

create database chapter18;
use chapter18;

create table info(
    id int(10) not null auto_increment primary key,
    name varchar(30),
    url varchar(100),
    hits int(15),
    comment int(15)
);

items.py

# -*- coding: utf-8 -*-
import scrapy
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

class Demo1Item(scrapy.Item):
    # Fields we want to collect
    name = scrapy.Field()
    url = scrapy.Field()
    hits = scrapy.Field()
    comment = scrapy.Field()
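
As a quick illustration (a hypothetical standalone snippet, run from the project root), a Demo1Item behaves like a dictionary, which is exactly how the spider below fills it in:

# Hypothetical quick check of the item definition (run from the project root)
from demo1.items import Demo1Item

item = Demo1Item()
item["name"] = ["Some post title"]       # each field will hold a list of values
item["url"] = ["http://example.com/1"]
print(dict(item))                        # {'name': [...], 'url': [...]}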

3. Add the downloader-middleware wrapper classes (middlewares.py) and point DOWNLOADER_MIDDLEWARES at them in settings.py

# -*- coding: utf-8 -*-
# Random selection from the pools
import random
# Built-in proxy middleware (base class for the IP pool)
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# Built-in user-agent middleware (base class for the UA pool)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# IP pool middleware
class HTTPPROXY(HttpProxyMiddleware):
    # Note: the default must be ip=''
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        item = random.choice(IPPOOL)
        try:
            print("Current proxy IP: " + item["ipaddr"])
            request.meta["proxy"] = "http://" + item["ipaddr"]
        except Exception as e:
            print(e)
            pass


# Proxy IP pool (sample addresses; replace with working proxies)
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "122.94.128.49:8118"}
]


# User-agent pool middleware
class USERAGENT(UserAgentMiddleware):
    # Note: the default must be user_agent=''
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        item = random.choice(UPPOOL)
        try:
            print("Current User-Agent: " + item)
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            print(e)
            pass


# User-agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]
# Add the following to settings.py (this covers disabling cookies, the middleware
# registration and the pipeline registration)
'''
#==============================================

# Disable cookies
COOKIES_ENABLED = False

# Downloader middleware registration
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # '<project name>.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    '<project name>.middlewares.USERAGENT': 1
}

# Item pipeline registration
ITEM_PIPELINES = {
    '<project name>.pipelines.<pipeline class name>': 300,
}

# Remember to turn off robots.txt compliance (settings.py enables it by default,
# so find the line and set it to False)
ROBOTSTXT_OBEY = False

#==============================================
'''

For example, add the following to settings.py:

#==============================================

# Disable cookies
COOKIES_ENABLED = False

# Downloader middleware registration
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # 'demo1.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'demo1.middlewares.USERAGENT': 1
}

# Turn off robots.txt compliance (the line already exists in settings.py; set it to False)
ROBOTSTXT_OBEY = False
#==============================================
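
If the IP-pool middleware is wanted as well, the commented-out entries can simply be switched on. A sketch of the combined setting (the priorities mirror the commented example, and the proxies in IPPOOL must of course be live ones):

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'demo1.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'demo1.middlewares.USERAGENT': 1,
}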

4. Write the item pipeline (pipelines.py) and register it in ITEM_PIPELINES

# -*- coding: utf-8 -*-
import pymysql as pm
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class Demo1Pipeline(object):
    # Set up the database connection
    def __init__(self):
        # Create the connection
        self.db = pm.connect(host='localhost', user='root', password='123456', database='chapter18', charset='utf8')
        # Create the cursor
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        # Process and store the scraped data (the item fields are parallel lists)
        for j in range(0, len(item["name"])):
            name = item["name"][j]
            url = item["url"][j]
            hits = item["hits"][j]
            comment = item["comment"][j]
            sql = "insert into info(name,url,hits,comment) values(%s,%s,%s,%s)"
            try:
                self.cur.execute(sql, (name, url, hits, comment))
                self.db.commit()
                print("OK")
            except Exception:
                self.db.rollback()
                print("Error inserting data")
        return item

    # Close the cursor and the database connection
    def close_spider(self, spider):
        self.cur.close()
        self.db.close()

Then add the following to settings.py:

# Item pipeline registration
ITEM_PIPELINES = {
    'demo1.pipelines.Demo1Pipeline': 300,
}

5. Create the spider file (test.py), write the spider, and add the browser-emulation helper class (HeadersHelper.py)

# scrapy genspider -t basic test <domain of the site to crawl>
scrapy genspider -t basic test hexun.com
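
This creates demo1/spiders/test.py from the basic template. Depending on the Scrapy version, the generated skeleton looks roughly like the following, which the next listing then fills in:

# -*- coding: utf-8 -*-
import scrapy

class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = ["hexun.com"]
    start_urls = (
        'http://www.hexun.com/',
    )

    def parse(self, response):
        pass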

test.py source:

# -*- coding: utf-8 -*-
import scrapy
from ..items import Demo1Item
from .HeadersHelper import HeadersHelper
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = ["hexun.com"]
    start_urls = (
        # 'http://www.hexun.com/',
        'http://yinglunjinye.blog.hexun.com/',
    )

    def parse(self, response):
        item = Demo1Item()
        # pattern_comment = r"comment\d*?','(\d*?)'"
        # response.xpath('//div[@class="news"]/h1/a/@href').extract()
        item["name"] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
        item["url"] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
        # Locate the click-counter script URL embedded in the blog page
        url = HeadersHelper('http://yinglunjinye.blog.hexun.com', pattern='<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>').handle_info()[0]
        item["hits"] = HeadersHelper(url, pattern=r"click\d*?','(\d*?)'").handle_info()
        item["comment"] = HeadersHelper(url, pattern=r"comment\d*?','(\d*?)'").handle_info()
        # The pagination links on the blog give the number of list pages
        # (the pattern's domain must match the blog being crawled)
        data = HeadersHelper("http://yinglunjinye.blog.hexun.com/", pattern=r"http://yinglunjinye.blog.hexun.com/p(\d*?)/").handle_info()
        if len(data) >= 2:
            # data[-2] is a str
            page_count = int(data[-2])
        else:
            page_count = 1
        yield item
        for i in range(2, page_count + 1):
            url = "http://yinglunjinye.blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(url, callback=self.parse)
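
The hit and comment counts come from small regular expressions run over the click-counter response. A quick standalone check of these patterns against a made-up string in the format the spider assumes that response uses:

import re

# Hypothetical sample mimicking the counter format the spider's patterns expect
sample = "click123456','789'; comment123456','12'"
print(re.findall(r"click\d*?','(\d*?)'", sample))    # ['789']
print(re.findall(r"comment\d*?','(\d*?)'", sample))  # ['12']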

The browser-emulation helper class, HeadersHelper.py:

import urllib.request
import http.cookiejar
import re

class HeadersHelper:
    def __init__(self, url, path=None, pattern=None):
        self.url = url  # urllib.request.quote(url, safe='/:?=', encoding='utf-8')
        self.path = path
        self.pattern = pattern

    # Set request headers that closely imitate a real browser
    def set_Headers(self):
        # Note "Accept-Encoding": "gb2312, utf-8": avoids a compressed response that
        # would otherwise decode as garbage (urllib fills in the Host header itself)
        headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Accept-Encoding": "gb2312, utf-8", "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                   "Connection": "keep-alive"
                   }
        cjar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        headall = []
        for key, value in headers.items():
            item = (key, value)
            headall.append(item)
        opener.addheaders = headall
        urllib.request.install_opener(opener)

    # Fetch the page and return its content as a string
    def feedbak_info(self):
        self.set_Headers()
        # Sometimes the page is utf-8, sometimes gbk
        # (http://yinglunjinye.blog.hexun.com needs gbk)
        '''
        try:
            info = urllib.request.urlopen(self.url).read().decode('utf-8')
        except:
            info = urllib.request.urlopen(self.url).read().decode('gbk')
        '''
        info = urllib.request.urlopen(self.url).read()
        return str(info)

    # Save the page to a file
    def save_InFile(self):
        self.set_Headers()
        info = urllib.request.urlopen(self.url).read()
        file = open(self.path, 'wb')
        file.write(info)
        file.close()

    # Apply the regular expression to the page content
    def handle_info(self):
        info = self.feedbak_info()
        return re.compile(pattern=self.pattern, flags=re.S).findall(info)
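
For reference, HeadersHelper can also be exercised on its own. A minimal sketch, assuming it is run from the spiders directory and the blog is reachable; the pattern is the same pagination regex the spider uses:

# Standalone use of the helper: fetch the blog front page and pull out
# the page numbers that appear in its pagination links.
from HeadersHelper import HeadersHelper

pages = HeadersHelper("http://yinglunjinye.blog.hexun.com/",
                      pattern=r"http://yinglunjinye.blog.hexun.com/p(\d*?)/").handle_info()
print(pages)
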

6. Test
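
Run the spider from the project directory (ROBOTSTXT_OBEY must be False and the MySQL server must be running):

scrapy crawl test

A quick way to confirm that rows arrived is to query the table directly; a minimal sketch reusing the same connection settings as pipelines.py:

# Verify the crawl results in MySQL (same credentials as the pipeline)
import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='chapter18', charset='utf8')
cur = db.cursor()
cur.execute("select count(*) from info")
print("rows:", cur.fetchone()[0])
cur.execute("select name, url, hits, comment from info limit 5")
for row in cur.fetchall():
    print(row)
cur.close()
db.close()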

Original article (in Chinese): https://www.cnblogs.com/xiaomingzaixian/p/7136063.html