20. Scrapy Daily Practice

1. Create a Scrapy project:

scrapy startproject tutorial
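This generates a project skeleton roughly like the following (exact files vary slightly between Scrapy versions):

tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py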

2. Create a spider

cd tutorial

scrapy genspider quotes quotes.toscrape.com

genspider writes a skeleton spider to tutorial/spiders/quotes.py, roughly:
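# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        pass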

3. Edit the project files:

quotes.py

___________________________________________________________________________

# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import TutorialItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = TutorialItem()
            # quote text
            item['text'] = quote.css('.text::text').extract_first()
            # author
            item['author'] = quote.css('.author::text').extract_first()
            # all tags of this quote, joined so MySQL can store a single string
            item['tags'] = ','.join(quote.css('.tags .tag::text').extract())
            yield item

        # follow the next page; the last page has no "next" link, so guard against None
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

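To sanity-check the CSS selectors before running a full crawl, scrapy shell is handy:

scrapy shell 'http://quotes.toscrape.com/'
>>> quote = response.css('.quote')[0]          # first quote block on the page
>>> quote.css('.text::text').extract_first()   # the quote text
>>> quote.css('.tags .tag::text').extract()    # its tags, as a list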

items.py
________________________________________________________________________

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
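A scrapy.Item behaves like a dict restricted to the declared fields, so a misspelled field name raises KeyError instead of silently creating a new key:

>>> from tutorial.items import TutorialItem
>>> item = TutorialItem()
>>> item['author'] = 'Albert Einstein'
>>> dict(item)
{'author': 'Albert Einstein'}
>>> item['auther'] = 'x'   # typo in field name -> KeyError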
pipelines.py



_________________________________________________________________________
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem

import pymysql


class TutorialPipeline(object):
    # An earlier version truncated long quotes instead of writing to MySQL;
    # kept for reference (note that DropItem must be raised, not returned):
    # def __init__(self):
    #     self.limit = 50
    # def process_item(self, item, spider):
    #     if item['text']:
    #         if len(item['text']) > self.limit:
    #             item['text'] = item['text'][0:self.limit].rstrip() + '...'
    #         return item
    #     else:
    #         raise DropItem('Missing Text')

    def open_spider(self, spider):
        # open one MySQL connection for the whole crawl
        self.my_conn = pymysql.connect(
            host='192.168.113.129',
            port=3306,
            database='datas',
            user='root',
            password='123456',
            charset='utf8'
        )
        self.my_cursor = self.my_conn.cursor()

    def process_item(self, item, spider):
        insert_sql = "insert into quotes(author,tags,text) values(%s,%s,%s)"
        self.my_cursor.execute(insert_sql, [item['author'], item['tags'], item['text']])
        self.my_conn.commit()  # commit per item so a crash doesn't lose earlier rows
        return item

    def close_spider(self, spider):
        self.my_cursor.close()
        self.my_conn.close()
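The pipeline assumes a quotes table already exists in the datas database. A one-time setup sketch with pymysql, reusing the same connection parameters (the column types are assumptions; quote text can be long, hence TEXT):

import pymysql

conn = pymysql.connect(host='192.168.113.129', port=3306, database='datas',
                       user='root', password='123456', charset='utf8')
with conn.cursor() as cursor:
    # sizes are guesses; adjust to your data
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS quotes (
            id     INT AUTO_INCREMENT PRIMARY KEY,
            author VARCHAR(255),
            tags   VARCHAR(255),
            text   TEXT
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()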


settings.py
___________________________________________________________________________
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
   'tutorial.pipelines.TutorialPipeline': 200,
}
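The value 200 is the pipeline's order: Scrapy runs enabled pipelines in ascending order of these numbers, conventionally chosen in the 0-1000 range. With a single pipeline any value works.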
With the code and configuration in place, run the crawl:
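scrapy crawl quotes

Each scraped item now passes through TutorialPipeline and ends up as a row in MySQL.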

To also save the results to a file, use Scrapy's feed exports:
scrapy crawl quotes -o quotes.xml
scrapy crawl quotes -o quotes.csv
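The format is inferred from the file extension, so JSON works the same way:

scrapy crawl quotes -o quotes.json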


Original article: https://www.cnblogs.com/lvjing/p/9584810.html