scrapy

# -*- coding: utf-8 -*-
__author__ = 'Administrator'
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'poxiao'
    start_urls = ['https://www.poxiao.com/type/movie/']

    def parse(self, response):  # parse() is the fixed callback name Scrapy expects
        quotes = response.xpath('//li/h3')  # one <h3> node per movie entry
        for quote in quotes:
            yield {
                'name': quote.xpath('./a/text()').extract_first(),
                'link': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first(),
            }
        # find the "next page" link once per page, outside the item loop
        next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)

Using Scrapy to scrape the link addresses from a page
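The same extraction can also be written with CSS selectors, which some find easier to read. A minimal equivalent sketch (the spider name here is hypothetical; response.urljoin builds the absolute link instead of hard-coding the domain):

import scrapy


class QuoteCssSpider(scrapy.Spider):
    name = 'poxiao_css'  # hypothetical name for this variant
    start_urls = ['https://www.poxiao.com/type/movie/']

    def parse(self, response):
        # same extraction as above, written with CSS selectors
        for quote in response.css('li h3'):
            yield {
                'name': quote.css('a::text').get(),  # .get() is the newer alias of .extract_first()
                'link': response.urljoin(quote.css('a::attr(href)').get()),
            }
        next_page = response.css('div.list-pager a:nth-last-child(2)::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)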

scrapy runspider ***.py                   # run this spider

scrapy runspider ***.py -o aa.json        # save the output as a JSON file

scrapy runspider ***.py -o aa.csv -t csv  # save as CSV (can be opened in Excel)
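Each yielded dict becomes one record in the feed, so aa.json ends up as a JSON array. Note that -o appends to an existing file (newer Scrapy versions add -O to overwrite). A sketch of the shape, with hypothetical values:

[
    {"name": "Some Movie Title", "link": "https://www.poxiao.com/some-movie.html"},
    {"name": "Another Movie Title", "link": "https://www.poxiao.com/another-movie.html"}
]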

# -*- coding: utf-8 -*-
import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['poxiao.com']
    start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                  'https://www.poxiao.com/type/movie/index_3.html'
                  ]

    def parse(self, response):
        # e.g. '.../index_2.html' -> 'index_2'
        filename = response.url.split('/')[-1].split('.')[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)  # dump the raw HTML bytes

Crawling and saving the raw HTML source files
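Hard-coding each page in start_urls gets tedious; the list can be generated instead. A minimal sketch, assuming the listing pages keep following the index_N.html pattern (the 2..10 range is an assumption):

import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['poxiao.com']
    # generate pages 2..10 instead of typing each URL by hand
    start_urls = ['https://www.poxiao.com/type/movie/index_%d.html' % n
                  for n in range(2, 11)]

    def parse(self, response):
        filename = response.url.split('/')[-1].split('.')[-2]  # e.g. 'index_2'
        with open(filename, 'wb') as f:
            f.write(response.body)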

# -*- coding: utf-8 -*-
import scrapy
from meiju.items import MeijuItem

class Mj100Spider(scrapy.Spider):
    name = 'mj100'
    allowed_domains = ['meijutt.com']
    start_urls = ['https://www.meijutt.com/new100.html']

    def parse(self, response):
        movies = response.xpath('//h5/a')
        for each_movie in movies:
            item = MeijuItem()
            item['name'] = each_movie.xpath('./text()').extract_first()
            yield item

# pipelines.py
class MeijuPipeline(object):
    def process_item(self, item, spider):
        with open('my_meiju.txt', 'a') as fp:
            fp.write(item['name'] + '\n')  # one title per line
        return item

# items.py
class MeijuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()

Example: crawling the Meiju Top-100 list. Note: you also need to enable the pipeline in settings.py by uncommenting the ITEM_PIPELINES entry — the one with the 300 priority.
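For reference, the setting in question looks like this in the project's settings.py; the number is the pipeline's priority, and lower values run earlier. Since this example imports from meiju.items, it lives inside a project and is run with scrapy crawl mj100 rather than runspider:

# settings.py -- enable the pipeline so process_item actually runs
ITEM_PIPELINES = {
    'meiju.pipelines.MeijuPipeline': 300,
}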

# -*- coding: utf-8 -*-
import scrapy
from poxiao.items import PoxiaoItem


class NameSpider(scrapy.Spider):
    name = 'name'
    allowed_domains = ['poxiao.com']
    start_urls = ['https://www.poxiao.com/type/movie/']

    def parse(self, response):
        movie = response.xpath('//div[@class="gkpic"]//img')
        for i in movie:
            item = PoxiaoItem()
            item['src'] = i.xpath('./@src').extract_first()   # image URL
            item['name'] = i.xpath('./@alt').extract_first()  # movie title
            yield item
        # paginate once per page, outside the item loop
        # (response.follow would also resolve a relative URL by itself)
        next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
        if next_page:
            yield response.follow('https://www.poxiao.com' + next_page, self.parse)

My first little spider
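The imported PoxiaoItem is not shown in the post; judging from the two fields used above, it would look something like this in items.py:

# items.py (not shown in the original post; inferred from the fields used above)
import scrapy


class PoxiaoItem(scrapy.Item):
    src = scrapy.Field()   # image URL
    name = scrapy.Field()  # movie title, used as the file name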

# pipelines.py
import os
import requests


class PoxiaoPipeline(object):
    def process_item(self, item, spider):
        # the target directory must already exist
        filename = os.path.join(r'd:\untitled1\poxiao', item['name'] + '.jpg')
        with open(filename, 'wb') as f:
            f.write(requests.get(item['src']).content)  # download the image bytes
        return item
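One caveat: requests.get runs synchronously inside the pipeline, blocking Scrapy's async engine for every download. Scrapy's built-in ImagesPipeline downloads through the engine itself; a minimal sketch of that alternative (the store path is an assumption, and Pillow must be installed):

# settings.py -- a sketch using Scrapy's built-in ImagesPipeline instead
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = r'd:\untitled1\poxiao'  # assumed download directory

# the spider then yields items with an 'image_urls' list, e.g.
#     yield {'image_urls': [i.xpath('./@src').extract_first()]}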
Original post: https://www.cnblogs.com/xupanfeng/p/11765545.html