Scraping NetEase News

Based on the requests module

# the news data is loaded dynamically from a JS (JSONP) file
import requests
import re
from lxml import etree
import json

url = 'https://temp.163.com/special/00804KVA/cm_war.js?callback=data_callback'
js_data = requests.get(url=url).text
ex = r'data_callback\((.*)\)'  # escape the parentheses; capture the JSON array inside the callback
list_str = re.findall(ex,js_data,re.S)[0]
list_obj = json.loads(list_str)

for dic in list_obj:
    title = dic['title']
    detail_url = dic['docurl']
    page_text = requests.get(url=detail_url).text
    tree = etree.HTML(page_text)
    content = tree.xpath('//*[@id="endText"]//text()')
    content = ''.join(content).replace(' ', '').replace('\n', '')
    print(content)
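
The script above only prints each article. A minimal persistence sketch that could replace the print(content) line inside the same loop is shown below; the news_txt output directory and the file-name sanitizing are assumptions, not part of the original post (import os would go with the other imports at the top of the script).

import os  # goes with the other imports at the top of the script

os.makedirs('news_txt', exist_ok=True)                      # assumed output directory
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)            # drop characters Windows forbids in file names
with open(os.path.join('news_txt', safe_title + '.txt'), 'w', encoding='utf-8') as fp:
    fp.write(content)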

Below is how Scrapy and Selenium are used together.

# spider.py
# -*- coding: utf-8 -*-
import scrapy
from Net163.items import Net163Item
from selenium import webdriver
from selenium.webdriver import ChromeOptions

class NetPageSpider(scrapy.Spider):
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # create a browser object (using the ChromeOptions defined above)
    bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)
    name = 'net_page'
    model_urls = []  # holds the URLs of the 4 news section pages
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com']

    # article detail page
    def content_parse(self,response):
        item = response.meta['item']
        # parse the data and store it in the item
        content_lst = response.xpath('//div[@id="endText"]//text()').extract()
        # extract() returns a list of strings
        item['desc'] = ''.join(content_lst).replace(' ', '').replace('\n', '').replace('\t', '')  # join into one string
        yield item

    # section (board) page
    def detail_parse(self, response):
        div_lst = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_lst:
            item = Net163Item()
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()

            item['title'] = title
            # meta is a dict; every key-value pair in it is passed on to the specified callback
            yield scrapy.Request(url=new_detail_url, callback=self.content_parse,meta={'item':item})

    # parse the response of start_urls
    def parse(self, response):
        li_lst = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3,4,6,7]
        model_lst = []  # the selected sections
        for index in indexs:
            li = li_lst[index]
            model_lst.append(li)
        # parse each section's URL
        for li in model_lst:
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)

            # request each section URL to get the section page content
            yield scrapy.Request(url=model_url, callback=self.detail_parse)

    # close the browser when the spider finishes
    def closed(self, reason):
        self.bro.quit()
# items.py
import scrapy
class Net163Item(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
# pipelines.py
# The pipeline handles persistence (txt, MySQL, Redis, MongoDB, ...); multiple pipeline
# classes can be written and registered in settings.py (see the sketch after this class)
class Net163Pipeline(object):
    def process_item(self, item, spider):
        print(item['title'],len(item['desc']))
        return item
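
As noted above, extra pipeline classes can be added for other storage targets. A minimal sketch of a txt-file pipeline follows; the class name TxtPipeline and the file name net163.txt are assumptions, and it only takes effect once it is registered in ITEM_PIPELINES in settings.py (for example 'Net163.pipelines.TxtPipeline': 301).

# hypothetical second pipeline that appends every item to a local txt file
class TxtPipeline(object):
    fp = None

    def open_spider(self, spider):
        # open the file once when the spider starts
        self.fp = open('net163.txt', 'w', encoding='utf-8')  # assumed file name

    def process_item(self, item, spider):
        self.fp.write(item['title'] + '\n' + item['desc'] + '\n\n')
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.fp.close()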
# changes in settings.py
USER_AGENT = ''               # fill in a real User-Agent to get past UA checks
ROBOTSTXT_OBEY = False        # ignore the robots.txt protocol
DOWNLOADER_MIDDLEWARES = {    # downloader middleware
   'Net163.middlewares.Net163DownloaderMiddleware': 543,
}
ITEM_PIPELINES = {            # pipeline classes
   'Net163.pipelines.Net163Pipeline': 300,
}
LOG_LEVEL = 'ERROR'           # log level
# middlewares.py
# -*- coding: utf-8 -*-
from time import sleep

from scrapy import signals
from scrapy.http import HtmlResponse


class Net163DownloaderMiddleware(object):
    # class method
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # handle requests
    def process_request(self, request, spider):
        return None

    # This method intercepts every response object (only certain responses need handling here).
    def process_response(self, request, response, spider):
        # Find the specific responses that need handling:
        # a specific response can be located through its request,
        # and the specific request can be located by its URL.
        model_urls = spider.model_urls
        bro = spider.bro
        if request.url in model_urls:
            # The URL identifies the request, and the request identifies the response,
            # but that original response does not contain the data we need.
            # So we manually build four new response objects that do meet the requirement
            # (the qualifying response data has to be stored in the new response objects)
            # and use them to replace the original responses.
            bro.get(request.url)  # use the browser to request the section URL
            sleep(2)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            bro.execute_script(js)
            sleep(2)
            # the page source now contains the dynamically loaded data
            page_text = bro.page_source
            # manually create a new response object and wrap page_text into it as the response data
            # (the body parameter is the response data)
            return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        # the original response object
        return response

    # handle exceptions
    def process_exception(self, request, exception, spider):
        pass

    # called when the spider is opened
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
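
Two closing notes, neither of which is part of the original post: the crawler is started with scrapy crawl net_page from the project directory, and if the visible Chrome window driven by chromedriver is not wanted, headless mode can be enabled on the same ChromeOptions object. A sketch using Selenium's standard ChromeOptions arguments:

# spider.py (optional tweak): run Chrome without opening a visible window
from selenium import webdriver
from selenium.webdriver import ChromeOptions

option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--headless')        # render pages without a browser window
option.add_argument('--disable-gpu')     # commonly paired with --headless on Windows
bro = webdriver.Chrome(executable_path=r'C:\spider\scrapy1\chromedriver.exe', options=option)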
Original post: https://www.cnblogs.com/zhangchen-sx/p/10834494.html