python——爬虫

利用空余时间写了个简单的python爬虫程序——获取博海拾贝的标题和封面图

这里使用到的技术比较简单，可以供一些python入门的新手做参考。

知道需要采集的东西，那接下来的就是撸码干了。

首先应该分析爬取数据需要使用的函数或者程序包，在这里使用的是 urllib2 + lxml.etree。

下面是全部代码：（或许在代码中会出现一些新手们不了解的函数。不要怕麻烦，自己多在网上查找资料，可以很好地了解新知识）

#!  /usr/bin/env python
# coding:utf-8

import urllib2
import lxml.etree
import sys
import os
from MyHelper import MyHelper

class bssb:
    """Collect article titles and cover-image URLs from paginated listing pages.

    The crawl starts at a listing URL, follows the "next-page" link until
    there is none, and finally hands the collected data to ``MyHelper`` to
    be written to disk.
    """

    # Python 2 only: reload(sys) re-exposes sys.setdefaultencoding so that
    # implicit str/unicode conversions use UTF-8 instead of ASCII.
    reload(sys)
    sys.setdefaultencoding('UTF-8')
    # Kept so existing readers of ``bssb.type`` keep working; the name only
    # shadows the builtin inside the class namespace.
    type = sys.getfilesystemencoding()

    def getHtml(self, _url):
        """Fetch ``_url`` and return the raw response body.

        Args:
            _url: Address of the page to download.

        Returns:
            The bytes read from the response, or ``None`` when the request
            fails with ``urllib2.HTTPError`` or ``urllib2.URLError`` (the
            error is printed, not raised).
        """
        _headers = MyHelper().getHeaders()
        request = urllib2.Request(url=_url, headers=_headers)
        try:
            page = urllib2.urlopen(request)
            try:
                return page.read()
            finally:
                # Release the connection even if read() fails.
                page.close()
        except urllib2.HTTPError as e:
            print('HTTPError= %s' % (e.code,))
        except urllib2.URLError as e:
            print('URLError= %s' % (e.reason,))
        return None

    def content(self, html, _xpath):
        """Parse ``html`` and return the nodes matching ``_xpath``.

        Args:
            html: UTF-8 encoded page source as returned by ``getHtml``.
            _xpath: XPath expression evaluated against the parsed document.

        Returns:
            The XPath result, or an empty list when ``html`` is empty/``None``
            or cannot be parsed into a document.
        """
        if not html:
            return []
        # Do NOT lowercase the source: that would corrupt case-sensitive
        # attribute values such as image and pagination URLs.
        tree = lxml.etree.HTML(html.decode('utf-8'))
        if tree is None:
            return []
        return tree.xpath(_xpath)

    def HtmlforPage(self, htmlurl, titles, imgs):
        """Crawl every listing page starting at ``htmlurl`` and save the results.

        Args:
            htmlurl: URL of the first listing page.
            titles: List that receives one title per article (mutated in place).
            imgs: List that receives one image URL per article (mutated in place).

        Articles lacking a header link, link text or a cover image are skipped
        so that ``titles`` and ``imgs`` stay aligned. When the last page has
        been processed (or a page cannot be fetched) the titles are written to
        ``./Bohai/titles.txt`` and the images are downloaded.
        """
        visited = set()
        # Iterate instead of recursing once per page so long listings cannot
        # exhaust the recursion limit; ``visited`` guards against link cycles.
        while htmlurl and htmlurl not in visited:
            visited.add(htmlurl)
            html = self.getHtml(htmlurl)
            if html is None:
                break
            for item in self.content(html, '//article'):
                links = item.findall('./header//a')
                pictures = item.findall('./p[@class="focus"]//img')
                if not links or not pictures:
                    continue
                title = links[0].text
                img = pictures[0].attrib.get('src')
                if title is None or img is None:
                    continue
                titles.append(title)
                imgs.append(img)
            nextpage = self.content(html, '//li[@class="next-page"]/a')
            htmlurl = nextpage[0].attrib.get('href') if nextpage else None
        # Write the titles to a text file.
        MyHelper().save_txt(titles, './Bohai/titles.txt', 'wb+')
        # Download the images to local disk.
        MyHelper().save_file(imgs)

if __name__ == "__main__":
    # Script entry point: crawl the main category listing, accumulating
    # titles and image URLs into two fresh lists.
    start_url = 'https://bohaishibei.com/post/category/main/'
    collected_titles, collected_imgs = [], []
    crawler = bssb()
    crawler.HtmlforPage(start_url, collected_titles, collected_imgs)

View Code

下面这个是简单的自定义帮助类：

#! /usr/bin/env python
# coding:utf-8

import os
import random
import re
import urllib
from io import BytesIO

import requests
from PIL import Image

class MyHelper:
    """Helper for building request headers and saving scraped text/images.

    Args:
        language: Value sent as the ``Accept-Language`` request header.
        control: Value sent as the ``cache-control`` request header.
    """

    def __init__(self, language='zh-CN,en;q=0.9', control='max-age=0'):
        self.language = language
        self.control = control

    def getAgent(self):
        """Return the list of User-Agent strings to choose from."""
        user_agent = [
            'Mozilla/5.0(Windows NT 10.0; WOW64)',
            'Mozilla/5.0 (Windows NT 6.3;WOW64)',
            'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        ]
        return user_agent

    def getHeaders(self):
        """Return a request-header dict with a randomly chosen User-Agent."""
        headers = {
            'Accept-Language': self.language,
            'cache-control': self.control,
            'User-Agent': random.choice(self.getAgent()),
        }
        return headers

    def save_txt(self, contents, txtPath, model):
        """Write ``contents`` to ``txtPath``, one item per line.

        Args:
            contents: A single string, or a list of strings.
            txtPath: Destination file; its parent directory is created when
                missing.
            model: Mode string passed to ``open``.
        """
        _path = self.GetPath(txtPath)
        lines = contents if isinstance(contents, list) else [contents]
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(_path, model) as fo:
            for item in lines:
                fo.write(item + '\n')

    def save_file(self, _path):
        """Download one image URL or a list of them into ``./imgData``.

        Args:
            _path: A single image URL, or a list of image URLs. List items
                are saved as ``bohai<N><suffix>`` with N counting from 1; a
                single URL is saved as ``bohai<suffix>``.

        An ``IOError`` raised while downloading or saving is reported with a
        printed message instead of being propagated.
        """
        file_path = './imgData'
        try:
            self._ensure_dir(file_path)
            if isinstance(_path, list):
                for count, item in enumerate(_path, 1):
                    target = file_path + '/bohai' + str(count) + self._suffix(item)
                    self._save_image(item, target)
            else:
                self._save_image(_path, file_path + '/bohai' + self._suffix(_path))
        except IOError:
            print('Error:没有找到文件或者读取文件失败')

    def GetPath(self, _path):
        """Ensure the parent directory of ``_path`` exists and return ``_path``."""
        # Split the path into (directory, file name) and create the directory.
        self._ensure_dir(os.path.split(_path)[0])
        return _path

    def _ensure_dir(self, directory):
        """Create ``directory`` (including parents) when it does not exist."""
        # An empty directory part means "current directory": nothing to do.
        if not directory or os.path.isdir(directory):
            return
        print('目录不存在,新建 %s' % directory)
        try:
            os.makedirs(directory)
        except OSError:
            # Only tolerate the failure if the directory now exists.
            if not os.path.isdir(directory):
                raise

    def _suffix(self, url):
        """Return the file extension of ``url`` without query-string residue."""
        suffix = os.path.splitext(url)[1]
        for separator in ('?', '&'):
            suffix = suffix.split(separator)[0]
        return suffix

    def _save_image(self, url, target):
        """Download the image at ``url`` and save it to ``target``."""
        response = requests.get(url)
        image = Image.open(BytesIO(response.content))
        image.save(target)

View Code

代码亲测可以运行。。。。

如有问题欢迎留言。。。。。