A Python crawler example: scraping the Java-category posts from the cnblogs.com homepage

After two days of learning, Python already feels quite simple, so I wanted to build a crawler. I like collecting blog posts, so I set out to scrape the Java posts from the cnblogs homepage. It currently crawls 200 listing pages, 20 posts per page, 4000 posts in all, and the result is pretty good. Here are the requirements:

(1) Crawl the listing pages.

(2) Save each post's content, including images, text, and so on.

(3) Saved pages must open locally with no mojibake and with correct css formatting.

(4) Support resuming an interrupted crawl.

(5) Besides the Java category, it should extend to other language categories (see the sketch after the code listing).

With all of this in place, I can read piles of posts every day: read one, delete one.

Directory structure:

(screenshot: the project's directory layout)

If a page keeps failing to download, add its url to success.txt by hand; that counts as giving up on it!!!

css folder: the css files shown above were downloaded from the head tag of a cnblogs post:

(screenshot: the css files taken from a post's head tag)

com_img folder: background images and other images needed by the css files

img folder: holds the images referenced by the downloaded html files, so pages render fully offline

success.txt: records the url of every page already downloaded, which is what makes resuming possible
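In miniature, the resume check works like this (just a sketch; the real getURLTXT / setURLTXT / comparisonURL functions appear in the full listing below):

def already_downloaded(url):
    # success.txt holds one url per line; skip anything already recorded
    with open("success.txt", "r", encoding="utf-8") as f:
        return url + "\n" in f.readlines()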

Download path for the saved pages:

The code is below, fully commented and quite simple:

from bs4 import BeautifulSoup
import urllib.request
import re
import uuid
import socket
socket.setdefaulttimeout(10.0)  # global socket timeout so image downloads cannot hang forever


# progress callback for urllib.request.urlretrieve
def Schedule(a, b, c):
    # a: number of data blocks downloaded so far
    # b: size of each block in bytes
    # c: total size of the remote file in bytes
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)

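# For reference, a reporthook like Schedule is passed to urlretrieve as its
# third argument, e.g. (a sketch; the url and save path are placeholders):
#     urllib.request.urlretrieve(url, 'html/img/demo.jpg', Schedule)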

# fetch the raw source of the target page
def getHtml(url):
    page = urllib.request.urlopen(url)  # open the page
    html = page.read()  # read the page source as bytes
    return html
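# Assumption, not from the original: urllib identifies itself with a
# "Python-urllib" User-Agent, which some sites reject. If cnblogs ever
# answers with 403 here, a browser-like header can be sent instead:
#     req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#     html = urllib.request.urlopen(req).read()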


# collect the post urls from one listing page of the category
def getBlogsURL(html):
    # sample anchor this regex matches:
    # <a class="titlelnk" href="https://www.cnblogs.com/coderJiebao/p/Netty07.html" target="_blank">Netty入门(七)使用SSL/TLS加密Netty程序</a>
    linkPattern = re.compile(r'<a class=.+?href="(.+?)" target')
    html = html.decode('utf-8')  # decode the page as utf-8
    bloglist = re.findall(linkPattern, html)  # every post url on this page
    return bloglist
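# Alternative sketch (not in the original): BeautifulSoup is already imported,
# so the same links can be collected without a regex, assuming the post
# anchors carry class "titlelnk" as in the sample anchor above:
#     soup = BeautifulSoup(html.decode('utf-8'), 'html.parser')
#     bloglist = [a.get('href') for a in soup.find_all('a', class_='titlelnk')]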


# extract the title and body from a post page, localize its images,
# and write the result to html/<title>.html
def getBody(html):
    soup = BeautifulSoup(html, 'html.parser')  # parse the page

    # title: cnblogs themes differ, so try several known containers
    divTitle = soup.find_all('div', class_="postTitle")
    if (len(divTitle) == 0):
        divTitle = soup.find_all('h1', class_="postTitle")
    if (len(divTitle) == 0):
        divTitle = soup.find_all('h2')
    if (len(divTitle) == 0):
        divTitle = soup.find_all('div', class_="posttitle")
    if (len(divTitle) == 0):
        divTitle = soup.find_all('h1', class_="block_title")
    if (len(divTitle) == 0):
        return
    # body: also theme-dependent
    divBody = soup.find_all('div', class_="postBody")
    if (len(divBody) == 0):
        divBody = soup.find_all('div', class_="blogpost-body")
    if (len(divBody) == 0):
        return

    divBody_set = str(divBody[0])
    # download every image in the body and rewrite its src to the local copy
    imageEntityArray = divBody[0].find_all('img')
    for image in imageEntityArray:
        link = image.get('src')
        count2 = str(uuid.uuid1())  # unique local file name
        try:
            print("image: " + str(link))
            # skip hosts that are known to hang or fail
            if not ("i.imgur.com" in link or "http://www.ityouknow.com/assets/images/2017/jvm/jvm05.png" in link):
                # urllib.request.urlretrieve(link, 'html/img/%s.jpg' % count2, Schedule)
                urllib.request.urlretrieve(link, 'html/img/%s.jpg' % count2)
        except Exception:
            print("error: " + str(link))
        divBody_set = divBody_set.replace(str(link), 'img/%s.jpg' % count2)

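    # Sketch (assumption, not in the original): the loop above forces every
    # image to a .jpg name; to keep the real extension instead:
    #     ext = os.path.splitext(link)[1] or '.jpg'   # requires "import os"
    #     urllib.request.urlretrieve(link, 'html/img/%s%s' % (count2, ext))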

    # print("标题:"+str(divTitle))
    # print("内容:"+str(divBody_set))

    img = re.compile(r'>(.+?)<')
    html = html.decode('utf-8')  ##编码方式为utf-8
    imglist = re.findall(img, html)  ##解析页面源码获取图片列表
    x=imglist[0]
   x
=x.replace("/", " ") x=x.replace(":", " ") x=x.replace("*", " ") x=x.replace("|", " ") x=x.replace("~", " ") x=x.replace("", " ") x=x.replace("", " ") x=x.replace("?", " ") x=x.replace("\", " ") x=x.replace("$", " ") x=x.replace(""", " ") x=x.replace("<", " ") x=x.replace(">", " ") file = open("html/%s.html" % x, "w",encoding='utf-8') file.write('<html lang="zh-cn"><head><meta charset = "utf-8" > <link type = "text/css" rel = "stylesheet" href = "css/bundle-coffee.css"><link type = "text/css" rel = "stylesheet" href = "css/bundle-CodingLife.css"><link type = "text/css" rel = "stylesheet" href = "css/blog-common.css"></head>') file.write('<body style=" 68%;margin:0 auto;"><div id="topics">') file.write(str(divTitle[0])) file.write(str(divBody_set)) file.write('</div></body></html>') file.close() # img = re.compile(r'<div class="postBody">(.+?)') # html = html.decode('utf-8') ##编码方式为utf-8 # imglist = re.findall(img, html) ##解析页面源码获取图片列表 # count = 1 # for body in imglist: # print(str(count) + ":" + body) # # urllib.request.urlretrieve(url, './helloworld1/%s.jpg' % count, Schedule) # count = count + 1 def getURLTXT(): listURl=[] f=open("success.txt", "r",encoding='utf-8') lines=f.readlines() for line in lines: listURl.append(str(line)) f.close() return listURl def setURLTXT(url): f = open("success.txt", "a", encoding='utf-8') f.write(url) f.close() def comparisonURL(url,listURl): if str(url)+" " in listURl: return True def main(url): html = getHtml(url) imglist= getBlogsURL(html) return imglist if __name__ == '__main__': imglist=[]
  #自己修改要下载多少页, 有的目录下载的页面没有200页, 要注意!!!
for i in range(200): url="https://www.cnblogs.com/cate/java/"+str(i) print("读取第"+str(i) + "页:" + url) imglist=imglist+main(url) count = 1 for url in imglist: print(str(count) + ":" + url) # urllib.request.urlretrieve(url, './helloworld1/%s.jpg' % count, Schedule) listURl=getURLTXT() if not comparisonURL(url, listURl): try: html = getHtml(url) except Exception: print("错误:" + url) getBody(html) setURLTXT(url+" ") count = count + 1
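To point the crawler at another cnblogs category (requirement 5), only the listing url needs to change, e.g. (a sketch, assuming other categories follow the same /cate/<name>/ url pattern as java, which I have not verified for all of them):

category = "python"  # "java", "python", "cpp", ...
url = "https://www.cnblogs.com/cate/%s/%s" % (category, i)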
Original post: https://www.cnblogs.com/yysbolg/p/9040545.html