2019-01-31 Learning Python: Crawling Email Addresses with BFS and DFS

Today I studied the basics of writing a web crawler in Python.

First, two ways to fetch the Baidu homepage: reading it line by line, and reading it all at once.

Reading line by line

import urllib.request

for line in urllib.request.urlopen("http://www.baidu.com"):
    print(line.decode("utf-8"))

Reading all at once

mystr = urllib.request.urlopen("http://www.baidu.com").read()
print(mystr.decode("utf-8"))
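
Since urlopen returns raw bytes, decoding blindly as utf-8 can fail on pages served in another encoding. As a small sketch of my own (not from the original lesson), the response can report its own charset, with utf-8 as a fallback:

import urllib.request

# Sketch: decode with the charset the server declares in Content-Type;
# assume utf-8 only when the header names no charset.
with urllib.request.urlopen("http://www.baidu.com") as resp:
    charset = resp.headers.get_content_charset() or "utf-8"
    print(resp.read().decode(charset))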

I then implemented email crawling both ways: DFS with a stack and BFS with a queue.

BFS with a deque as the queue

import re
import urllib.request
from collections import deque

def getallemail(data):  # extract every email address in the page via regex
    try:
        mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
        mylist = mailregex.findall(data)
        return mylist
    except:
        return []


def getdata(url):   # fetch the page at url and return its source decoded as utf-8
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8")
        return data
    except:
        return ""


def geteveryurl(data):  # collect every URL found in the page
    alllist = []
    mylist1 = getallhttp(data)
    mylist2 = []
    if len(mylist1)>0:
        mylist2 = getabsurl(mylist1[0],data)    # mylist1[0] serves as the base URL for resolving relative links
    alllist.extend(mylist1)
    alllist.extend(mylist2)
    return alllist


def gethostname(httpstr):
    try:
        mailregex = re.compile(r"(http://\S*?)/", re.IGNORECASE)  # precompiled regex that captures scheme + host
        mylist = mailregex.findall(httpstr)
        if len(mylist)==0:
            return None
        else:
            return mylist[0]
    except:
        return None


def getabsurl(url,data):
    try:
        regex = re.compile(r'href="(.*?)"', re.IGNORECASE)  # precompiled regex that captures href targets
        httplist = regex.findall(data)
        newhttplist = httplist.copy()  # shallow copy, so we can iterate one list while removing from the other
        for link in newhttplist:
            if link.find("http://")!=-1:       # already absolute: getallhttp collects these, so drop it here
                httplist.remove(link)
            elif link.find("javascript")!=-1:  # javascript: pseudo-links cannot be crawled
                httplist.remove(link)
        hostname = gethostname(url)
        if hostname!=None:
            for i in range(len(httplist)):
                httplist[i] = hostname + httplist[i]  # prefix the host to make relative paths absolute
        return httplist
    except:
        return []


def getallhttp(data):  # find every absolute http:// URL in the page
    try:
        mailregex = re.compile(r'(http://\S*?)[">)]', re.IGNORECASE)  # a URL ends at a quote, '>' or ')'
        mylist = mailregex.findall(data)
        return mylist
    except:
        return []


def BFS(urlstr):
    visited = set()         # pages already crawled; without this the crawl can revisit pages forever
    urlqueue = deque([])    # FIFO queue of URLs waiting to be crawled
    urlqueue.append(urlstr) # seed the queue with the start URL
    while len(urlqueue)!=0:           # keep going until the queue drains
        url = urlqueue.popleft()      # take the oldest queued URL
        if url in visited:            # already crawled once: skip it
            continue
        visited.add(url)
        print(url)                          # log the URL being crawled
        pagedata = getdata(url)             # fetch the page source
        emaillist = getallemail(pagedata)   # pull out the email addresses
        for email in emaillist:
            print(email)                    # print every address found
        newurllist = geteveryurl(pagedata)  # collect the page's outgoing links
        for urlstr in newurllist:
            if urlstr not in visited:
                urlqueue.append(urlstr)     # enqueue links we have not crawled yet

BFS(input("Enter the start page to crawl: "))
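
One thing this queue version does not control is how far from the start page the crawl wanders. As a sketch of my own (BFS_limited is a hypothetical name, and it assumes the getdata/getallemail/geteveryurl helpers above), each queue entry can carry a depth counter so the crawl stops expanding past a fixed level:

from collections import deque

def BFS_limited(urlstr, maxdepth=2):   # hypothetical variant, not in the original
    visited = set()
    urlqueue = deque([(urlstr, 0)])    # the queue holds (url, depth) pairs
    while urlqueue:
        url, depth = urlqueue.popleft()
        if url in visited or depth > maxdepth:
            continue                   # skip pages seen before or beyond the depth limit
        visited.add(url)
        pagedata = getdata(url)        # reuses the helpers defined above
        for email in getallemail(pagedata):
            print(email)
        for newurl in geteveryurl(pagedata):
            urlqueue.append((newurl, depth + 1))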


DFS with a list as the stack

import re
import urllib.request

# getallemail, getdata, geteveryurl, gethostname, getabsurl and getallhttp
# are identical to the helpers in the BFS version above.


def DFS(urlstr):
    visitlist = []      # URLs already visited, so the depth-first walk cannot loop forever
    urlstack = []       # LIFO stack of URLs waiting to be crawled
    urlstack.append(urlstr)
    while len(urlstack)!=0:
        url = urlstack.pop()
        print(url)      # log the URL being crawled
        if url not in visitlist:
            pagedata = getdata(url)
            emaillist = getallemail(pagedata)
            for email in emaillist:
                print(email)
            newurllist = geteveryurl(pagedata)
            for urlstr in newurllist:
                if urlstr not in urlstack:
                    urlstack.append(urlstr)    # push links not already waiting on the stack
            visitlist.append(url)


DFS(input("Enter the start page to crawl: "))

  • Note: DFS can easily fall into an infinite loop, which is why visitlist records the pages already visited. Data extraction is better served by breadth-first traversal, while depth-first traversal drills straight down one branch, making it better suited to probing how many levels deep a site goes. A recursive sketch follows below.
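
For comparison, the same depth-first walk can be written recursively, which makes the "drill straight down one branch" behaviour explicit. This is only a sketch of my own (DFS_recursive is a hypothetical name; it reuses the helpers above, and Python's default recursion limit of roughly 1000 frames caps how deep it can go):

def DFS_recursive(url, visited=None):
    # Hypothetical recursive variant, not part of the original lesson.
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)
    print(url)
    pagedata = getdata(url)                 # reuses the helpers defined above
    for email in getallemail(pagedata):
        print(email)
    for newurl in geteveryurl(pagedata):
        DFS_recursive(newurl, visited)      # descend into each link before its siblings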

The code is from Yin Cheng's Python course.

Original post: https://www.cnblogs.com/roccoshi/p/13027106.html