【python】理想论坛帖子爬虫1.06

昨天认识到在本期同时起一百个回调/线程后程序会崩溃，造成结果不可信。

于是决定用Python单线程操作，因为它理论上就用主线程跑不会有问题，只是时间长点。

写好程序后，测试了一中午，210个主贴，11974个帖子，11974个文件都生成了。

当然，程序没有一蹴而就，原有的对requests属性不加限制时最多跑个四千条就崩了，参考了 “https://blog.csdn.net/shi_weihappy/article/details/51009602”的做法后才知道要将requests.session.keep_alive = False,以免资源不释放。另外加了一点异常时让其重复处理的代码，以免丢失数据。这两处代码都在saveTopicDetail函数中。

代码如下：

#------------------------------------------------------------------------------------
# 理想论坛爬虫1.06，用于爬取主贴再爬子贴，数据存到文件里，再由insertDB.py读取插DB
# 2018年4月26日
#------------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import threading
import re
import time
import datetime
import os
import json
import colorama
from colorama import Fore, Back, Style
colorama.init()

user_agent='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
headers={'User-Agent':user_agent}

# 存储数据文件的目录
folder=""

# 主帖数组
topics=[]

#------------------------------------
# 在论坛页中寻找主贴
# pageUrl:论坛页url
#------------------------------------
def findTopics(pageUrl):
    print("
开始读取页面"+pageUrl+"的帖子");

    try:
        rsp=requests.get(pageUrl,headers=headers)
        soup= BeautifulSoup(rsp.text,'html.parser',from_encoding='gb2312')

        for tbodys in soup.find_all('tbody'):
            pageCount=1
            url='none'
            title='none'

            for spans in tbodys.find_all('span',class_="forumdisplay"):
                for link in spans.find_all('a'):
                    if link and link.get("href"): 
                       url="http://www.55188.com/"+link.get("href")
                       title=link.text
                        
            for spans in tbodys.find_all('span',class_="threadpages"):
                for link in spans.find_all('a'):
                    pageCount=link.text
                
            if url!='none' and title!='none':
                topic={'pageCount':pageCount,'url':url,'title':title}
                topics.append(topic)

        #print("读取页面"+pageUrl+"的帖子完毕");
    except Exception as e:
        log("findTopics出现异常:"+str(e),'red')

#------------------------------------
# 以不同颜色在控制台输出文字
# pageUrl:论坛页url
#------------------------------------
def log(text,color):
    if color=='red':
        print(Fore.RED + text+ Style.RESET_ALL)
    elif color=='green':
        print(Fore.GREEN + text+ Style.RESET_ALL)
    else:
        print(text)

#------------------------------------
# 找到并保存帖子的细节
# index:序号,url:地址,title:标题
#------------------------------------
def saveTopicDetail(index,url,title):
    infos=[]    # 找到的子贴信息

    while(len(infos)==0):
        try:
            rsp=requests.get(url,headers=headers)                       
            soup= BeautifulSoup(rsp.text,'html.parser',from_encoding='gb2312')
            session = requests.session()
            session.keep_alive = False
            
            for divs in soup.find_all('div',class_="postinfo"):
                # 用正则表达式将多个空白字符替换成一个空格
                RE = re.compile(r'(s+)')
                line=RE.sub(" ",divs.text)
                arr=line.split(' ')
                arrLength=len(arr)

                if arrLength==7:
                    info={'楼层':arr[1],
                          '作者':arr[2].replace('只看：',''),
                          '日期':arr[4],
                          '时间':arr[5],'title':title,'url':url}
                    infos.append(info);
                elif arrLength==8:
                    info={'楼层':arr[1],
                          '作者':arr[2].replace('只看：',''),
                          '日期':arr[5],
                          '时间':arr[6],'title':title,'url':url}
                    infos.append(info);

            #存文件
            filename=folder+"/"+str(index)+'.json'
            with open(filename,'w',encoding='utf-8') as fObj:
                json.dump(infos,fObj)
        except Exception as e:
            log("findTopicDetail访问"+url+"时出现异常:"+str(e),'red')
            time.sleep(5); # 如果出现异常，休息五秒后再试
            continue;

#------------------------------------
# 入口函数
# start:起始页,end:终止页
#------------------------------------
def main(start,end):
    # 创建目录
    currTime=time.strftime('%H_%M_%S',time.localtime(time.time()))
    global folder
    folder="./"+currTime
    os.makedirs(folder)
    print("目录"+folder+"创建完成")

    # 获取主贴
    print('
将从以下页面获取主贴:');
    for i in range(start,end+1):        
        pageUrl='http://www.55188.com/forum-8-'+str(i)+'.html' # 这个页是论坛页，即第1页，第2页等
        findTopics(pageUrl);

    n=len(topics)
    log("共读取到:"+str(n)+"个主贴",'green')

    # 获取主贴及其子贴
    finalTopics=[]
    index=0
    for topic in topics:
        end=int(topic['pageCount'])+1
        title=topic['title']

        for i in range(1,end):
            pattern='-(d+)-(d+)-(d+)'
            newUrl=re.sub(pattern,lambda m:'-'+m.group(1)+'-'+str(i)+'-'+m.group(3),topic['url'])
            #print(newUrl)

            newTopic={'index':index,'url':newUrl,'title':title}
            finalTopics.append(newTopic)

            index=index+1

    n=len(finalTopics)
    log("共读取到:"+str(n)+"个帖子",'green')
    
    # 遍历finalTopics
    for newTopic in finalTopics:
        saveTopicDetail(newTopic['index'],newTopic['url'],newTopic['title']);

# 开始
main(1,3)

控制台输入如下：

C:Usershorn1Desktoppython26>python lixiang.py
目录./11_12_31创建完成

将从以下页面获取主贴:

开始读取页面http://www.55188.com/forum-8-1.html的帖子
C:Usershorn1AppDataLocalProgramsPythonPython36libsite-packagess4__init__.py:146: UserWarning: You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.
  warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")

开始读取页面http://www.55188.com/forum-8-2.html的帖子

开始读取页面http://www.55188.com/forum-8-3.html的帖子
共读取到:210个主贴
共读取到:11974个帖子
findTopicDetail访问http://www.55188.com/thread-6009271-105-2.html时出现异常:HTTPConnectionPool(host='www.55188.com', port=80): Max retries exceeded with url: /thread-6009271-105-2.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000016DE6EBA278>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))
findTopicDetail访问http://www.55188.com/thread-5938774-339-2.html时出现异常:('Connection aborted.', TimeoutError(10060, '由于连接方在一段时间后没有正确答复或连接的主机没有反应 ，连接尝试失败。', None, 10060, None))
findTopicDetail访问http://www.55188.com/thread-5938774-583-2.html时出现异常:HTTPConnectionPool(host='www.55188.com', port=80): Max retries exceeded with url: /thread-5938774-583-2.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000016DE6E448D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))
findTopicDetail访问http://www.55188.com/thread-7065242-77-2.html时出现异常:HTTPConnectionPool(host='www.55188.com', port=80): Max retries exceeded with url: /thread-7065242-77-2.html (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000016DE75B49B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))

C:Usershorn1Desktoppython26>

生成文件截图：

这一次达到了下载指定页数据的目的，最终数据有237496条。

而负责插入数据的insertDB.py代码如下：

# 读取理想论坛爬虫1.06生成的数据，然后写入DB
import pymysql
import time
import datetime
import os
import json

# 数据库插值
def insertDB(sqls):
    conn=pymysql.connect(host='127.0.0.1',user='root',passwd='12345678',db='test',charset='utf8')
    
    sum=0;# 插入成功总数

    
    for sql in sqls:        
        try:
            count=conn.query(sql)    #单条是否成功    
        except Exception as e:
            print("sql'"+sql+"'出现异常:"+str(e))
            continue;

        if count==0:
            print(sql+'插入记录失败');    
            
        sum+=count
    
    conn.commit()
    conn.close()

    return sum

# 入口函数
def main(folder):
    allinfos=[]
    for filename in os.listdir(folder):
        filePathname=folder+"/"+filename

        with open(filePathname,'r',encoding='utf-8') as fObj:
            infos=json.load(fObj)
            allinfos.extend(infos)

    sqls=[]
    for info in allinfos:
        sql="insert into test.topic14(floor,author,tdate,ttime,addtime,url,title) values ('"+info['楼层']+"','"+info['作者']+"','"+info['日期']+"','"+info['时间']+"',"+"now(),'"+info['url']+"','"+info['title']+"' "+" )"
        sqls.append(sql)

    print("将向数据库插入"+str(len(sqls))+"条记录")
    retval=insertDB(sqls)
    print("已向数据库插入"+str(retval)+"条记录")

# 开始
main("./11_12_31")

第一次执行时出现异常，我于是加了异常处理，原来有一条数据因乱码在文本里出现了单引号，跳过这一句就好了，输出如下：

C:Usershorn1Desktoppython26>python insertDB.py
将向数据库插入237496条记录
sql'insert into test.topic14(floor,author,tdate,ttime,addtime,url,title) values ('7155Â¥','Ö»¿´£ºÒ»¾ÅÎÞÏÞ'','2011-4-1','21:53',now(),'http://www.55188.com/thread-3980834-358-3.html',' Ïþ¸çµÄÍ¶×ÊÉúÑÄ:³¬³¤ÏßÊµÅÌÌù '  )'出现异常:(1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '2011-4-1','21:53',now(),'http://www.55188.com/thread-3980834-358-3.html',' Ïþ' at line 1")
已向数据库插入237495条记录

C:Usershorn1Desktoppython26>

数据库里也得到了数据：

到现在，程序崩溃的问题找到了，那就是控制并发，控制session释放，这样能让程序稳定的执行下去。

但是乱码问题还在，理想论坛爬虫还需要磨练。

2018年4月26日14点00分