Web Crawler Assignment

1. Pick a topic you are interested in.

2. Write a crawler in Python to scrape data on that topic from the web.

3. Run text analysis on the scraped data and generate a word cloud (a minimal sketch of this step appears at the end of this post).

4. Explain the results of the text analysis.

5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the data-analysis approach and conclusions.

6. Finally, submit all the scraped data along with the crawler and analysis source code.

from bs4 import BeautifulSoup
import urllib2
import logging
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 only: force UTF-8 as the default encoding


class Item(object):
    title = None        # post title
    firstAuthor = None  # original author of the post
    firstTime = None    # time the post was created
    reNum = None        # reply / view count
    LastTime = None     # time of the last reply
    LastAuthor = None   # author of the last reply
    link = None         # link to the post

# Module-level helper: fetch a page and return its raw content, or None on failure
def getResponseContent(url):
    try:
        response = urllib2.urlopen(url.encode('utf8'), timeout=20)
    except urllib2.URLError:
        logging.error(u'Failed to fetch URL: {}'.format(url))
        return None
    else:
        logging.info(u'Fetched URL: {}'.format(url))
        return response.read()
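
# A hypothetical wrapper (not in the original post): retry a failed fetch a few
# times before giving up, since getResponseContent() returns None on error.
import time

def getResponseContentWithRetry(url, retries=3, delay=2):
    for attempt in range(retries):
        content = getResponseContent(url)
        if content is not None:
            return content
        time.sleep(delay)  # brief pause before retrying
    return None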

class getHupuInfo(object):
    def __init__(self, url):
        self.url = url
        self.pageSum = 3  # number of board list pages to crawl
        self.urls = self.getUrls(self.pageSum)
        self.items = self.spider(self.urls)
        self.pipelines(self.items)

    def getUrls(self, pageSum):
        urls = []
        urls.append(self.url)  # page 1 is the bare board URL
        for pn in range(1, pageSum):
            tempurl = self.url + '-' + str(pn + 1)  # page n is url + '-' + n
            urls.append(tempurl)
        logging.info(u'Built URL list successfully')
        return urls
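
    # For reference: with the default url and pageSum = 3, this yields
    #   https://bbs.hupu.com/bxj
    #   https://bbs.hupu.com/bxj-2
    #   https://bbs.hupu.com/bxj-3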

    def spider(self, urls):
        items = []
        for url in urls:
            htmlContent = getResponseContent(url)
            if htmlContent is None:
                continue  # skip pages that failed to download
            soup = BeautifulSoup(htmlContent, 'lxml')
            tagtable = soup.find('table', attrs={'id': 'pl'})
            tagstr = tagtable.find_all('tr')

            # the first <tr> is the table header, so skip it
            for tag in tagstr[1:]:
                item = Item()
                item.link = '/' + tag.get('mid') + '.html'
                item.title = tag.find('td', attrs={'class': 'p_title'}).find('a', href=item.link).get_text()
                item.firstAuthor = tag.find('td', attrs={'class': 'p_author'}).a.get_text()
                item.firstTime = tag.find('td', attrs={'class': 'p_author'}).get_text()
                item.reNum = tag.find('td', attrs={'class': 'p_re'}).get_text()
                item.LastAuthor = tag.find('td', attrs={'class': 'p_retime'}).a.get_text()
                item.LastTime = tag.find('td', attrs={'class': 'p_retime'}).get_text()
                items.append(item)
        logging.info(u'Collected posts successfully')
        return items
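
    # Page structure inferred from the selectors above: each post is a <tr>
    # carrying a 'mid' attribute inside <table id="pl">, with cells
    # td.p_title / td.p_author / td.p_re / td.p_retime holding the fields.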

    def pipelines(self, items):
        fileName = u'Hupu_bxj.txt'
        with open(fileName, 'w') as fp:
            for item in items:
                #fp.write('{}\t{}\t{}\t{}\t{}\t{}\n{}\n\n'.format(item.title, item.firstAuthor, item.firstTime, item.reNum, item.LastAuthor, item.LastTime, item.link))
                fp.write('{}\n'.format(item.title).encode('utf8'))
        logging.info(u'Wrote text file successfully')
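
    # A hypothetical alternative (not in the original post): write every field
    # as a CSV row instead of titles only, using only the stdlib csv module.
    def pipelines_csv(self, items, fileName=u'Hupu_bxj.csv'):
        import csv
        with open(fileName, 'wb') as fp:  # binary mode for the Python 2 csv module
            writer = csv.writer(fp)
            writer.writerow(['title', 'firstAuthor', 'firstTime', 'reNum',
                             'LastAuthor', 'LastTime', 'link'])
            for item in items:
                row = (item.title, item.firstAuthor, item.firstTime,
                       item.reNum, item.LastAuthor, item.LastTime, item.link)
                writer.writerow([f.encode('utf8') if isinstance(f, unicode) else f
                                 for f in row])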

    def getpiclink(self):
        piclink = []
        for item in self.items:
            # self.url[0:20] is the scheme plus host, i.e. 'https://bbs.hupu.com'
            piclink.append(self.url[0:20] + item.link)
        logging.info(u'Built post links successfully')
        return piclink




class picInfo(object):

    def __init__(self, links):
        self.links = links
        self.imgurls = []
        self.spider()
        self.pipeline()

    def spider(self):
        if self.links is None:
            logging.error(u'No post links to crawl')
        else:
            for link in self.links:
                htmlContent = getResponseContent(link)
                if htmlContent is None:
                    continue  # skip posts that failed to download
                soup = BeautifulSoup(htmlContent, 'lxml')
                tagDiv = soup.find('div', attrs={'id': 'tpc'})
                img = tagDiv.find('div', attrs={'class': 'quote-content'}).find_all('img')
                # find_all() returns a (possibly empty) list, never None
                if not img:
                    continue
                for subimg in img:
                    # lazily loaded images keep the real URL in 'data-original'
                    if subimg.get('data-original') is None:
                        imgurl = subimg.get('src')
                    else:
                        imgurl = subimg.get('data-original')
                    self.imgurls.append(imgurl)
        logging.info(u'Collected image links successfully')

    def pipeline(self):
        for i in range(len(self.imgurls)):
            if self.imgurls[i][-3:] == 'png':
                imgname = str(i) + '.png'
            elif self.imgurls[i][-3:] == 'jpg':
                imgname = str(i) + '.jpg'
            elif self.imgurls[i][-4:] == 'jpeg':
                imgname = str(i) + '.jpeg'
            elif self.imgurls[i][-3:] == 'gif':
                imgname = str(i) + '.gif'
            else:
                continue
            img = getResponseContent(self.imgurls[i])
            if img is None:
                continue  # download failed; skip this image
            with open(imgname, 'wb') as fp:
                fp.write(img)
        logging.info(u'Saved images successfully')
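
# A hypothetical helper (not in the original post): derive the file name from
# the URL path with os.path.splitext instead of hard-coded suffix checks.
import os
from urlparse import urlparse  # Python 2; use urllib.parse in Python 3

def guessImageName(i, imgurl):
    ext = os.path.splitext(urlparse(imgurl).path)[1].lower()
    if ext not in ('.png', '.jpg', '.jpeg', '.gif'):
        return None  # unsupported type; caller should skip it
    return str(i) + ext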

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    url = u'https://bbs.hupu.com/bxj'
    HUPU = getHupuInfo(url)
    picurls = HUPU.getpiclink()
    PIC = picInfo(picurls)
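
Assignment item 3 asks for text analysis and a word cloud, which the crawler above does not cover. Here is a minimal sketch, assuming the post titles written to Hupu_bxj.txt by pipelines() as input and assuming the third-party jieba and wordcloud packages are installed; the font path is a placeholder you would point at a local Chinese font file:

# -*- coding: utf-8 -*-
# Minimal word-cloud sketch (assumes: pip install jieba wordcloud)
import jieba
from wordcloud import WordCloud

with open('Hupu_bxj.txt') as fp:
    text = fp.read().decode('utf8')  # Python 2: decode bytes to unicode

words = ' '.join(jieba.cut(text))  # segment the Chinese titles into words
wc = WordCloud(font_path='msyh.ttf',  # placeholder: any local Chinese font file
               width=800, height=600,
               background_color='white').generate(words)
wc.to_file('hupu_wordcloud.png')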

Original article: https://www.cnblogs.com/lk666/p/8974592.html