Naive Bayes: Predicting Featured Tieba Posts

First, I crawled the data of our university's Tieba forum; every post carries a label saying whether or not it is a featured (精品) post.

Using the post titles, I classified posts into featured and ordinary ones, with an error rate of around 10%.

Word segmentation is done with jieba. I did not filter out stop words or punctuation, since punctuation can itself be a useful signal for telling featured posts from ordinary ones. A few more features could also be added, such as the number of audio clips, videos and pictures on the first page, and some of the important features could even be expanded into several dimensions; classification would probably improve. Still, titles alone already give about 90% accuracy, which shows that Naive Bayes works quite well (one way to splice in such media counts is sketched below).
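As a rough illustration of that idea (a minimal sketch, not part of the pipeline in this post), the crawler below already stores pic_num, video_num and voice_num for every thread, so those counts could simply be appended to the title's bag-of-words vector before training; the helper name is hypothetical:

def extend_with_media_features(word_vec, post):
    # word_vec: the bag-of-words vector built from the title
    # post: one crawled document (a dict with the fields stored by get_detail_data)
    return word_vec + [int(post["pic_num"]), int(post["video_num"]), int(post["voice_num"])]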

The training part uses the Naive Bayes method; for testing, I again used the hold-out method.
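For reference, the decision rule that the code below implements is standard multinomial Naive Bayes: a title with word counts $x_i$ is labelled as featured (class 1) when

\[
\log P(c=1) + \sum_i x_i \log P(w_i \mid c=1) \;>\; \log P(c=0) + \sum_i x_i \log P(w_i \mid c=0),
\qquad
P(w_i \mid c) = \frac{N_{i,c} + 1}{N_c + 2},
\]

where $N_{i,c}$ is how often word $i$ occurs in the class-$c$ training titles and $N_c$ is that class's total word count; the $+1$ and $+2$ correspond to the ones() initialisation and the 2.0 denominators in trainNB0 below.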

First, the crawler code:

# -*- coding: utf-8 -*-
__author__ = 'Oscar_Yang'
"""
Goals:
1. Crawl the list pages and the post titles.
2. Count keyword frequencies.
"""
# Import the required modules.
import re, requests, json, random, time, jieba, pymongo
import urllib.request
from bs4 import BeautifulSoup

"""Connect to MongoDB."""
client = pymongo.MongoClient("localhost", 27017)
db_tieba = client["db_tieba"]
# sheet_tieba_ysu_good = db_tieba_ysu_good["sheet_tieba_ysu_good"]
# sheet_tieba_dq = db_tieba_dq["sheet_tieba_dq_test"]
# sheet_tieba_dbdx = db_tieba["sheet_tieba_dbdx"]
sheet_tieba = db_tieba["sheet_tieba_ysu_914"]

"""Set up proxies (disabled here)."""
# resp = requests.get("http://tor1024.com/static/proxy_pool.txt")
# ips_txt = resp.text.strip().split("\n")
# # print(ips_txt)
# ips = []
# for i in ips_txt:
#     try:
#         k = json.loads(i)
#         ips.append(k)
#     except Exception as e:
#         print(e)

header = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    "Cookie": 'XXX'
}


# Build the list-page URLs.
def get_base_urls():
    urls = ["http://tieba.baidu.com/f?kw=%E7%87%95%E5%B1%B1%E5%A4%A7%E5%AD%A6&ie=utf-8&pn={}".format(str(i)) for i in range(0, 383400, 50)]
    return urls


def get_base_good_urls():
    urls = ["http://tieba.baidu.com/f?kw=%E9%87%8C%E4%BB%81%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}".format(str(i)) for i in range(0, 10000, 50)]
    return urls


def get_last_reply_time(detail_url):
    # Fetch the last page of a thread and cut the timestamp of the final reply
    # out of its data-field attribute.
    web_data = requests.get(detail_url)
    web_data.encoding = "utf-8"
    soup = BeautifulSoup(web_data.text, "lxml")
    detail_page_num = soup.find_all(class_="red")[1].text
    detail_page_last_url = detail_url + "?pn={}".format(detail_page_num)
    web_data1 = requests.get(detail_page_last_url)
    web_data1.encoding = "utf-8"
    soup1 = BeautifulSoup(web_data1.text, "lxml")
    last_post_time_data_0 = soup1.find_all(class_="l_post j_l_post l_post_bright  ")
    last_post_time_data = last_post_time_data_0[len(last_post_time_data_0) - 1].get("data-field")
    last_reply_post_time = last_post_time_data[int(last_post_time_data.find("date")) + 7:int(last_post_time_data.find("vote_crypt") - 3)]
    return last_reply_post_time  # return the extracted timestamp, not the raw data-field


def get_detail_data(detail_url):
    # Parse one thread page and store its fields in MongoDB.
    res = requests.get(detail_url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "lxml")
    post_time_data = soup.find_all(class_='l_post j_l_post l_post_bright noborder ')[0].get("data-field")
    author_dengji = soup.select("div.d_author > ul > li.l_badge > div > a > div.d_badge_lv")[0].text
    author_lable = soup.select("div.d_badge_title")[0].text
    reply_nus = soup.find_all(class_="red")[0].text
    title = soup.select("div.core_title.core_title_theme_bright > h1")[0].text
    author_name = soup.select("div.d_author > ul > li.d_name > a")[0].text
    authon_contents = soup.find_all(class_="d_post_content j_d_post_content  clearfix")
    jingyan_scores = post_time_data[int(post_time_data.find("cur_score")) + 11:int(post_time_data.find("bawu")) - 2]
    post_time = post_time_data[int(post_time_data.find("date")) + 7:int(post_time_data.find("vote_crypt")) - 3]
    phone_is = post_time_data[int(post_time_data.find("open_type")) + 12:int(post_time_data.find("date")) - 2]
    sex_is = post_time_data[int(post_time_data.find("user_sex")) + 10:int(post_time_data.find("user_sex")) + 11]
    pic_num_firstpage = soup.find_all(class_="BDE_Image")
    voice_num = soup.find_all(class_="voice_player_inner")
    video_num = soup.find_all(class_="BDE_Flash")
    detail_page_num = soup.find_all(class_="red")[1].text
    contents = []
    for i in range(len(authon_contents)):
        contents.append(authon_contents[i].text.strip())
    data = {
        "nick_name": author_name,
        "post_time": post_time,
        "vehicle": phone_is,
        "level": author_dengji,
        "honored_name": author_lable,
        "reply_num": reply_nus,
        "title": title,
        "sex": sex_is,
        "jingyan_scores": jingyan_scores,
        "pic_num": len(pic_num_firstpage),
        "video_num": len(video_num),
        "voice_num": len(voice_num),
        "detail_page_num": detail_page_num,
        "contents": contents,
        "tiezi_url": detail_url
    }
    # print(data)
    # sheet_tieba_ysu_good.insert_one(data)
    # sheet_tieba_dbdx.insert_one(data)
    # sheet_tieba_dq.insert_one(data)
    sheet_tieba.insert_one(data)
    # data1file(data)


def data1file(s):
    # Append a record to a local text file (alternative to MongoDB).
    path = r"C:\Users\Oscar\Desktop\数据.txt"
    file = open(path, "a", encoding="utf-8")
    file.write("\n")
    file.write(str(s))
    file.close()


def get_detail_urls(url):
    # Collect the thread links from one list page.
    detail_links = []
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'lxml')
    link_tags = soup.select("#thread_list div.threadlist_title.pull_left.j_th_tit > a")
    for link_tag in link_tags:
        detail_links.append("http://tieba.baidu.com/" + link_tag.get("href"))
    return detail_links
    # print(detail_links)


# Scrape the data shown on a list page.
def get_data(url):
    web_data = requests.get(url)
    web_data.encoding = "utf-8"
    soup = BeautifulSoup(web_data.text, "lxml")
    titles = soup.select('#thread_list div.threadlist_title.pull_left.j_th_tit > a')
    reply_nums = soup.select("div > div.col2_left.j_threadlist_li_left > span")
    zhurens = soup.select("div.threadlist_author.pull_right > span.tb_icon_author > span.frs-author-name-wrap > a")
    link_tags = soup.select('#thread_list div.threadlist_title.pull_left.j_th_tit > a')
    # time.sleep(random.randint(1, 2))
    for title, reply_num, link_tag, zhuren in zip(titles, reply_nums, link_tags, zhurens):
        data = {
            "title": title.get_text(),
            "reply_num": reply_num.text,
            "author": zhuren.get_text(),
            "url": "http://tieba.baidu.com/" + link_tag.get("href")
        }
        print(data)


def get_counts():
    # Count term frequencies over a local text file, using jieba's search-engine mode.
    f = open(r'C:\Users\Oscar\Desktop\1.txt', 'r', encoding='utf-8')
    sentence = f.read()
    # words = jieba.cut(sentence, cut_all=True)
    # words = jieba.cut(sentence, cut_all=False)
    words = jieba.cut_for_search(sentence)
    tf = {}
    for word in words:
        print(word)
        word = ''.join(word.split())
        if word in tf:
            tf[word] += 1
        else:
            tf[word] = 1
    return tf


def top_counts_sorted(tf, n=50):
    value_key_pairs = sorted([(count, tz) for tz, count in tf.items()], reverse=True)
    print(value_key_pairs[:n])


# top_counts_sorted(get_counts())
if __name__ == '__main__':
    count = 0
    for base_url in get_base_urls():
        count = count + 1
        detail_urls = get_detail_urls(base_url)
        for detail_url in detail_urls:
            try:
                get_detail_data(detail_url)
            except Exception as e:
                print(e)
                # pass
        # time.sleep(random.randint(1, 3))
        print("Finished crawling list page {}".format(count))

The crawled data was stored in MongoDB, and I finally exported it to text files: the titles of featured posts and of ordinary posts went into two separate txt files (0.txt and 1.txt).
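The export step itself is not shown; a minimal sketch of what it could look like is below, under the assumption that featured and ordinary posts ended up in two separate MongoDB collections (the collection names here are placeholders and should be adjusted to whatever split was actually used):

import pymongo

client = pymongo.MongoClient("localhost", 27017)
db = client["db_tieba"]

def dump_titles(collection_name, out_path):
    # Write one title per line so the classifier can read the file with readline().
    with open(out_path, "w", encoding="utf-8") as f:
        for doc in db[collection_name].find({}, {"title": 1}):
            f.write(doc["title"].replace("\n", " ").strip() + "\n")

dump_titles("sheet_tieba_ysu_good", "1.txt")  # featured-post titles (placeholder collection name)
dump_titles("sheet_tieba_ysu_914", "0.txt")   # ordinary-post titles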

Below is the Naive Bayes part. It is not wrapped in a class; a handful of functions is enough: text processing, vocabulary construction, training, classification and a main routine.

import random

import jieba
from numpy import array, ones, log  # numpy is needed by the training code below


def text_split(textlist):
    word_cut = jieba.cut(textlist, cut_all=False)  # accurate mode; returns an iterable generator
    word_list = list(word_cut)  # turn the generator into a list of unicode tokens
    return word_list


# Build the vocabulary list.
def createVocabList(dataSet):
    vocabSet = set([])  # start from an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)


def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)  # number of rows in the training matrix
    numWords = len(trainMatrix[0])  # vocabulary size, i.e. number of columns
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability of class 1
    p0Num = ones(numWords)
    p1Num = ones(numWords)  # initialise the counts with ones()
    p0Denom = 2.0
    p1Denom = 2.0  # and the denominators with 2.0 (smoothing)
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)  # take log() to avoid underflow
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
    # Returns the prior pAbusive and, for each class, the per-word conditional
    # probabilities (estimated as smoothed frequencies, in log space).


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # p1Vec already holds log probabilities, so log(a*b) = log(a) + log(b)
    # turns the product over words into a sum; pClass1 is the prior.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    # Decide by the maximum (log) posterior.
    if p1 > p0:
        return 1
    else:
        return 0


# Bag-of-words model: a word counts as many times as it occurs.
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # initialise the vector
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def spamTest():
    """
    Build the document matrix and the class-label vector.
    Note: one document had an encoding/decoding problem and was filtered out,
    so the final matrix has one row fewer than the raw line count.
    """
    docList = []
    classList = []
    fullText = []
    path = r"C:\Users\Oscar\Desktop\data1.txt"
    with open(path, encoding="utf8") as f:
        i = 0
        while i < 1046:
            a = f.readline()
            word_list = text_split(a)
            docList.append(word_list)
            classList.append(1)
            fullText.extend(word_list)
            i = i + 1
    path = r"C:\Users\Oscar\Desktop\data.txt"
    with open(path, encoding="utf8") as f:
        i = 0
        while i < 1546:
            a = f.readline()
            word_list = text_split(a)
            docList.append(word_list)
            classList.append(0)
            fullText.extend(word_list)
            i = i + 1
    vocabList = createVocabList(docList)  # build the vocabulary

    trainingSet = list(range(2500))
    testSet = []  # hold-out validation: randomly split off a test set
    for i in range(10):  # test set of size 10; the remaining indices are used for training
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        trainingSet.pop(randIndex)

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("misclassified post:", docList[docIndex], "true class:", classList[docIndex])
    print('error rate: ', float(errorCount) / len(testSet))
    # return vocabList, fullText


if __name__ == '__main__':
    spamTest()
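As an optional cross-check (not in the original post), the same experiment can be reproduced with scikit-learn's MultinomialNB, again using jieba for tokenisation; the sketch below assumes the two title files described earlier, one title per line:

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

def load_lines(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

good = load_lines("1.txt")    # featured-post titles
normal = load_lines("0.txt")  # ordinary-post titles
titles = good + normal
labels = [1] * len(good) + [0] * len(normal)

vec = CountVectorizer(tokenizer=jieba.lcut)  # bag-of-words over jieba tokens
X = vec.fit_transform(titles)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1)
clf = MultinomialNB().fit(X_train, y_train)
print("hold-out accuracy:", clf.score(X_test, y_test))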

Original post: https://www.cnblogs.com/coskaka/p/6028764.html