Python Crawler (3): Baidu Tieba

The script below fetches a Baidu Tieba thread page by page, strips the HTML tags from every post with a small regex-based cleaner, and writes the cleaned text to a local .txt file, optionally preceding each post with a floor-number separator.

#coding=utf-8
import urllib2
import re

# Tool class for cleaning up page tags
class Tool:
    # Strip <img> tags and runs of 7 spaces
    removeImg = re.compile(r'<img.*?>| {7}')
    # Strip hyperlink tags
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Replace line-break tags with \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Replace the table cell tag <td> with \t
    replaceTD = re.compile(r'<td>')
    # Replace paragraph openings with \n plus two leading spaces
    replacePara = re.compile(r'<p.*?>')
    # Replace single or double <br> with \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # Remove any remaining tags
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n  ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # Strip leading and trailing whitespace
        return x.strip()
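
As a quick sanity check, the cleaner can be exercised on its own; the HTML fragment below is invented for illustration and is not taken from a real Tieba page:

    tool = Tool()
    sample = u'<img src="a.png">Hello<br><a href="x">link text</a><td>cell</td>'
    print tool.replace(sample)
    # Output: "Hello\nlink text\tcell"

The listing continues with the crawler class itself: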
# Baidu Tieba crawler class
class BDTB:
    # Initialize with the base URL, the "original poster only" flag, and the floor separator flag
    def __init__(self, baseUrl, seeLz, floorTag):
        # Base URL of the thread
        self.baseURL = baseUrl
        # Whether to show only the original poster's posts
        self.seeLZ = '?see_lz=' + str(seeLz)
        # Tag-cleaning tool
        self.tool = Tool()
        # File object used for writing
        self.file = None
        # Floor counter, starting at 1
        self.floor = 1
        # Default title, used when none can be extracted
        self.defaultTitle = u"Baidu Tieba"
        # Whether to write a separator line between floors
        self.floorTag = floorTag
    # Fetch the HTML source of page pageNum
    def getPage(self, pageNum):
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # Return the page content decoded from UTF-8
            return response.read().decode('utf-8')
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u"Failed to connect to Baidu Tieba. Reason:", e.reason
            return None
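
    # For example, if the user answers 1 to the see_lz prompt below, page 1
    # of the default thread resolves to:
    #   http://tieba.baidu.com/p/3138733512?see_lz=1&pn=1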
    # Extract the thread title from the page
    def getTitle(self, page):
        # Regex that captures the title
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None

    # Extract the total number of pages
    def getPageNum(self, page):
        # Regex that captures the page count
        pattern = re.compile(r'reply_num.*?</span.*?class="red">(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None
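
    # The two regexes above assume markup roughly like the following
    # (inferred from the patterns themselves, not verified against the
    # live page, whose markup may have changed):
    #   <h3 class="core_title_txt ...">thread title</h3>
    #   ...reply_num...</span>...<span class="red">5</span>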
    # Extract the post contents from the page
    def getContent(self, page):
        pattern = re.compile(r'post_content.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            # Clean the text and surround it with newlines
            content = "\n" + self.tool.replace(item) + "\n"
            contents.append(content.encode('utf-8'))
        return contents
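
    # Each post body sits in a div whose class contains "post_content";
    # the non-greedy (.*?)</div> stops at the first closing div after it.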
    # Open the output file, falling back to the default title
    def setFileTitle(self, title):
        if title is not None:
            self.file = open(title + ".txt", "w+")
        else:
            self.file = open(self.defaultTitle + ".txt", "w+")

    # Write the contents of one page to the file
    def writeData(self, contents):
        for item in contents:
            if self.floorTag == "1":
                # Separator line carrying the floor number
                floorLine = "\n" + str(self.floor) + u"--------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1
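
    # With floorTag == "1" the resulting .txt file looks roughly like:
    #   1--------------------------------------
    #   text of the first post
    #   2--------------------------------------
    #   text of the second post
    #   ...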
    # Run the crawler: fetch page 1, work out the title and page count, then write every page
    def start(self):
        indexPage = self.getPage(1)
        # Bail out if the first page could not be fetched
        if indexPage is None:
            return
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        if pageNum is None:
            print u"The URL is no longer valid, please try again"
            return
        try:
            print "This thread has " + str(pageNum) + " pages in total."
            for i in range(1, int(pageNum) + 1):
                print "Writing data of page " + str(i)
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError, e:
            print "Write failed. Reason: " + e.message
        finally:
            print "Writing finished"

baseURL = 'http://tieba.baidu.com/p/3138733512'
seeLz = raw_input("Fetch only the original poster's posts? Enter 1 for yes, 0 for no: ")
floorTag = raw_input("Write floor separators? Enter 1 for yes, 0 for no: ")
bdtb = BDTB(baseURL, seeLz, floorTag)
bdtb.start()
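
For non-interactive use, the two raw_input lines can be replaced with direct calls; note that writeData compares floorTag against the string "1", so the flag must be passed as a string (a minimal sketch reusing the thread URL above):

    bdtb = BDTB('http://tieba.baidu.com/p/3138733512', 1, "1")
    bdtb.start()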




Original article: https://www.cnblogs.com/aniudcs/p/ff8bf2f784e6207675c0571ff4470be7.html