python爬虫

python爬虫相关

  • 1、爬取图片至本地,目前仅能实现将百度帖里面的图片下载至本地,百度图片里面的图片还不能实现,等完善
  • 2、抓取糗百的段子,查看作者、段子内容,点赞个数
  • 3、更加人性化的查看方式,按下回车显示一个段子,如果按[q|Q]就直接退出

代码

  • 1、用到的模块,urllib,urllib2,re
  • 2、url匹配用到了re模块
  • 3、文件下载使用了urllib.urlretrieve()来将分析出来的图片下载至本地

1、百度贴吧图片

#!/usr/bin/env python
#-*- coding:utf8 -*-

import urllib, urllib2
import re

def getHtml(url):
	'''Fetch the page at *url* and return its raw HTML source.

	The response object is closed explicitly (try/finally) so the
	underlying socket is not leaked when many pages are fetched.
	'''
	page = urllib2.urlopen(url)
	try:
		return page.read()
	finally:
		page.close()

def getImage(html):
	'''Download every post image found in *html* to the current directory.

	Files are saved as 1.jpg, 2.jpg, ... in the order they appear on
	the page; existing files with the same names are overwritten.
	'''
	for i, imgurl in enumerate(findImageUrls(html), 1):
		print(imgurl)
		# urllib.urlretrieve() copies the remote file to the local path.
		urllib.urlretrieve(imgurl, filename='%s.jpg' % i)

def findImageUrls(html):
	'''Return the src URLs of all Tieba post images in *html*.

	Note the non-greedy ".*?": matching stops at the first closing ">",
	so one <img ...> tag never swallows the next one.
	'''
	re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?>')
	return re_img.findall(html)

if __name__ == '__main__':
	# Other thread URLs used while testing:
	#   http://tieba.baidu.com/p/3999261766
	#   http://tieba.baidu.com/p/4957363500
	thread_url = 'http://tieba.baidu.com/p/2263349749'
	# Fetch the thread page, then download every image it contains.
	getImage(getHtml(thread_url))

2、一次性获取指定页面的糗百段子

#!/usr/bin/env python
#-*- coding:utf8 -*-

'''
# 爬取糗百段子
1. 攫取段子
2. 过滤带有图片的段子
3. 实现每按一次回车显示一个段子的发布人,段子内容,点赞个数
'''

import urllib, urllib2
import re

page = 2
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# print( url )
# User-Agent: 封装
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}

try:
	req = urllib2.Request(url, headers=headers)
	rsp = urllib2.urlopen(req)
	html = rsp.read()
except urllib2.URLError, e:
	if hasattr(e, 'code'):
		print e.code
	if hasattr(e, 'reason'):
		print e.reason

re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?" alt="(.*?)"/>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(d+)</i>', re.S)
items = re_page.findall( html )

# print(items)
for item in items:
	for i in item:
		print(i)

3、实现每按一次回车显示一个段子的发布人,段子内容,点赞个数

#!/usr/bin/env python
#-*- coding:utf8 -*-

'''
# 爬取糗百段子
1. 攫取段子
2. 实现每按一次回车只显示一个段子
'''

import urllib, urllib2
import re
import sys

def getPage(page_num):
   url = 'http://www.qiushibaike.com/hot/page/' + str(page_num)
   # print( url )
   # User-Agent: 封装
   headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}

   try:
      req = urllib2.Request(url, headers=headers)
      rsp = urllib2.urlopen(req)
      html = rsp.read()
      # print html

      return html # 返回网页源码
   except urllib2.URLError, e:
      if hasattr(e, 'code'):
         print('连接服务器失败, 错误代码: %s' % e.code)
         return None
      if hasattr(e, 'reason'):
         print('连接服务器失败,错误原因: %s' % e.reason)
         return None

def getPageContent(page_num=1):
   '''Fetch page *page_num* of the hot list and return a list of
   (page_num, author, content, vote_count) tuples.

   Returns [] when the page could not be downloaded — the original
   crashed calling findall(None) in that case.
   '''
   html = getPage(page_num)
   if not html:
      # Download failed (getPage already printed the error); nothing to parse.
      return []
   return parseStories(html, page_num)

def parseStories(html, page_num):
   '''Extract (page_num, author, content, votes) tuples from *html*.

   The pattern walks one story card: author block -> avatar <img> ->
   <h2> user name -> content <span> -> vote counter.  re.S lets ".*?"
   cross newlines.  The vote group uses a digit class — the original
   pattern had "(d+)", which matched a literal letter "d".
   '''
   re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?"/>.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
   return [(page_num, author.strip(), content.strip(), votes.strip())
           for author, content, votes in re_page.findall(html)]

def getOneStory(page_contents):
   '''Interactively print the stories in *page_contents* one at a time.

   Each press of Enter shows the next story; entering "q" or "Q"
   terminates the whole program via sys.exit().
   '''
   for story in page_contents:
      # Renamed from "input" — the original shadowed the builtin.
      answer = raw_input()
      if answer.lower() == 'q':
         sys.exit()
      # story = (page number, author, content, vote count)
      print('第%s页\t发布人:%s\t赞: %s\n%s' % (story[0], story[1], story[3], story[2]))

if '__main__' == __name__:
   print("Loading web content from web site ...\n Press [q|Q] to exit, and press 'Enter' see next content: \n")
   num = 1

   # Walk pages forever; getOneStory() exits the process when the
   # user enters q/Q.
   while True:
      page_contents = getPageContent(num)
      getOneStory(page_contents)
      num += 1

未完,待续……

Yesterday is history.
Tomorrow is a mystery.
But today is a gift.
That is why it's called the present.
The old game: give a wolf a taste, then keep him hungry.
原文地址:https://www.cnblogs.com/ZhangRuoXu/p/6367132.html