最简单的 Python 爬虫


C:\Users\IBM_ADMIN>python -V
Python 2.7.13

查询 Python 工资的网站：

http://www.jobui.com/salary/%E5%8C%97%E4%BA%AC-python%E5%B7%A5%E7%A8%8B%E5%B8%88/

# -*- coding:utf-8 -*-
import re,urllib2
# Start page of the crawl (daily.zhihu.com). getUrls also uses it as the
# prefix when it builds absolute story links.
url = 'http://daily.zhihu.com/'
# Request headers attached to every urllib2.Request built in getHtml;
# the only entry is a Firefox User-Agent string.
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
# NOTE(review): the URL below is not referenced by any code visible here.
#https://v.qq.com/x/page/w05097k8olz.html
def getHtml(url):
    """Download *url* and return the raw response body.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The body exactly as produced by ``response.read()``.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read()``; nothing
        is caught here.
    """
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Close explicitly so the connection is released even if read()
        # fails, instead of lingering until garbage collection.
        response.close()

# Fetch the start page once, at module load; its markup is the input to getUrls.
html = getHtml(url)
    
def getUrls(html):
    """Build the list of absolute story URLs found in *html*.

    Every ``<a href="/story/...">`` link contributes one entry, formed by
    appending ``'story/'`` and the captured path segment to the
    module-level ``url``.

    Args:
        html: Markup of the start page.

    Returns:
        A list of URL strings in document order (empty when nothing matches).
    """
    story_link = re.compile('<a href="/story/(.*?)"')
    return [url + 'story/' + story_id for story_id in story_link.findall(html)]
        
# Absolute story URLs extracted from the start page's markup.
urls = getUrls(html)

def getContent(urls):
    """Fetch every page in *urls* and print its headline and content blocks.

    For each page the headline is taken from the first
    ``<h1 class="headline-title">`` element and the body from every
    ``<div class="content">`` block that matches the content pattern.
    Output goes to stdout; nothing is returned.

    Args:
        urls: Iterable of absolute page URLs.

    Raises:
        Any exception raised by ``getHtml`` for a page propagates.
    """
    patternTitle = re.compile('<h1 class="headline-title">(.*?)</h1>')
    # re.S lets '.' span newlines, so multi-line paragraphs are captured.
    patternContent = re.compile('<div class="content">\n<p>(.*?)</p>\n</div>', re.S)
    separator = '-----------------------------------------' + '-----------------------------------------'

    for page_url in urls:
        page = getHtml(page_url)
        titles = patternTitle.findall(page)
        # A page without a headline used to raise IndexError and abort the
        # whole crawl; fall back to a placeholder and keep going.
        title = titles[0] if titles else '(no title)'

        # Single-argument print(...) behaves the same as the Python 2
        # print statement, so output is unchanged.
        print(separator)
        print(separator)
        print('***************' + title + '***************')
        print(separator)
        for con in patternContent.findall(page):
            print(con)
                                
        
# Run the crawl: fetch every collected story page and print it to stdout.
getContent(urls)

# Remove unneeded markup ("impurities") from extracted content.
def characterProcessing(content):
    """Placeholder for stripping markup from *content*; not implemented yet.

    Only the pattern for the text inside ``<p>`` and ``<li>`` elements is
    prepared so far. The function returns ``None``.

    Args:
        content: Extracted HTML fragment. Currently unused.
    """
    # The alternation must live inside ONE pattern string: applying ``|`` to
    # two separate str objects raises TypeError before re.compile is called.
    pattern = re.compile(r'<p>(.*?)</p>|<li>(.*?)</li>')
    # TODO: apply ``pattern`` to ``content`` and return the cleaned text.
    pass


原文地址:https://www.cnblogs.com/TendToBigData/p/10501215.html