爬虫二 cookie&正则

一、cookie应用实例

import urllib.request
import urllib.parse

# Fetch the RenRen profile page while "logged in" by replaying a cookie
# that was captured from a real browser session:
#   1. log in to RenRen with a browser
#   2. sniff the next request and copy the Cookie header it carries
#   3. send that same Cookie header from code
#   4. if that is not enough, replay ALL of the browser's request headers
url = 'http://www.renren.com/971302264/profile'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Cookie': 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC;'
              ' p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
              'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
              't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
              'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0',
}

request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)

# Save the raw bytes; no decoding needed just to write the page to disk.
with open('ren.html', 'wb') as html_file:
    html_file.write(response.read())

二、编程登录人人网

import urllib.request
import urllib.parse
import http.cookiejar

# Log in to RenRen from code:
#   1. log in with a browser while capturing the traffic
#   2. note the login POST's target URL and form fields
#   3. replay that POST from code
#
# An opener built around a CookieJar remembers the session cookies the
# login response sets, so later requests through the same opener are
# already authenticated.
jar = http.cookiejar.CookieJar()                       # cookie storage
cookie_handler = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(cookie_handler)   # cookie-aware opener

post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636 '

# Form fields copied from the captured browser login request.
form_data = {
    'email': '18404904721',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
    'rkey': '2c3ae276413c03a1eb5159d355895bd0',
    'f': 'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile',
}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',}

# urlencode + encode: POST bodies must be bytes.
form_data = urllib.parse.urlencode(form_data).encode()

login_request = urllib.request.Request(url=post_url, headers=headers)
login_response = opener.open(login_request, data=form_data)   # send the login POST

# print(login_response.read().decode())

# Request the profile page through the SAME opener; the stored cookies
# prove the login worked if the saved page is the user dashboard.
get_url = 'http://www.renren.com/971302264/profile'
profile_request = urllib.request.Request(url=get_url, headers=headers)
profile_response = opener.open(profile_request)

with open('guanli.html', 'wb') as html_file:
    html_file.write(profile_response.read())

三、正则表达式提取内容

import re

# ---- () sub-patterns (capture groups + backreferences) ----
# Disabled demo: match symmetric HTML tags via backreferences.
# (The original blog had r'<(w+)><(w+)>w+</2></1>' — the backslashes were
# lost in transcription; the working pattern is shown below.)
# string = '<div><span>悟空</span></div>'
# pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
# print(pattern.search(string))

# ---- greedy vs. non-greedy ----
# string = '<div>八戒</div></div></div>'
# print(re.compile(r'<div>.*</div>').search(string))    # greedy: runs to the LAST </div>
# print(re.compile(r'<div>.*?</div>').search(string))   # lazy: stops at the FIRST </div>

# ---- re.M: multi-line mode, ^ anchors at every line start ----
string = "beautiful'\nbeach"
pattern = re.compile(r'^bea', re.M)
ret = pattern.findall(string)
print(ret)  # ['bea', 'bea'] — one hit per line

# ---- re.S: "dot-all", . also matches newlines ----
# string = ('<div>《沁园春-雪》'
#           '北国风光,千里冰封,万里雪飘。'
#           '望长城内外,惟余莽莽。'
#           '大河上下,顿失滔滔。</div>')
# print(re.compile(r'.*', re.S).search(string))

# ---- re.I: case-insensitive matching ----
# string = 'Life Is Short You Must Be Sexy'
# print(re.compile(r'life is short you must be sexy', re.I).search(string))

# ---- substitution: re.sub vs. Pattern.sub ----
string = 'Life Is Short You Must Be Sexy'
pattern = re.compile(r'Sexy')
ret = re.sub(pattern, 'sao', string)   # module-level form
ret2 = pattern.sub('lang', string)     # compiled-pattern form
print(ret)
print(ret2)

def func(a):
    """Replacement callback for re.sub: subtract 3 from the matched number.

    Parameters: a -- an re.Match object whose group() is a digit string.
    Returns the decremented value as a string (re.sub requires str).
    """
    return str(int(a.group()) - 3)

string = '最佳身高为175cm'
# BUG FIX: the original pattern was r'd+' (a literal run of 'd' characters),
# so it never matched and the substitution was a no-op; \d+ matches digits.
pattern = re.compile(r'\d+')
ret2 = pattern.sub(func, string)
print(ret2)  # 175 -> 172

四、正则例子-爬取糗图图片

import urllib.request
import urllib.parse
import re
import  os

def create_request(url, page):
    """Build a Request for list page `page` with a browser User-Agent.

    The page URL is `<url><page>/` (qiushibaike's pagination scheme).
    """
    page_url = '%s%s/' % (url, page)
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    return urllib.request.Request(url=page_url, headers=ua)

def download_image(content):
    """Extract every thumbnail URL from a page's HTML and save the images.

    Files are written into ./qiutu/, named after the last URL segment.
    """
    # re.S so .*? can span the newlines inside each thumb <div>.
    thumb_re = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>', re.S)
    save_dir = 'qiutu'
    for src in thumb_re.findall(content):
        full_url = 'https:' + src          # the page uses protocol-relative src
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        filename = full_url.split('/')[-1]
        urllib.request.urlretrieve(full_url, save_dir + '/' + filename)

def main():
    """Prompt for a page range and download every image on each list page."""
    url = 'https://www.qiushibaike.com/pic/page/'

    start_page = int(input('起始页码:'))
    end_page = int(input('结束页码:'))

    # BUG FIX: range() excludes its stop value, so the final page the user
    # asked for was never downloaded; end_page + 1 makes the range inclusive
    # (and consistent with the other crawler in this file).
    for page in range(start_page, end_page + 1):
        print('第%s页开始下载...' % page)
        # build the request for this page
        req = create_request(url, page)

        # fetch and decode the page HTML
        rep = urllib.request.urlopen(req).read().decode()

        # parse the HTML and download the images it references
        download_image(rep)
        print('第%s页结束下载...' % page)

if __name__ == '__main__':
    main()

五、正则例子-爬取语录

import urllib.request
import urllib.parse
import re
import  os

def create_request(url, page=None):
    """Build a GET Request with a browser User-Agent.

    With `page` given, the target is `<url><page>.html` (list pages);
    with page=None the url is used unchanged (individual article pages).
    """
    # IDIOM FIX: compare against None with `is not`, not `!=` (PEP 8).
    if page is not None:
        url = url + str(page) + '.html'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    return urllib.request.Request(url=url, headers=header)

def get_content(href):
    """Fetch one quote page and return its body text with <img> tags removed.

    Raises IndexError if the page has no <div class="neirong"> block,
    matching the original's behavior.
    """
    html = urllib.request.urlopen(create_request(href)).read().decode()
    body_re = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    first_block = body_re.findall(html)[0]
    # strip embedded images, keep only the text/markup around them
    return re.compile(r'<img .*?>').sub('', first_block)

def parse_html(content):
    """Extract (link, title) pairs from a list page and append each quote
    article, wrapped in a minimal HTML page, to yulu.html."""

    # BUG FIX: the original pattern used 'd+' and '.html' — the backslashes
    # were missing, so 'd+' matched literal 'd' characters and the pattern
    # never found any links. \d+/\d+/\d+\.html matches the date-style paths.
    pattern = re.compile(
        r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>',
        re.S)
    title_list = pattern.findall(content)

    for relative_path, title in title_list:
        # absolute URL of the article page
        href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + relative_path

        # fetch the article body via get_content
        article = get_content(href)

        # BUG FIX: the original relied on adjacent string literals spread
        # over several lines with no continuation characters, which is a
        # SyntaxError; parentheses make the implicit concatenation valid.
        string = ('<!DOCTYPE html>'
                  '<html lang="en">'
                  '<head>'
                  '  <meta charset="UTF-8">'
                  '  <title>Title</title>'
                  '</head>'
                  '<body>'
                  '  <h1>%s</h1>%s'
                  '</body>' % (title, article))

        with open('yulu.html', 'a', encoding='utf8') as fp:
            fp.write(string)

def main():
    """Prompt for a page range and crawl every quote list page in it."""
    base_url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'

    first = int(input('起始页码:'))
    last = int(input('结束页码:'))

    # +1 so the range is inclusive of the last page the user asked for
    for page in range(first, last + 1):
        print('第%s页开始下载...' % page)

        # build and send the request for this list page
        request = create_request(base_url, page)
        html = urllib.request.urlopen(request).read().decode()

        # extract the article links and save each article
        parse_html(html)
        print('第%s页结束下载...' % page)

if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/Finance-IT-gao/p/11099529.html