爬虫：今日头条

知识点：

　　1.urlencode的使用

　　2.md5摘要（哈希）方法

　　3.os的使用

　　4.字典的get方法（用于解析返回的json数据）

遇到的问题：

　　1.返回的json中data字段为空（None）：原因是请求时没有添加headers和cookie

import time
import requests
import urllib.parse
def get_page(offset, timestamp):
    """Request one page of Toutiao search results for the keyword '街拍'.

    This first attempt sends the request without custom headers or
    cookies; the API was observed to answer such requests with
    ``'data': None``.

    Args:
        offset: Paging offset sent as the ``offset`` query parameter.
        timestamp: Value sent as the ``timestamp`` query parameter
            (the caller passes the current Unix time as an int).

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, or a body
        that is not valid JSON).
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': timestamp,
    }

    # urlencode percent-encodes the values (including the Chinese keyword).
    url = 'https://www.toutiao.com/api/search/content/?' + urllib.parse.urlencode(params)

    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Decode the body once and reuse it for both print and return.
            result = response.json()
            print(result)
            return result
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts;
        # ValueError covers a body that cannot be decoded as JSON.
        print('请求失败')
    return None

if __name__ == '__main__':
    # Fetch the page at offset 20, stamped with the current Unix time.
    now = int(time.time())
    get_page(20, now)

结果：

{'count': 0, 'return_count': 0, 'query_id': '6537385837821170952', 'has_more': 0, 'request_id': '20200203144241010014017021172B7CF7', 'search_id': '20200203144241010014017021172B7CF7', 'cur_ts': 1580712161, 'offset': 40, 'message': 'success', 'pd': 'synthesis', 'show_tabs': 1, 'keyword': '街拍', 'city': '永州', 'log_pb': {'impr_id': '20200203144241010014017021172B7CF7', 'is_incognito': 0}, 'data': None, 'data_head': [{'challenge_code': 1366, 'cell_type': 71, 'keyword': '街拍', 'url': 'sslocal://search?keyword=%E8%A1%97%E6%8B%8D&from=&source=search_tab'}], 'ab_fields': None, 'latency': 0, 'search_type': 2, 'tab_rank': None, 'temp_type': 0, 'tab_list': None}

正确结果：

import urllib.parse
import os
from hashlib import md5


# Request headers: identify the client with a desktop Chrome User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

# Cookies sent with every search request (presumably captured from a
# browser session -- they will expire and need refreshing).
# requests' ``cookies=`` argument expects one entry per cookie
# (name -> value), so the raw "Cookie" header string is split into its
# individual cookies; the duplicated ``tt_webid`` entry is kept once.
cookie = {
    'tt_webid': '6789077713942971916',
    's_v_web_id': 'k65z596n_stTrkCwT_ni5x_4sJa_BH3v_HlSjP1Rsc1ko',
    'WEATHER_CITY': '%E5%8C%97%E4%BA%AC',
    'csrftoken': 'b898a5247b610a1bd24735bbdccc8a8a',
    '__tasessionId': 'tbdg6xx8b1580707187238',
}
def get_page(offset, timestamp):
    """Request one page of Toutiao search results for the keyword '街拍'.

    The request carries the module-level ``headers`` and ``cookie``
    mappings.

    Args:
        offset: Paging offset sent as the ``offset`` query parameter.
        timestamp: Value sent as the ``timestamp`` query parameter
            (the caller passes the current Unix time as an int).

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, or a body
        that is not valid JSON).
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': timestamp,
    }

    # urlencode percent-encodes the values (including the Chinese keyword).
    url = 'https://www.toutiao.com/api/search/content/?' + urllib.parse.urlencode(params)

    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, headers=headers, cookies=cookie, timeout=10)
        if response.status_code == 200:
            # Decode the body once and reuse it for both print and return.
            result = response.json()
            print(result)
            return result
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts;
        # ValueError covers a body that cannot be decoded as JSON.
        print('请求失败')
    return None

if __name__ == '__main__':
    # Fetch the page at offset 20, stamped with the current Unix time.
    now = int(time.time())
    get_page(20, now)


#结果
{'count': 20, 'return_count': 20, 'query_id': '6537385837821170952', 'has_more': 1, 'request_id': '202002031444450100120260661F2BF201',
 'search_id': '202002031444450100120260661F2BF201', 'cur_ts': 1580712286, 
'offset': 40, 'message': 'success', 'pd': 'synthesis', 'show_tabs': 1, 'keyword': '街拍', 'city': '永州', 'tokens': ['街拍'], 
'log_pb': {'impr_id': '202002031444450100120260661F2BF201', 'is_incognito': 0},
 'data': [{'abstract': '', 'app_info': {'db_name': 'R_SITE', 'page_type': '1', 'query_type': 'SearchAggregationInternalQueryType'},
 'article_url': 'http://toutiao.com/item/6786802624487227908/', 'behot_time': '1580175623', 'comment_count': 0, '。。。。等等等

完整代码：

# -*- coding:utf-8 -*-
import time
import requests
import urllib.parse
import os
from hashlib import md5

# Request headers: identify the client with a desktop Chrome User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

# Cookies sent with every search request (presumably captured from a
# browser session -- they will expire and need refreshing).
# requests' ``cookies=`` argument expects one entry per cookie
# (name -> value), so the raw "Cookie" header string is split into its
# individual cookies; the duplicated ``tt_webid`` entry is kept once.
cookie = {
    'tt_webid': '6789077713942971916',
    's_v_web_id': 'k65z596n_stTrkCwT_ni5x_4sJa_BH3v_HlSjP1Rsc1ko',
    'WEATHER_CITY': '%E5%8C%97%E4%BA%AC',
    'csrftoken': 'b898a5247b610a1bd24735bbdccc8a8a',
    '__tasessionId': 'tbdg6xx8b1580707187238',
}
def get_page(offset, timestamp):
    """Request one page of Toutiao search results for the keyword '街拍'.

    The request carries the module-level ``headers`` and ``cookie``
    mappings.

    Args:
        offset: Paging offset sent as the ``offset`` query parameter.
        timestamp: Value sent as the ``timestamp`` query parameter
            (the caller passes the current Unix time as an int).

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, or a body
        that is not valid JSON).
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': timestamp,
    }

    # Key point 1: urlencode turns the dict into a percent-encoded query string.
    url = 'https://www.toutiao.com/api/search/content/?' + urllib.parse.urlencode(params)

    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        response = requests.get(url, headers=headers, cookies=cookie, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts;
        # ValueError covers a body that cannot be decoded as JSON.
        print('请求失败')
    return None



def get_image(myjson):
    """Yield one record per image found in a decoded search response.

    Args:
        myjson: Decoded JSON response (a dict) whose ``'data'`` entry is a
            list of result items, or ``None`` when the request failed.

    Yields:
        dict: ``{'title': <item title>, 'image': <image URL>}`` for every
        image that has a URL. Items without a title, items without an
        ``'image_list'``, and images without a ``'url'`` are skipped.
    """
    if not myjson:
        # Nothing to parse (e.g. the page request failed).
        return
    # Key point 2: dict.get returns None instead of raising for a missing
    # key; "or []" turns a missing/None value into an empty iteration.
    for item in myjson.get('data') or []:
        title = item.get('title')
        if not title:
            continue
        for image in item.get('image_list') or []:
            if not image:
                continue
            image_url = image.get('url')
            if not image_url:
                # A record without a URL cannot be downloaded.
                continue
            # Key point 3: yield makes this function a generator, so
            # records are produced lazily, one at a time.
            yield {
                'title': title,
                'image': image_url,
            }
def save_image(item):
    """Download one image into a directory named after its title.

    The file is named after the MD5 hex digest of the downloaded bytes,
    so an image whose content was already saved is not written again.

    Args:
        item: Mapping with a ``'title'`` entry (used as the directory
            name) and an ``'image'`` entry (the URL to download).
    """
    title = item.get('title')
    # Key point 4: os module. exist_ok avoids a FileExistsError when two
    # pool workers try to create the same directory at the same time.
    os.makedirs(title, exist_ok=True)
    try:
        # A timeout keeps a stalled download from blocking the worker forever.
        response = requests.get(url=item.get('image'), headers=headers, timeout=10)
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed URLs.
        print('下载失败')
        return

    if response.status_code != 200:
        print('访问失败')
        return

    # Key point 5: MD5 digest of the content. The extension is passed
    # without a leading dot so the name is "<digest>.jpg", not "<digest>..jpg".
    file_path = '{0}/{1}.{2}'.format(title, md5(response.content).hexdigest(), 'jpg')

    if os.path.exists(file_path):
        print('文件已下载', file_path)
        return
    # Key point 6: file modes. 'wb' writes the raw bytes unchanged.
    with open(file_path, 'wb') as fp:
        fp.write(response.content)


from multiprocessing.pool import Pool


def main(offset):
    """Crawl one search-result page and download every image it lists.

    Args:
        offset: Paging offset forwarded to ``get_page``.
    """
    # Named "page" rather than "json" so the stdlib module name is not shadowed.
    page = get_page(offset, int(time.time()))
    if page is None:
        # get_page returns None when the request failed or was not
        # answered with HTTP 200; there is nothing to parse then.
        return
    for item in get_image(page):
        print(item)
        save_image(item)


# First and last page index to crawl; each index is multiplied by 20 to
# obtain the request offset.
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    # One worker process per CPU by default; each worker handles whole pages.
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()

知识点回顾

一、urlencode的使用

import urllib.parse


url = 'http://www.baidu.com/'

params = {
    'kw': '美女',
    'time': 'time',
}

# urlencode percent-encodes every value and joins the pairs with '&'.
query_string = urllib.parse.urlencode(params)

# A '?' must separate the path from the query string; without it the
# parameters would become part of the path.
new_url = url + '?' + query_string
print(new_url)


# Result:
# http://www.baidu.com/?kw=%E7%BE%8E%E5%A5%B3&time=time

二、字典的get方法

my_dict = {
    'title': '街拍美女',
    'image': 'http://www.baidu.com/',
}

# dict.get looks a key up and returns its value (None when the key is absent).
for key in ('title', 'image'):
    print(my_dict.get(key))


#结果
街拍美女
http://www.baidu.com/

三、yield

def foo():
    """Generator that yields 4 forever and prints the value it is resumed with."""
    print('start..')
    while True:
        # Execution pauses here after handing 4 to the caller. When the
        # generator is resumed, res receives the value passed in via
        # send(); a plain next() resumes it with None.
        res= yield 4
        print('res:',res)

# Calling foo() only creates the generator object; the body has not run yet.
r= foo()
print(type(r))
# First next(): runs the body up to the first yield and returns 4.
print(next(r))
print('*'*20)
# Second next(): resumes after the yield (res is None), loops, yields 4 again.
print(next(r))


#结果
<class 'generator'>
start..
4
********************
res: None
4


#代码解释：
1.foo函数体内含有yield关键字，所以调用foo()时函数体并不会执行，而是返回一个生成器对象 r
2.第一次执行next方法的时候，函数体才开始执行，打印了start..，进入while循环
3.程序遇到关键字yield，把 4 返回给调用方并在此暂停，此时res还没有被赋值；next执行完成，输出了 start.. 和 4 这两行
4.程序执行打印20个*
5.继续执行next方法，这次是从上次暂停的yield处继续执行：yield表达式的值是调用方通过send()传入的值，而next(r)相当于r.send(None)，所以res被赋值为None，打印出 res: None
6.程序又一次进入while循环，再次在yield处返回4，所以又打印了一次 4


#要点：
1.带yield的函数是生成器函数，调用它得到的是一个生成器对象，函数体不会立即执行
2.要执行函数体需要调用next方法（或用for循环迭代），每一次next都是接着上一次next暂停的地方继续执行的

四、os模块

import os
# os.getcwd() returns the current working directory as a string.
current_cd = os.getcwd()
print(current_cd)

#结果
C:\Users\egon\PycharmProjects\pyCrawler\Ajax练习

参考见：https://www.cnblogs.com/yufeihlf/p/6179547.html

五、MD5摘要（常被称为“MD5加密”，实际上是不可逆的哈希算法）

import hashlib

# Create an empty MD5 hash object, feed it data, then read the hex digest.
m=hashlib.md5()
m.update(b'123')  # update() only accepts bytes-like input, hence the b'' literal
print(m.hexdigest())

#结果
202cb962ac59075b964b07152d234b70

update(arg)传入arg对象来更新hash对象。必须注意的是，该方法只接受bytes类型（字节串），传入str会报TypeError。字符串字面量前的b前缀表示这是一个bytes字面量（并不是类型转换）；对于已有的str，需要先用encode()编码成bytes。

常见用法：

# One-liner: pass the bytes straight to the md5() constructor.
digest = hashlib.md5(b'123').hexdigest()
print(digest)


# Non-ASCII text must be encoded to bytes before it can be hashed.
data = '你好世界'
encoded = data.encode('utf-8')
print(hashlib.md5(encoded).hexdigest())



参考：https://www.cnblogs.com/lanston1/p/11025881.html

六、文件读写模式

r 以只读方式打开 (默认模式，文件必须存在)
w 以写方式打开：文件若存在，首先清空；不存在则创建
a 以追加模式打开 (从 EOF 开始, 必要时创建新文件)
r+ 以读写模式打开
w+ 以读写模式打开 (参见 w )
a+ 以读写模式打开 (参见 a )
rb 以二进制读模式打开
wb 以二进制写模式打开 (参见 w )
ab 以二进制追加模式打开 (参见 a )
rb+ 以二进制读写模式打开 (参见 r+ )
wb+ 以二进制读写模式打开 (参见 w+ )
ab+ 以二进制读写模式打开 (参见 a+ )