A Roundup of Web Scraping Projects

Fetching data without logging in

# coding=utf-8
"""
Wrap the crawling task in a class.
Goal: fetch every movie/TV item for a given region on Douban, ordered by popularity.
Approach:
    analyze the target URL with Chrome devtools,
    build the URL,
    send the request and get the data,
    save the data,
    repeat the last three steps until the final page.
Note: the URLs used in this code are no longer valid.
"""
import requests
import json


class DoubanSpider:
    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "CN"
            }
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/movie/"
        }

    def parse_url(self, url):  # send the request, get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]  # total number of items; not necessarily accurate
        return content_list, total

    def save_content_list(self, content_list, country):  # save
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline after each record
        print("Saved successfully")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0  # num is the start parameter in the URL, i.e. the item offset of the current page
            total = 100  # placeholder so the first page is always requested
            while num < total + 18:  # strictly less than: equality would mean the previous request already fetched the last page
                # 1. build the URL
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)

                # 4. save every page as it comes in, rather than saving everything at the end,
                #    so a failure halfway through does not throw away what was already fetched
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:  # checking the page size also works as an end-of-data test
                #     break
                # 5. build the next page's URL and loop
                num += 18


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
01. Douban: fetching the most popular movie info
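For reference, get_content_list above assumes the API response is JSON shaped roughly as in the sketch below. Only the two keys the spider actually reads, subject_collection_items and total, come from the code; every other field and value is invented for illustration.

# Illustrative only: assumed shape of the Douban API response parsed by get_content_list.
# The keys "subject_collection_items" and "total" are the ones the spider reads; the
# item fields and values are placeholders, not real API output.
sample_response = {
    "total": 54,
    "subject_collection_items": [
        {"title": "some show", "rating": {"value": 8.1}},  # one dict per show (fields assumed)
    ],
}

items = sample_response["subject_collection_items"]
total = sample_response["total"]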
# coding=utf-8
import requests
from retrying import retry

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}


@retry(stop_max_attempt_number=3)  # retry the request up to 3 times before giving up
def _parse_url(url, method, data, proxies):
    print("*" * 20)
    if method == "POST":
        response = requests.post(url, data=data, headers=headers, proxies=proxies)
    else:
        response = requests.get(url, headers=headers, timeout=3, proxies=proxies)
    assert response.status_code == 200
    return response.content.decode()


def parse_url(url, method="GET", data=None, proxies={}):
    try:
        html_str = _parse_url(url, method, data, proxies)
    except Exception as e:
        html_str = None

    return html_str


if __name__ == '__main__':
    url = "https://www.baidu.com"  # the scheme is required; a bare "www.baidu.com" raises MissingSchema
    print(parse_url(url))
02-1. Generic Tieba spider - parse_url.py
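A quick usage note: the proxies parameter of parse_url is never exercised in these projects. A minimal sketch of calling it through a proxy follows; the proxy address is a placeholder, not a working endpoint.

# Minimal sketch: routing parse_url through a proxy. The address below is hypothetical;
# substitute a proxy you actually control.
from parse_url import parse_url

proxies = {
    "http": "http://127.0.0.1:8888",   # placeholder local proxy
    "https": "http://127.0.0.1:8888",
}
html = parse_url("https://www.baidu.com", proxies=proxies)
print(html is not None)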
from parse_url import parse_url
from lxml import etree
import json
"""
Crawl any Tieba forum: the title and comment count of each post in the thread list,
plus the images on every page of each post's detail view, and save everything to a file.
Crawled: 2019/3
"""


class TieBa:
    def __init__(self, name):
        self.name = name
        self.start_url = f"https://tieba.baidu.com/f?kw={name}&ie=utf-8&pn=0"
        self.root_url = "https://tieba.baidu.com"

    def etree_get_content(self, text):
        html = etree.HTML(text)
        li_list = html.xpath("//li[@class=' j_thread_list clearfix']")  # group the posts
        data = []
        for i in li_list:
            # print(etree.tostring(i).decode())
            item = {}
            item["title"] = i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()")[0] if i.xpath(
                ".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()") else None
            item["comments"] = i.xpath(".//span[@class='threadlist_rep_num center_text']/text()")[0] if i.xpath(
                ".//span[@class='threadlist_rep_num center_text']/text()") else None
            item["href"] = self.root_url + i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href")[
                0] if i.xpath(
                ".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href") else None
            item["imgs"] = self.get_img_list(item["href"], [])
            data.append(item)
        next_url = html.xpath(".//a[text()='下一页>']/@href")[0] if html.xpath(".//a[text()='下一页>']/@href") else None
        # on the last page there is no "next" link; return None so run() can stop the loop
        return data, "https:" + next_url if next_url is not None else None

    def pre_html(self, text):
        """
        Strip the HTML comment markers that Tieba wraps its list markup in,
        so lxml can see the content.
        """
        text = text.replace('<!--', '')
        return text.replace('-->', '')

    def get_img_list(self, next_url, container):
        """Recursively crawl the images posted on every page of a post's detail view."""
        if next_url is None:
            return container
        detail_content = parse_url(next_url)
        # extract this page's images and the next-page link
        html = etree.HTML(detail_content)
        img_list = html.xpath("//img[@class='BDE_Image']/@src")  # list of image src values
        container.extend(img_list)
        next_url = html.xpath(".//a[text()='下一页']/@href")  # next-page href
        if next_url:  # non-empty list means there is another page
            # the return here is optional: container is a mutable list, so the final
            # "return container" below would expose the same object anyway; returning
            # now just skips one extra step per recursion level
            return self.get_img_list(self.root_url + next_url[0], container)
        return container

    def save_content_dict(self, data):
        file_path = self.name + ".txt"
        with open(file_path, 'a+', encoding='utf8') as f:
            for dd in data:
                f.write(json.dumps(dd, ensure_ascii=False))
                f.write('\n')

    def run(self):
        # first page
        next_url = self.start_url
        # loop over every page
        while next_url is not None:
            html_str = parse_url(next_url)
            # pre-process the response
            html_str = self.pre_html(html_str)
            # extract this page's content and the next-page URL
            data, next_url = self.etree_get_content(html_str)
            print(data)
            self.save_content_dict(data)


if __name__ == '__main__':
    name = input("Enter a Tieba forum name: ").strip()
    ba = TieBa(name)
    ba.run()
02-2. Generic Tieba spider - main.py
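The remark inside get_img_list about the optional return is easier to see in isolation. Below is a minimal, self-contained sketch of the same accumulator pattern; nothing in it is Tieba-specific, and the chain of detail pages is faked with a plain list of lists.

# Minimal sketch of recursion with a mutable accumulator, mirroring get_img_list.
# "pages" stands in for the chain of detail pages; each inner list is that page's images.
def collect(pages, index, container):
    if index >= len(pages):          # no next page
        return container
    container.extend(pages[index])   # the same list object is mutated at every level
    return collect(pages, index + 1, container)  # return is optional but saves a step

imgs = collect([["a.jpg"], ["b.jpg", "c.jpg"]], 0, [])
print(imgs)  # ['a.jpg', 'b.jpg', 'c.jpg']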
 1 """
 2 抓取https://www.qiushibaike.com所有热门 中数据
 3 爬取时间:2019/4
 4 """
 5 from parse_url import parse_url
 6 from lxml import etree
 7 import json
 8 
 9 
10 class QiuShi:
11 
12     def __init__(self):
13         self.start_url = "https://www.qiushibaike.com/8hr/page/{}/"  # 根据规律构建全部url地址
14         self.part_url = "https://www.qiushibaike.com"
15 
16     def etree_get_content(self, text):
17         etree_elemnt = etree.HTML(text)
18         # 先分组
19         content_list = etree_elemnt.xpath("//div[@class='recommend-article']/ul/li")
20         data = []
21         for li in content_list:
22             item = {}
23             try:
24                 item['title'] = li.xpath(".//a[@class='recmd-content']/text()")[0] if li.xpath(".//a[@class='recmd-content']/text()") else None
25                 item['href'] = self.part_url + li.xpath(".//a[@class='recmd-content']/@href")[0] if li.xpath(".//a[@class='recmd-content']/@href") else None
26                 item['laugh_num'] = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
27                 item['comment_num'] = li.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
28             except Exception as e:
29                 print(e)
30                 continue
31             data.append(item)
32         return data
33 
34     def save_content_dict(self, data):
35         file_path = "糗事百科热门.txt"
36         with open(file_path, 'a+', encoding='utf8') as f:
37             for dd in data:
38                 f.write(json.dumps(dd, ensure_ascii=False))
39                 f.write('
')
40 
41     def run(self):
42         # 构建url地址列表
43         for i in range(1, 14):
44             # 获取每一页目标响应
45             html_str = parse_url(self.start_url.format(i))
46             # 解析页面
47             data = self.etree_get_content(html_str)
48             # 每一页保存一次
49             self.save_content_dict(data)
50 
51 
52 if __name__ == '__main__':
53     q = QiuShi()
54     q.run()
03. Scraping qiushibaike - single-threaded
 1 """
 2 多线程 抓取https://www.qiushibaike.com所有热门 中数据,
 3 但是该网站布局已经改版了,部分xpath解析的位置已经不在了
 4 爬取时间:2017/10
 5 """
 6 import requests
 7 from lxml import etree
 8 import threading
 9 from queue import Queue
10 
11 
12 class QiubaiSpdier:
13     def __init__(self):
14         self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
15         self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
16         self.url_queue = Queue()
17         self.html_queue  = Queue()
18         self.content_queue = Queue()
19 
20     def get_url_list(self):
21         # return [self.url_temp.format(i) for i in range(1,14)]
22         for i in range(1,4):
23             self.url_queue.put(self.url_temp.format(i))
24 
25     def parse_url(self):
26         while True:
27             url = self.url_queue.get()
28             print(url)
29             response = requests.get(url,headers=self.headers)
30             self.html_queue.put(response.content.decode())
31             self.url_queue.task_done() # 注意必须url的get并处理好url的响应put到对应的队列后,再调用url的task_done使计数减一
32 
33     def get_content_list(self): # 提取数据
34         while True:
35             html_str = self.html_queue.get()
36 
37             html = etree.HTML(html_str)
38             div_list = html.xpath("//div[@id='content-left']/div")  #分组
39             content_list = []
40             for div in div_list:
41                 item= {}
42                 item["content"] = div.xpath(".//div[@class='content']/span/text()")
43                 item["content"] = [i.replace("
","") for i in item["content"]]
44                 item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
45                 item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
46                 item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
47                 item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
48                 item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
49                 item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
50                 item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
51                 item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
52                 content_list.append(item)
53             self.content_queue.put(content_list)
54             self.html_queue.task_done() # 注意task_done放在put后面,确保get的结果处理完并且已经put都对应的队列中
55 
56     def save_content_list(self): # 保存
57         while True:
58             content_list = self.content_queue.get()
59             for i in content_list:
60                 # print(i)
61                 pass
62             self.content_queue.task_done()
63 
64     def run(self): #实现主要逻辑
65         thread_list = []
66         #1.url_list
67         t_url = threading.Thread(target=self.get_url_list)
68         thread_list.append(t_url)
69         #2.遍历,发送请求,获取响应
70         for i in range(20):
71             t_parse = threading.Thread(target=self.parse_url)
72             thread_list.append(t_parse)
73         #3.提取数据
74         for i in range(2):
75             t_html = threading.Thread(target=self.get_content_list)
76             thread_list.append(t_html)
77         #4.保存
78         t_save = threading.Thread(target=self.save_content_list)
79         thread_list.append(t_save)
80         for t in thread_list:
81             t.setDaemon(True) # 把子线程设置为守护线程,主线程结束,子线程结束
82             t.start()
83 
84         for q in [self.url_queue,self.html_queue,self.content_queue]:
85             # 调用此方法让主线程阻塞,直到队列中所有的项目均被处理。阻塞将持续到队列中的每个项目均调用q.task_done()方法为止
86             q.join()
87 
88 if __name__ == '__main__':
89     qiubai = QiubaiSpdier()
90     qiubai.run()
91     print("主线程结束")
03. Scraping qiushibaike - multi-threaded
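The queue coordination described in the comments above (task_done only after the result is put on the next queue, q.join in the main thread, daemon workers) is the standard Queue producer-consumer pattern. Here is a minimal self-contained sketch of just that pattern, with the scraping replaced by a trivial transform:

# Minimal sketch of the Queue / task_done / join pattern used above.
# Workers are daemon threads; the main thread exits once both queues report all items handled.
import threading
from queue import Queue

in_q, out_q = Queue(), Queue()

def worker():
    while True:
        n = in_q.get()
        out_q.put(n * n)      # hand the result to the next stage first
        in_q.task_done()      # ...then mark this item as processed

def printer():
    while True:
        print(out_q.get())
        out_q.task_done()

for target in (worker, printer):
    t = threading.Thread(target=target)
    t.daemon = True
    t.start()

for n in range(5):
    in_q.put(n)

for q in (in_q, out_q):
    q.join()  # blocks until task_done has been called for every item put on q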
 1 """
 2 第一页:
 3 Request URL: https://www.douyu.com/directory/all
 4 Request Method: GET
 5 
 6 
 7 第二页:
 8 Request URL: https://www.douyu.com/gapi/rkc/directory/0_0/2
 9 Request Method: GET
10 
11 爬取时间:2019/4
12 """
13 import json
14 import time
15 from retrying import retry
16 from selenium import webdriver
17 from selenium.webdriver.chrome.options import Options
18 
19 chrome_options = Options()
20 chrome_options.add_argument('--headless')  # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
21 
22 class DouYuSpider:
23     def __init__(self):
24         self.start_url = 'https://www.douyu.com/directory/all'
25         self.driver = webdriver.Chrome(chrome_options=chrome_options)
26 
27     def save_content_dict(self, data):
28         file_path = 'douyu-room' + ".txt"
29         with open(file_path, 'a+', encoding='utf8') as f:
30             for dd in data:
31                 f.write(json.dumps(dd, ensure_ascii=False))
32                 f.write('
')
33             f.flush()
34 
35     @retry(stop_max_attempt_number=3)
36     def get_next_page_click(self):
37         next_page = self.driver.find_elements_by_xpath("//li[@class=' dy-Pagination-next']/span")
38         # 最后一页的 下一页 父元素 class=‘dy-Pagination-disabled dy-Pagination-next’ 表示不可点击了
39         if len(next_page) == 0:
40             return -1
41         else:
42             next_page[0].click()
43 
44     def get_single_page(self):
45         # 先分组
46         room_list = self.driver.find_elements_by_xpath(
47             "//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
48         data = []
49         for room in room_list:
50             item = {}
51             item['title'] = room.find_element_by_xpath(".//h3[@class='DyListCover-intro']").text
52             item['zone'] = room.find_element_by_xpath(".//span[@class='DyListCover-zone']").text
53             # item['img'] = room.find_element_by_xpath(".//img[@class='DyImg-content is-normal']").get_attribute(
54             #     'src')
55             item['anchor_name'] = room.find_element_by_xpath(".//h2[@class='DyListCover-user']").text
56             data.append(item)
57         return data
58 
59 
60     def run(self):
61         # 第一页
62         self.driver.get(self.start_url)
63         self.driver.implicitly_wait(12)
64 
65         while True:
66             # 获取每一页的页面结构化数据
67             data = self.get_single_page()
68             # 保存数据
69             self.save_content_dict(data)
70             # 查找下一页url,并点击
71             try:
72                 ret = self.get_next_page_click()
73                 time.sleep(2) # 等待页面加载完全
74                 if ret == -1:
75                     break
76             except Exception as e:
77                 print(e)
78 
79         self.driver.quit()
80 
81 
82 if __name__ == '__main__':
83     douyu = DouYuSpider()
84     douyu.run()
85 
86 """
87 优化建议:
88 1.把每一页的self.driver.page_source 页面字符串传给lxml的etree去处理
89 2.staleness_of 尝试失败 https://www.mlln.cn/2018/05/22/python-selenium如何在点击后等待页面刷新
90 
91 """
04. Scraping Douyu live-stream room info
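A rough sketch of improvement 1 above: hand the rendered page to lxml once per page instead of making many driver round-trips. The xpaths are copied from get_single_page; whether they still match the current Douyu markup is not verified here.

# Sketch of improvement 1: parse driver.page_source with lxml instead of querying
# elements through Selenium. XPaths are taken from get_single_page and may be stale.
from lxml import etree

def get_single_page_lxml(driver):
    html = etree.HTML(driver.page_source)
    rooms = html.xpath("//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
    data = []
    for room in rooms:
        item = {
            "title": room.xpath(".//h3[@class='DyListCover-intro']/text()"),
            "zone": room.xpath(".//span[@class='DyListCover-zone']/text()"),
            "anchor_name": room.xpath(".//h2[@class='DyListCover-user']/text()"),
        }
        # xpath returns lists; keep the first match or None
        data.append({k: (v[0] if v else None) for k, v in item.items()})
    return data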

Automatic login examples

 1 """
 2 套路:登录首页的时候,已经给浏览器设置cookies,此时未激活
 3 登录成功后返回假的cookies,激活未激活的cookies,
 4 
 5 """
 6 import requests
 7 from bs4 import BeautifulSoup
 8 
 9 headers = {
10     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
11 }
12 
13 index = requests.get("https://dig.chouti.com/", headers=headers)
14 cookies = index.cookies.get_dict()
15 
16 
17 # ===========================点赞=================
18 
19 # 1.登录
20 login = requests.post(
21     "https://dig.chouti.com/login",
22     data={
23         "phone": 8615026809593,
24         "password":'dajiahaa',
25     },
26     headers=headers,
27     cookies=cookies)
28 
29 # 2.点赞
30 dizan = requests.post(
31     url="https://dig.chouti.com/link/vote?linksId=25389911",
32     cookies=cookies,
33     headers=headers)
34 
35 print(dizan.text)
01. Chouti (dig.chouti.com)
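The manual cookie passing above can also be delegated to requests.Session, which stores cookies from every response and sends them on later requests automatically. A minimal sketch of the same homepage -> login -> vote flow follows; the credentials are placeholders.

# Sketch: the same flow using requests.Session, which carries cookies between requests
# automatically. The phone number and password below are placeholders.
import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
}

session = requests.Session()
session.headers.update(headers)

session.get("https://dig.chouti.com/")  # picks up the initial cookies
session.post("https://dig.chouti.com/login",
             data={"phone": "86130xxxxxxxx", "password": "your-password"})
resp = session.post("https://dig.chouti.com/link/vote?linksId=25389911")
print(resp.text)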
 1 """
 2 套路:
 3 - 带请求头
 4 - 带cookie
 5 - 请求体中:
 6     commit:Sign in
 7     utf8:✓
 8     authenticity_token:放在页面隐藏表单中
 9     login:asdfasdfasdf
10     password:woshiniba8
11 
12 """
13 import requests
14 from bs4 import BeautifulSoup
15 
16 headers = {
17     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
18 }
19 
20 login = requests.get(
21     "https://github.com/login",
22     headers=headers,
23 )
24 cookies = login.cookies.get_dict()
25 login_par = BeautifulSoup(login.content, 'html.parser')
26 token_input = login_par.find(name='input', attrs={"name": "authenticity_token"})
27 
28 authenticity_token = token_input.attrs.get("value")
29 # 1.登录
30 re_login = requests.post(
31     "https://github.com/session",
32     data={
33         "commit": "Sign in",
34         "utf8":"",
35         "login": "cpcp@163.com",
36         "password": 'cs11187',
37         "authenticity_token": authenticity_token,
38         "webauthn-support": "supported"
39     },
40     cookies=cookies,
41     headers={
42         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
43         "Referer": "https://github.com/login"
44     }
45 )
46 
47 print(re_login.text)
02.github
Original article: https://www.cnblogs.com/carlous/p/10624842.html