Crawling All the Campus News

1. Get the click count from a news URL and the publication time (with type conversion), each wrapped in a function

  • Get the click count for a news URL
# Get the click count of a news article
def get_click_num(arg_news_url):
    click_id = re.findall(r"(\d{1,5})", arg_news_url)[-1]          # re.findall returns a list of all substrings the pattern matches
    click_url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(click_id)
    response = requests.get(click_url)
    news_click_num = int(response.text.split('.html')[-1].lstrip("('").rstrip("');"))  # lstrip/rstrip remove the matching leading/trailing characters
    return news_click_num
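The last line depends on the shape of the counter API's response, which is a small JavaScript snippet rather than plain JSON. As a minimal sketch with a hypothetical response string (the real payload may differ), the split/strip chain isolates the number like this:

# Hypothetical counter-API response, for illustration only
sample_text = "$('#todaydowns').html('12');$('#hits').html('5423');"
tail = sample_text.split('.html')[-1]          # -> "('5423');"
clicks = int(tail.lstrip("('").rstrip("');"))  # strip the "('" prefix and "');" suffix
print(clicks)                                  # 5423
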
  • Get the publication time and convert its type
# Get the publication time and convert it to a datetime
def get_news_date(showinfo):
    news_date = showinfo.split()[0].split(':')[1]
    news_time = showinfo.split()[1]
    news_dt = news_date + " " + news_time
    dt = datetime.strptime(news_dt, "%Y-%m-%d %H:%M:%S")
    return dt
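A quick way to see what the parsing expects: the .show-info text begins with a publish-time field followed by the clock time. The sample string below is hypothetical, and it must use the same colon character the code splits on:

from datetime import datetime

# Hypothetical .show-info text ("发布时间" means "publish time"), for illustration only
showinfo = "发布时间:2019-04-01 11:57:00  作者:admin"
news_date = showinfo.split()[0].split(':')[1]   # "2019-04-01"
news_time = showinfo.split()[1]                 # "11:57:00"
print(datetime.strptime(news_date + " " + news_time, "%Y-%m-%d %H:%M:%S"))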

2. Get the news details from a news URL

# Get the title for the given news URL, calling the publication-time and click-count helpers
def get_news_detail(arg_news_url):
    news_detail = {}
    response = requests.get(arg_news_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    news_detail['NewsTitle'] = soup.select('.show-title')[0].text
    showinfo = soup.select('.show-info')[0].text
    news_detail['NewsDateTime'] = get_news_date(showinfo)
    news_detail['NewsClickNum'] = get_click_num(arg_news_url)
    return news_detail
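Indexing with [0] raises an IndexError on any page that lacks the expected .show-title or .show-info elements. A small defensive wrapper, my own sketch rather than part of the original post, lets the crawl skip such pages:

def get_news_detail_safe(arg_news_url):
    # Return None instead of crashing when a page has an unexpected layout
    try:
        return get_news_detail(arg_news_url)
    except (IndexError, ValueError):
        return None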

3. Get the news URLs from the list-page URLs

# Collect the details of every news item on pages 22~32 into a list
def get_news_list():
    news_list = []
    page_urls = get_page_urls()
    for page_url in page_urls:
        sleep()
        response = requests.get(page_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                news_url = news.select('a')[0]['href']                      # link
                news_desc = news.select('.news-list-description')[0].text   # description
                news_dict = get_news_detail(news_url)
                news_dict['NewsUrl'] = news_url
                news_dict['Description'] = news_desc
                news_list.append(news_dict)
    return news_list
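The len(...) > 0 check matters because soup.select('li') also matches list items that are not news entries (pagers, navigation, and so on). A self-contained illustration with a hypothetical page fragment:

from bs4 import BeautifulSoup

# Hypothetical list-page fragment: only the first <li> is a real news entry
html = """<ul>
  <li><a href="http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0401/11111.html">
    <div class="news-list-title">Some title</div>
    <div class="news-list-description">Some description</div></a></li>
  <li>pager widget</li>
</ul>"""
soup = BeautifulSoup(html, 'html.parser')
for li in soup.select('li'):
    if len(li.select('.news-list-title')) > 0:   # skip non-news <li> elements
        print(li.select('a')[0]['href'])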

4. Get the URLs of all the pages, then use them to fetch every news item

# Build the URLs of campus-news pages 22~32 and store them in a list
def get_page_urls():
    page_urls = []
    for i in range(22, 33):          # range excludes its end value, so 33 covers pages 22~32
        page_url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        page_urls.append(page_url)
    return page_urls
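Because range excludes its end value, 33 is needed to reach page 32; a quick sanity check:

pages = list(range(22, 33))
print(pages[0], pages[-1], len(pages))   # 22 32 11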

5. Set a reasonable crawl interval

# Pause for a random interval of up to 3 seconds between requests
def sleep():
    time.sleep(random.random() * 3)
    return
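random.random() returns a float in [0.0, 1.0), so the pause ranges from 0 to just under 3 seconds, including near-zero waits. If a guaranteed minimum delay is preferred, one variant (my own sketch, not from the original post) is:

import random
import time

def sleep_at_least(min_s=1.0, max_s=3.0):
    # Always wait at least min_s seconds between requests
    time.sleep(random.uniform(min_s, max_s))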

6. Do simple data processing with pandas and save the result

# Build a DataFrame from the crawled news and save it as a CSV file
def save_file():
    news_df = pd.DataFrame(get_news_list())
    print(news_df)
    news_df.to_csv(r'f:\news.csv')                                          # save the CSV to drive F
    return
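The section title mentions simple processing with pandas, and the DataFrame makes that easy. As a sketch of what that could look like (the column names follow the keys set in get_news_detail and get_news_list; the encoding choice is my own addition to keep Chinese text readable in Excel):

import pandas as pd

news_df = pd.DataFrame(get_news_list())
# Ten most-clicked articles, as an example of simple processing
top10 = news_df.sort_values('NewsClickNum', ascending=False).head(10)
print(top10[['NewsTitle', 'NewsClickNum']])
news_df.to_csv(r'f:\news.csv', encoding='utf_8_sig', index=False)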

7. Run results

8. Complete source code

import re
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime


# Get the click count of a news article
def get_click_num(arg_news_url):
    click_id = re.findall(r"(\d{1,5})", arg_news_url)[-1]          # re.findall returns a list of all substrings the pattern matches
    click_url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(click_id)
    response = requests.get(click_url)
    news_click_num = int(response.text.split('.html')[-1].lstrip("('").rstrip("');"))  # lstrip/rstrip remove the matching leading/trailing characters
    return news_click_num


# Get the publication time and convert it to a datetime
def get_news_date(showinfo):
    news_date = showinfo.split()[0].split(':')[1]
    news_time = showinfo.split()[1]
    news_dt = news_date + " " + news_time
    dt = datetime.strptime(news_dt, "%Y-%m-%d %H:%M:%S")
    return dt


# Get the title for the given news URL, calling the publication-time and click-count helpers
def get_news_detail(arg_news_url):
    news_detail = {}
    response = requests.get(arg_news_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    news_detail['NewsTitle'] = soup.select('.show-title')[0].text
    showinfo = soup.select('.show-info')[0].text
    news_detail['NewsDateTime'] = get_news_date(showinfo)
    news_detail['NewsClickNum'] = get_click_num(arg_news_url)
    return news_detail


# Build the URLs of campus-news pages 22~32 and store them in a list
def get_page_urls():
    page_urls = []
    for i in range(22, 33):          # range excludes its end value, so 33 covers pages 22~32
        page_url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        page_urls.append(page_url)
    return page_urls


# Collect the details of every news item on pages 22~32 into a list
def get_news_list():
    news_list = []
    page_urls = get_page_urls()
    for page_url in page_urls:
        sleep()
        response = requests.get(page_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        for news in soup.select('li'):
            if len(news.select('.news-list-title')) > 0:
                news_url = news.select('a')[0]['href']                      # link
                news_desc = news.select('.news-list-description')[0].text   # description
                news_dict = get_news_detail(news_url)
                news_dict['NewsUrl'] = news_url
                news_dict['Description'] = news_desc
                news_list.append(news_dict)
    return news_list


# Pause for a random interval of up to 3 seconds between requests
def sleep():
    time.sleep(random.random() * 3)
    return


# Build a DataFrame from the crawled news and save it as a CSV file
def save_file():
    news_df = pd.DataFrame(get_news_list())
    print(news_df)
    news_df.to_csv(r'f:\news.csv')                                          # save the CSV to drive F
    return


save_file()
Original article: https://www.cnblogs.com/trojans/p/10697512.html