爬取全部的校园新闻

作业要求来源:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002


0.从新闻url获取点击次数,并整理成函数

  • newsUrl
  • newsId(re.search())
  • clickUrl(str.format())
  • requests.get(clickUrl)
  • re.search()/.split()
  • str.lstrip(),str.rstrip()
  • int
  • 整理成函数
  • 获取新闻发布时间及类型转换也整理成函数
 1 # 获取新闻url点击次数
 2 def getClickUrl(url):
 3     res2 = requests.get(url)
 4     return int(re.findall(r"$('#hits').html('(d+)", res2.text)[0])
 5 
 6 # 获取新闻发布时间及类型转换
 7 def getNewTime(url):
 8     res = requests.get(url)
 9     print(res.encoding)
10     res.encoding = 'utf-8'
11 
12     soup = BeautifulSoup(res.text, 'html.parser')
13 
14     info = soup.select('.show-info')[0].text
15     info = info.split("xa0xa0")
16     for i in info:
17         if (':' in i):
18             temp = i.split("发布时间:")
19             return datetime.datetime.strptime(temp[1], "%Y-%m-%d %H:%M:%S ")

1.从新闻url获取新闻详情: 字典,anews

2.从列表页的url获取新闻url:列表append(字典) alist

3.生成所有列表页的url并获取全部新闻 :列表extend(列表) allnews

*每个同学爬学号尾数开始的10个列表页

4.设置合理的爬取间隔

import time

import random

time.sleep(random.random()*3)

5.用pandas做简单的数据处理并保存

保存到csv或excel文件 

newsdf.to_csv(r'F:\duym\爬虫\gzccnews.csv')

 1 def getNewList(new_page):
 2     new_list = []
 3     response = requests.get(new_page)
 4     response.encoding = 'utf-8'
 5     resopnse = response.text
 6     soup = BeautifulSoup(resopnse, 'html.parser')
 7     new_list_html = soup.select(".news-list a")
 8     for i in new_list_html:
 9         new_list.append(i['href'])
10     return new_list
11 
# Scrape one news detail page into a dict of its metadata.
def getNewInfo(url):
    """Fetch the article at *url* and return a dict with the keys
    作者/标题/发布时间/来源/点击 (author/title/publish time/source/clicks).

    发布时间 is converted to a ``datetime.datetime``; 点击 is an int fetched
    from the site's counter API. The article id for that API is extracted
    from *url* instead of being hard-coded, so each article gets its own
    count (the blog version hard-coded id=11094 and therefore reported the
    same count for every article).
    """
    res = requests.get(url)
    res.encoding = 'utf-8'  # site pages are UTF-8; requests often mis-guesses

    soup = BeautifulSoup(res.text, 'html.parser')

    # Meta-line fields are separated by two non-breaking spaces ("\xa0\xa0";
    # the paste lost the backslashes, and its split("") would raise ValueError).
    info = soup.select('.show-info')[0].text
    news = {"作者": '', "标题": '', "发布时间": '', "来源": '', "点击": 0}
    for field in info.split('\xa0\xa0'):
        if '发布时间' in field:
            news["发布时间"] = field.split('发布时间:')[1]
        elif ':' in field:
            # NOTE(review): assumes the page uses a half-width ':' between
            # field name and value, as the 发布时间 split above does — confirm
            # against the live page.
            key, _, value = field.partition(':')
            if key in news:
                news[key] = value

    # 获取点击次数: pull the numeric article id out of the detail URL and
    # query the counter API for this specific article.
    news_id = re.search(r'/(\d+)\.html', url).group(1)
    click_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    res2 = requests.get(click_url)
    # Response is a jQuery snippet: $('#hits').html('123'); — regex needs
    # escaped $, ( and \d (the paste dropped the backslashes).
    news['点击'] = int(re.findall(r"\$\('#hits'\)\.html\('(\d+)'\)", res2.text)[0])

    # 时间转换: trailing space in the format matches the scraped value.
    news["发布时间"] = datetime.datetime.strptime(news["发布时间"], '%Y-%m-%d %H:%M:%S ')

    # 获取标题
    news["标题"] = soup.select('.show-title')[0].text
    return news
48 
# Crawl list pages 44-53 (the 10 pages starting at this student's id tail),
# scrape every linked article, and save the results to a CSV file.
url = r'http://news.gzcc.cn/html/xiaoyuanxinwen/'
allNews = []
for page in range(44, 54):
    list_url = '{}{}.html'.format(url, page)
    for new_url in getNewList(list_url):
        allNews.append(getNewInfo(new_url))
        # Polite crawl delay: random 0-3 s between article requests.
        time.sleep(random.random() * 3)
news = DataFrame(allNews)
news.to_csv(r'newsInfo.csv')

运行效果图如下:


原文地址:https://www.cnblogs.com/grate/p/10697018.html