python-爬取校园新闻首页的新闻

1.作业代码

import requests
from bs4 import BeautifulSoup
from datetime import datetime


#========================================================================
# 1. Use the requests and BeautifulSoup libraries to scrape the title,
#    link and body text of the news items on the campus-news front page.
#========================================================================
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as news.
res = requests.get(url, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

for news in soup.select('li'):
    titles = news.select('.news-list-title')
    if not titles:
        continue  # this <li> carries no news title, so it is not a news entry

    pert = titles[0].text                                       # title of this news item
    # First child of the '.news-list-info' element; it is printed below as the
    # publish time.
    perdt = news.select('.news-list-info')[0].contents[0].text
    perhref = news.select('a')[0].attrs['href']                 # link to the article page

    # ------------ fetch the article (detail) page ------------
    perdetail = requests.get(perhref, timeout=10)
    perdetail.raise_for_status()
    perdetail.encoding = 'utf-8'
    # soupDetail is read again after the loop, so it stays a module-level name.
    soupDetail = BeautifulSoup(perdetail.text, 'html.parser')
    textContent = soupDetail.select('#content')[0].text

    # ------------ output ------------
    print('题目：', pert)
    print('发布时间：', perdt)
    print('源页面：', perhref)
    print('正文内容：', textContent)
    # Only the first news item is scraped as a demonstration; comment out this
    # break to process every news item on the front page.
    break
#=============================================================
# 2. Parse the info string of the article page and print the
#    fields it contains: publish time, reviewer, author,
#    photographer, source and click count.
#=============================================================
info = soupDetail.select('.show-info')[0].text
catagory = ['发布时间:', '审核：', '作者：', '摄影：', '来源：', '点击：']
timeC = None  # publish time as a datetime; stays None if the field is absent

for label in catagory:
    start = info.find(label)
    if start < 0:
        continue  # this field does not appear in the info string

    # Slice off exactly the label. str.lstrip(label) would instead strip any
    # leading characters drawn from the label's character set, which can eat
    # into the value itself.
    start += len(label)

    # The value runs up to the next label (or the end of the string). Cutting
    # at the first whitespace would drop the time-of-day part of the publish
    # time, which contains a space.
    following = [pos for pos in (info.find(other, start) for other in catagory)
                 if pos >= 0]
    end = min(following) if following else len(info)
    s = info[start:end].strip()

    #=============================================================
    # 3. Convert the publish time from str to datetime.
    #=============================================================
    if label == catagory[0]:
        # Raises ValueError if the page stops using this timestamp layout.
        timeC = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')

    print(label + s)


2.结果截图

中间省略若干新闻报道内容...暂爬取新闻第一页作示例。
若将源代码中 for 循环末尾的 break 注释掉，即可继续爬取首页上其余新闻的页面。