"""Scrape an OFweek article (first page plus one follow-up page) to disk.

The article text is written to a UTF-8 ``.txt`` file named after the first
characters of the title, and every image found in the article body is saved
as ``0.jpg``, ``1.jpg``, ...
"""

import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Article that is scraped when body() is called without arguments.
ARTICLE_URL = "https://gongkong.ofweek.com/2019-04/ART-310058-11000-30318953.html"

# User-Agent:Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Output locations, kept exactly as in the original script.
# NOTE(review): "F:day08..." has no backslash after the drive letter, so on
# Windows it is a drive-relative path; confirm the intended directory.
TEXT_PATH_TEMPLATE = r"F:day08人民日报微信图片\%s.txt"
IMAGE_PATH_TEMPLATE = r"F:day08人民日报微信图片\%s.jpg"

_TITLE_XPATH = '//div[@class="artical"]/p/text()'
_BODY_XPATH = '//div[@id="articleC"]/p//text()'
_IMAGE_XPATH = '//div[@id="articleC"]/p/img/@src'
_NEXT_PAGE_XPATH = '//div[@class="page"]/span/a/@href'

# Characters that Windows does not allow in file names.
_FILENAME_FORBIDDEN = re.compile(r'[\\/:*?"<>|]')


def _fetch_tree(page_url):
    """Download *page_url*, decode the response as GBK and parse it with lxml."""
    response = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    print(response)
    response.encoding = "gbk"
    return etree.HTML(response.text)


def _title_to_filename(title_parts, max_len=10):
    """Build a file-name stem from the title text nodes.

    The stripped parts are joined, characters that are illegal in Windows
    file names are replaced by ``_`` and the result is cut to *max_len*
    characters. ``"untitled"`` is returned when no title text was found.
    """
    title = "".join(part.strip() for part in title_parts)
    return _FILENAME_FORBIDDEN.sub("_", title)[:max_len] or "untitled"


def body(url=ARTICLE_URL):
    """Scrape the article at *url* and save its text and images.

    Args:
        url: Address of the article's first page. Defaults to ARTICLE_URL.

    Side effects:
        Prints the intermediate results, writes one ``.txt`` file via
        TEXT_PATH_TEMPLATE and one ``.jpg`` file per image via
        IMAGE_PATH_TEMPLATE.

    Raises:
        requests.RequestException: if a request fails or times out.
        OSError: if an output file cannot be written.
    """
    first_page = _fetch_tree(url)
    title_parts = first_page.xpath(_TITLE_XPATH)
    print(title_parts)
    paragraphs = first_page.xpath(_BODY_XPATH)
    print(paragraphs)

    # Extract images. src values are resolved against the page they came
    # from so that relative links can be downloaded as well.
    image_urls = [urljoin(url, src) for src in first_page.xpath(_IMAGE_XPATH)]
    print(image_urls)

    # Pagination: follow the first page link, if the article has one.
    next_hrefs = first_page.xpath(_NEXT_PAGE_XPATH)
    if next_hrefs:
        print(next_hrefs[0])
        # Resolve the link against the current page instead of a hard-coded
        # "2019-05/" prefix, which did not match the article's own directory.
        next_url = urljoin(url, next_hrefs[0])
        print(next_url)
        second_page = _fetch_tree(next_url)
        second_paragraphs = second_page.xpath(_BODY_XPATH)
        print(second_paragraphs)
        paragraphs = paragraphs + second_paragraphs

        # Extract images from the second page.
        second_images = [
            urljoin(next_url, src) for src in second_page.xpath(_IMAGE_XPATH)
        ]
        print(second_images)
        image_urls = image_urls + second_images

    # File name: the first characters of the title.
    file_stem = _title_to_filename(title_parts)
    print(file_stem)

    # Indent every text node with two ideographic spaces (U+3000) and end it
    # with a newline. The original wrote the literal text "u3000" and a plain
    # space, which contradicted its own "line breaks and spaces" comment.
    with open(TEXT_PATH_TEMPLATE % file_stem, "w", encoding="utf-8") as f:
        f.writelines(
            "\u3000\u3000" + text + "\n" for text in title_parts + paragraphs
        )

    # Save images, numbered in the order they appear in the article.
    print(image_urls)
    print(len(image_urls))
    for index, image_url in enumerate(image_urls):
        image = requests.get(image_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        with open(IMAGE_PATH_TEMPLATE % index, "wb") as f:
            f.write(image.content)


# Runs at import time as well, exactly like the original script.
body()