爬取化工单个网页

Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 08:53:08 2016
采集化工标准补录项目
@author: Administrator
"""
import requests,bs4,openpyxl
from openpyxl.cell import get_column_letter,column_index_from_string

# --- Workbook setup -------------------------------------------------------
# The workbook must already exist on disk; load_workbook() opens it and the
# scraped values are written into row 2 of the sheet named below.
excelName = "test.xlsx"
sheetName = "Sheet1"
wb1 = openpyxl.load_workbook(excelName)
# Index the workbook by sheet title; Workbook.get_sheet_by_name() is
# deprecated in favour of this form.
sheet = wb1[sheetName]
start = 1  # not read anywhere in the visible script; kept as-is

# Column letters for the three scraped fields.
columnname1 = "A"  # standard number
columnname2 = "B"  # issuing department
columnname3 = "C"  # implementation date
# Numeric indexes of the columns above (not read in the visible script).
columnname1_index = column_index_from_string(columnname1)
columnname2_index = column_index_from_string(columnname2)
columnname3_index = column_index_from_string(columnname3)

# Destination cells for the single page scraped by this script.
cell1 = sheet['A2']
cell2 = sheet['B2']
cell3 = sheet['C2']

# Label prefixes that precede each value in the page's <h5> text; they are
# removed before the value is stored in the sheet.
del_content1 = "标准编号:"
del_content2 = "发布部门:"
del_content3 = "实施日期:"

# --- Fetch and parse the page ---------------------------------------------
webpage = "http://www.bzwxw.com/html/2016/1988_0116/9.html"
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(webpage, timeout=30)
# Fail loudly on 4xx/5xx responses. The original bare expression
# `requests.codes.ok` only named the constant and never checked the response.
res.raise_for_status()
# Set the encoding before res.text is read so the body is decoded as GBK.
res.encoding = 'gbk'

# res.text is already a decoded str, so no from_encoding argument is passed
# to BeautifulSoup here.
soup1 = bs4.BeautifulSoup(res.text, "lxml")

# select() returns a list of matching tags; guard against a page that has no
# <title> instead of raising IndexError on title[0].
title = soup1.select('title')
title_content = title[0].getText() if title else ""

# Every <h5> element on the page; the fields of interest (standard number,
# issuing department, implementation date) are looked up in their text.
StandardCode = soup1.select('h5')

# Plain-text content of each <h5> tag, in document order.
content_list = [tag.getText() for tag in StandardCode]


def _remove_label(text, label):
    """Return the part of ``text`` that follows ``label``.

    ``str.strip(label)`` treats ``label`` as a set of characters and trims
    any of them from both ends, so it can also eat leading or trailing
    characters of the value itself. Splitting on the label removes exactly
    the label and nothing else. ``text`` is returned unchanged when it does
    not contain ``label``.
    """
    _, found, value = text.partition(label)
    return value if found else text


# (keyword that identifies the field, label to remove, destination cell)
field_targets = (
    ("标准编号", del_content1, cell1),
    ("发布部门", del_content2, cell2),
    ("实施日期", del_content3, cell3),
)

for text in content_list:
    # Every keyword is tested against the untouched heading text, so removing
    # one label can never influence the check for another field.
    for keyword, label, cell in field_targets:
        if keyword in text:
            cell.value = _remove_label(text, label)

# Persist the filled-in cells back to the same workbook file.
wb1.save(excelName)

原文地址:https://www.cnblogs.com/webRobot/p/5278718.html