Web crawler

A small script that crawls a paginated blog list (share.zte.com.cn) page by page and collects every post link into a single local HTML table.

from bs4 import BeautifulSoup
import requests
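# Note: BeautifulSoup's "lxml" parser used below needs the lxml package installed,
# even though lxml is not imported here directly.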

url = "http://share.zte.com.cn/tech/jsp/blogList?uid=10021031"
baseUrl="http://share.zte.com.cn"
abUrl="http://share.zte.com.cn/tech/jsp/"

# Pretend to be a regular browser so the server accepts the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
}
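# Optional sketch (not used below; an alternative, not the author's approach): a
# requests.Session could reuse one connection and carry the same headers for every request.
# session = requests.Session()
# session.headers.update(headers)
# f = session.get(url)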
f = requests.get(url, headers=headers)      # GET the page to obtain its HTML content
html = f.text

# Save the raw first page locally for inspection
file = open("1234.html", 'wb')
file.write(str.encode(html))
file.close()

soup = BeautifulSoup(f.content, "lxml")     # parse the page with the lxml parser (f.text would give the same HTML as text)

# Skeleton of the output page; the collected links are placed into its <table>
NewHtml = '''<!DOCTYPE html>
<html>
<head>
</head>
<body>
<table>
</table>
</body>
</html>'''
newSoup = BeautifulSoup(NewHtml, "lxml")
column = 3                                  # number of links per table row
table = newSoup.find('table')
thead = newSoup.new_tag('thead')
table.append(thead)
headRow = newSoup.new_tag('tr')             # header row holding the column labels
thead.append(headRow)
for i in range(column):
    tdt = newSoup.new_tag('td', width="33%")
    tdt.string = "column" + str(i)
    headRow.append(tdt)
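# Note: header cells are conventionally <th> rather than <td> in HTML;
# newSoup.new_tag('th') would work the same way here if a styled header row is preferred.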
    


# The <dl class="abstract_view"> element holds the post list of the page
# (class_ is used because "class" clashes with the Python keyword)
divContent = soup.find('dl', class_='abstract_view')
count = 0                                   # number of links placed into the table so far
tr = None                                   # current (not yet appended) table row

for kk in divContent.find_all('dd'):
    # Start a new row once the previous one holds `column` cells
    if count % column == 0 or tr is None:
        if tr is not None:
            table.append(tr)
        tr = newSoup.new_tag('tr')
    count = count + 1

    # Turn the relative post link into an absolute one and copy it into a new table cell
    hhref = kk.find('a')['href']
    newHref = abUrl + hhref
    kk.find('a')['href'] = newHref
    a = newSoup.new_tag('a', href=newHref)
    a.string = kk.find('a').string
    td = newSoup.new_tag('td')
    td.append(a)
    tr.append(td)

# There should be a better way to do this: the last (possibly partial) row is
# appended to the table only after the paging loop below has finished.



# The pager at the bottom of the page
divPageFoot = soup.find('div', class_='W_pages')

# Find the number of the last page; kept simple, the page URLs are just string-concatenated
firstPage = divPageFoot.find_all('a')[0]
hreff = firstPage["href"]
index = hreff.rfind("=")
hreff = hreff[0:index+1]                    # keep everything up to and including the last "="
baseUrl = baseUrl + hreff                   # page k can now be fetched as baseUrl + str(k)
print(hreff)

lastPage = divPageFoot.find_all('a')[-1]
Pagenum = lastPage.string
print(Pagenum)
pageNumInt = int(Pagenum.strip(".."))       # the last pager link text may be prefixed with "..", strip it
print(pageNumInt)
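# Illustration (hypothetical href format, not taken from the real page): if the first pager
# link were "blogList?uid=10021031&page=1", hreff becomes "blogList?uid=10021031&page="
# and page 2 would then be fetched from baseUrl + "2".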


# Pages 2..last: fetch each page, make its links absolute and add them to the same table
for k in range(2, pageNumInt+1):
    print(k)
    url = baseUrl + str(k)
    print(url)
    ff = requests.get(url, headers=headers)
    soupTemp = BeautifulSoup(ff.content, "lxml")
    divContentTemp = soupTemp.find('dl', class_='abstract_view')
    ddlist = divContentTemp.find_all('dd')
    for kk in ddlist:
        hhref = kk.find('a')['href']
        newHref = abUrl + hhref
        kk.find('a')['href'] = newHref
        divContent.append(kk)               # also collect the <dd> into the first page's soup
        # Same row-building logic as for the first page
        if count % column == 0 or tr is None:
            if tr is not None:
                table.append(tr)
            tr = newSoup.new_tag('tr')
        count = count + 1

        a = newSoup.new_tag('a', href=newHref)
        a.string = kk.find('a').string
        td = newSoup.new_tag('td')
        td.append(a)
        tr.append(td)
    
    
# Flush the last (possibly partial) row into the table
if tr is not None:
    table.append(tr)
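# Sketch of a possibly cleaner row-building approach (hypothetical helper, not called by
# this script): collect all <td> cells first, then slice the list into rows of `per_row`
# cells, so no count/partial-row bookkeeping or trailing flush is needed.
def build_rows(cells, dest_soup, dest_table, per_row=3):
    for i in range(0, len(cells), per_row):
        row = dest_soup.new_tag('tr')
        for cell in cells[i:i + per_row]:
            row.append(cell)
        dest_table.append(row)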


file=open("345.html",'wb')
file.write(str.encode(newSoup.decode()))
file.close()
     


# Also save the merged first-page soup (now containing every <dd>) for reference
file = open("123.html", 'wb')
file.write(str.encode(soup.decode()))
file.close()
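# More idiomatic file writing (sketch): a with-statement closes the file automatically.
# with open("123.html", "wb") as out:
#     out.write(soup.decode().encode())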

Original post: https://www.cnblogs.com/meiguhuaxian/p/12540701.html