# 爬呀,列表有最大长度的哦 (Note while crawling: lists have a maximum length)

# import requests
# from lxml import etree
# import time,random
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# response=requests.get('http://xh.5156edu.com/kxbs.html',headers=header).content.decode('gbk')
# print(response)
# html = etree.HTML(response)
# print(html.xpath('//table[@id="table1"]//a/text()'))
# print(len(html.xpath('//table[@id="table1"]//a/text()')))
# print(html.xpath('//table[@id="table1"]//a/@href'))
# print(len(html.xpath('//table[@id="table1"]//a/@href')))
# urllist=['http://xh.5156edu.com'+i for i in html.xpath('//table[@id="table1"]//a/@href')]
# print(urllist)
# print(len(urllist))
# print(etree.tostring(html))
# allli=[]
# s=0
# for i in urllist:
# time.sleep(random.uniform(3,4))
# response = requests.get(i, headers=header).content.decode('gbk')
# html = etree.HTML(response)
# nowlist = ['http://xh.5156edu.com' + i for i in html.xpath('//td[ @ width = "8%"]/a/@href')]
# print(len(nowlist),nowlist)
# s+=len(nowlist)
# print(s)
# allli+=nowlist
# for i in nowlist:
# with open('kxzd_urllist',mode='a',encoding='utf-8') as f:
# f.write(i+' ')
# print(len(allli),allli)


# import requests
# import time,random,os
# import re
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# su=0
# de=0
# with open('kxzd_urllist') as f:
# for i in f:
# s=i.strip()
# time.sleep(random.uniform(1, 2))
# try:
# response=requests.get(s,headers=header).content.decode('gbk')
# name = os.path.join(r'D:urllist', re.findall('<title>(.*?)</title>',response)[0] + '.html')
# with open(name,mode='a',encoding='gbk') as f:
# f.write(response)
# with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# su+=1
# except:
# with open('kxzd_urllist_uncomplited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# de+=1
# print(su,de)



# import requests
# import time,random,os
# import re
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# su=0
# de=0
# with open('kxzd_urllist_uncomplited') as f:
# for i in f:
# s=i.strip()
# time.sleep(random.uniform(1, 2))
# try:
# response=requests.get(s,headers=header).content.decode('GB18030')
# name = os.path.join(r'D:urllist_utf-8', re.findall('<title>(.*?)</title>',response)[0] + '.html')
# with open(name, mode='w', encoding='utf-8') as f:
# f.write(response.replace('charset=gb2312', 'charset=utf-8'))
# with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# su+=1
# except:
# with open('kxzd_urllist_stilluncomplited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# de+=1
# print(su,de)


# import requests
# import time,random,os
# import re
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# su=0
# de=0
# with open('kxzd_urllist_stilluncomplited') as f:
# for i in f:
# s=i.strip()
# time.sleep(random.uniform(1, 2))
# try:
# response=requests.get(s,headers=header).content.decode('GB18030',errors='ignore')
# name = os.path.join(r'D:urllist_decode_error', re.findall('<title>(.*?)</title>',response)[0] + '.html')
# with open(name, mode='w', encoding='utf-8') as f:
# f.write(response.replace('charset=gb2312', 'charset=utf-8'))
# with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# su+=1
# except:
# with open('kxzd_urllist_stillstilluncomplited', mode='a', encoding='gbk') as f:
# f.write(s+' ')
# de+=1
# print(su,de)



# from lxml import etree
# import time,random,os
# import re,requests
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# response=requests.get('http://xh.5156edu.com/kx/z90m98j7521.html').content.decode('gbk')
# print(response)
# html = etree.HTML(response)
# print(html)
# s=html.xpath('//div/table/tbody/tr/td[@class="font_14"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
# # print(s)
# m=etree.tostring(s[0],encoding='gbk',method="html")
# print(m.decode('gbk'))



# from lxml import etree
# import time,random,os
# import re,requests
# li=os.listdir(r'C:UserslenovoDesktopurllist_gbk')
# os.chdir(r'C:UserslenovoDesktopurllist_gbk')
# for i in li:
# with open(i,mode='r',encoding='gbk') as f:
# response=f.read()
# print(response)
# try:
# html = etree.HTML(response)
# print(html)
# s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
# if not s:
# print(s)
# m=etree.tostring(s[0],encoding='gbk',method="html")
# print('*'*20)
# print(m.decode('gbk'))
# s=m.decode('gbk')
# name = os.path.join(r'C:UserslenovoDesktopfingbk',i)
# with open(name, mode='w+', encoding='gbk') as f:
# f.write(s)
# except:
# with open('xpath无效文件',mode='a')as f1 :
# f1.write(i+' ')
# os.remove(i)


# from lxml import etree
# import time,random,os
# import re,requests
# li=os.listdir(r'C:UserslenovoDesktopurllist_utf-8')
# os.chdir(r'C:UserslenovoDesktopurllist_utf-8')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# print(response)
# try:
# html = etree.HTML(response)
# print(html)
# s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
# if not s:
# print(s)
# m=etree.tostring(s[0],encoding='utf-8',method="html")
# print('*'*20)
# print(m.decode('utf-8'))
# s=m.decode('utf-8')
# name = os.path.join(r'C:UserslenovoDesktopfinutf8',i)
# with open(name, mode='w+', encoding='utf-8') as f:
# f.write(s)
# except:
# with open('xpath无效文件',mode='a')as f1 :
# f1.write(i+' ')
# os.remove(i)


# from lxml import etree
# import time,random,os
# import re,requests
# li=os.listdir(r'C:UserslenovoDesktopurllist_decode_error')
# os.chdir(r'C:UserslenovoDesktopurllist_decode_error')
# for i in li:
# with open(i,mode='r',encoding='utf-8',errors='ignore') as f:
# response=f.read()
# print(response)
# try:
# html = etree.HTML(response)
# print(html)
# s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
# if not s:
# print(s)
# m=etree.tostring(s[0],encoding='gbk',method="html")
# print('*'*20)
# print(m.decode('gbk'))
# s=m.decode('gbk')
# name = os.path.join(r'C:UserslenovoDesktopfinerror',i)
# with open(name, mode='w+', encoding='gbk') as f:
# f.write(s)
# except:
# with open('xpath无效文件',mode='a')as f1 :
# f1.write(i+' ')
# os.remove(i)


# import os
# li=os.listdir(r'C:UserslenovoDesktopfinutf81')
# os.chdir(r'C:UserslenovoDesktopfinutf81')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# if '<br><b><span class="font_15">基本解释</span></b>' in response:
# response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)

# import os
# li=os.listdir(r'C:UserslenovoDesktopfinerror1')
# os.chdir(r'C:UserslenovoDesktopfinerror1')
# for i in li:
# with open(i,mode='r+',encoding='gbk') as f:
# response=f.read()
# if '<br><b><span class="font_15">基本解释</span></b>' in response:
# response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)

# import os
# li=os.listdir(r'C:UserslenovoDesktopfingbk1')
# os.chdir(r'C:UserslenovoDesktopfingbk1')
# for i in li:
# with open(i,mode='r+',encoding='gbk') as f:
# response=f.read()
# if '<br><b><span class="font_15">基本解释</span></b>' in response:
# response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)





# import os
# li=os.listdir(r'C:UserslenovoDesktopfinutf81')
# os.chdir(r'C:UserslenovoDesktopfinutf81')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# if '/a>' in response:
# response=response.replace('<a','<span').replace('</a>','</span>')
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)
# import os
# li=os.listdir(r'C:UserslenovoDesktopfingbk1')
# os.chdir(r'C:UserslenovoDesktopfingbk1')
# for i in li:
# with open(i,mode='r+',encoding='utf-8') as f:
# response=f.read()
# if '/a>' in response:
# response=response.replace('<a','<span').replace('</a>','</span>')
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)




# import os
# import re
# li=os.listdir(r'C:UserslenovoDesktopfinutf81')
# os.chdir(r'C:UserslenovoDesktopfinutf81')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# with open('urlist_gif',mode='a',encoding='utf-8') as f:
# li=re.findall('src="(.*?)">',response,re.S)
# print(li)
# for i in li:
# f.write('http://xh.5156edu.com'+i+' ')

# import os
# import re
# li=os.listdir(r'C:UserslenovoDesktopfinerror1')
# os.chdir(r'C:UserslenovoDesktopfinerror1')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# with open('urlist_gif',mode='a',encoding='utf-8') as f:
# li=re.findall('src="(.*?)">',response,re.S)
# print(li)
# for i in li:
# f.write('http://xh.5156edu.com'+i+' ')
# import os
# import re
# li=os.listdir(r'C:UserslenovoDesktopfingbk1')
# os.chdir(r'C:UserslenovoDesktopfingbk1')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# with open('urlist_gif',mode='a',encoding='utf-8') as f:
# li=re.findall('src="(.*?)">',response,re.S)
# print(li)
# for i in li:
# f.write('http://xh.5156edu.com'+i+' ')
# li=[]
# with open(r'C:UserslenovoDesktopfinutf81urlist_gif', mode='r',encoding='utf-8') as f:
# for i in f:
# s=f.readline().strip(' ')
# # print(s)
# li.append(s.strip())
# with open(r'C:UserslenovoDesktopfinutf81urlist_gif', mode='w',encoding='utf-8') as f:
# for i in set(li):
# f.write('http://xh.5156edu.com' + i + ' ')
#
# li=[]
# with open(r'C:UserslenovoDesktopfinerror1urlist_gif', mode='r',encoding='utf-8') as f:
# for i in f:
# s=f.readline().strip()
# # print(s)
# li.append(s.strip())
# with open(r'C:UserslenovoDesktopfinerror1urlist_gif', mode='w',encoding='utf-8') as f:
# for i in set(li):
# f.write( i + ' ')

# li=[]
# with open(r'C:UserslenovoDesktopfingbk1fingbk1urlist_gif', mode='r',encoding='utf-8') as f:
# for i in f:
# s=f.readline().strip()
# # print(s)
# li.append(s.strip())
# with open(r'C:UserslenovoDesktopfingbk1fingbk1urlist_gif', mode='w',encoding='utf-8') as f:
# for i in set(li):
# f.write( i + ' ')




# li=[]
# with open(r'C:UserslenovoDesktopfinutf81urlist_gif', mode='r',encoding='utf-8') as f:
# for i in f:
# s=f.readline().strip(' ')
# # print(s)
# li.append(s.strip())
# with open(r'C:UserslenovoDesktopfinutf81urlist_gif', mode='w',encoding='utf-8') as f:
# for i in set(li):
# f.write( i + ' ')


# import os
# import re
# li=os.listdir(r'C:UserslenovoDesktopallkxzd')
# os.chdir(r'C:UserslenovoDesktopallkxzd')
# for i in li:
# with open(i,mode='r',encoding='utf-8') as f:
# response=f.read()
# with open('urlist_gif',mode='a',encoding='utf-8') as f:
# li=re.findall('src="(.*?)">',response,re.S)
# print(li)
# for i in li:
# f.write('http://xh.5156edu.com'+i+' ')

# li=[]
# with open(r'C:UserslenovoDesktopallkxzdurlist_gif', mode='r',encoding='utf-8') as f:
# for i in f:
# s=f.readline().strip(' ')
# # print(s)
# li.append(s.strip())
# with open(r'C:UserslenovoDesktopallkxzd ew_urlist_gif', mode='w',encoding='utf-8') as f:
# for i in set(li):
# f.write( i + ' ')
#
# import os
# li=os.listdir(r'C:UserslenovoDesktopallkxzd')
# os.chdir(r'C:UserslenovoDesktopallkxzd')
# for i in li:
# with open(i,mode='r+',encoding='utf-8') as f:
# response=f.read()
# if 'src="/kx_images%5' in response:
# response=response.replace('src="/kx_images%5','src="')
# with open(i,mode='w',encoding='utf-8') as f:
# f.write(response)

# import requests,os
# import time,random
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# with open(r'C:UserslenovoDesktopurlist_gif',mode='r')as f:
# for i in f:
# time.sleep(random.randint(2,3))
# url=i.strip()
# response=requests.get(url,headers=header).content
# name=os.path.join(r'C:UserslenovoDesktopallkxzd',url.split('http://xh.5156edu.com/kx_images%5C')[1])
# with open(name,mode='wb') as f:
# f.write(response)



# import requests,os
# import time,random
# header={
# 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
# }
# li=os.listdir(r'C:UserslenovoDesktopdownload')
# lifin=['http://xh.5156edu.com/kx_images%5C'+i for i in li]
# print(len(lifin))
# with open(r'C:UserslenovoDesktopurlist_gif',mode='r')as f:
# for i in f:
# url=i.strip()
# if url not in lifin:
# time.sleep(random.randint(2, 3))
# response=requests.get(url,headers=header).content
# name=os.path.join(r'C:UserslenovoDesktopallkxzd',url.split('http://xh.5156edu.com/kx_images%5C')[1])
# with open(name,mode='wb') as f:
# f.write(response)
# else:
# print('done')
# Original source (原文地址): https://www.cnblogs.com/diracy/p/14209772.html