下面是小编工作中爬取高企云数据的源代码,并增加了验证码识别代码:
# -*- coding:utf-8 -*-
"""Scraper for the hights.cn document library with OCR-assisted login.

The module drives a Chrome browser through the site's login form (reading
the captcha image with Tesseract), queries the JSON listing endpoints with
``requests``, and opens the download links of the listed documents in the
logged-in browser.
"""
import json
import re
import time
import urllib

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

import pytesseract
import requests
from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By

from connect_monogo import save  # Python singleton design pattern

BASE_URL = 'http://www.hights.cn'
LOGIN_URL = BASE_URL + '/beetl/login/toLogin.html'
# The page number is passed as ``currentPage``.  The separator must be a
# literal ``&``: ``&curren`` is an HTML entity and is easily mangled into
# the currency sign when this URL is pasted through HTML.
LIST_URL = (BASE_URL + '/beetl/library/list.do'
            '?showCount=15&libtype={libtype}&currentPage={page}')
DOWNLOAD_URL = (BASE_URL + '/beetl/mydocument/download'
                '?targetId={target_id}'
                '&clazzName=com.fh.entity.system.Library'
                '&filePath={file_path}')

# Matches the captcha <img> tag in the login page source.  The trailing
# slash is optional because a serialized DOM may omit it.
CAPTCHA_SRC_RE = re.compile(r'<img id="codeImg" alt="" src="(.*?)"\s*/?>')
CAPTCHA_FILE = 'code.jpg'

# NOTE(review): the request headers the site expects are not known from this
# file; a minimal browser-like User-Agent is assumed -- confirm.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a dead connection

# NOTE(review): credentials should come from configuration or the
# environment rather than source control; kept as defaults for
# backward compatibility.
DEFAULT_PHONE = '15766264244'
DEFAULT_PASSWORD = '123456789'


class Login(object):
    """Log in to hights.cn through a browser and read its library listings.

    Args:
        phone: Account phone number typed into the login form.
        password: Account password typed into the login form.
    """

    def __init__(self, phone=DEFAULT_PHONE, password=DEFAULT_PASSWORD):
        self.phone = phone
        self.password = password
        self.s = requests.session()
        self.driver = webdriver.Chrome()

    def close(self):
        """Quit the browser and close the HTTP session."""
        self.driver.quit()
        self.s.close()

    def _login(self):
        """Open the login page, OCR the captcha and submit the login form.

        Returns:
            None when the form was submitted, otherwise a message string
            explaining why the captcha could not be obtained.
        """
        self.driver.get(LOGIN_URL)
        matches = CAPTCHA_SRC_RE.findall(self.driver.page_source)
        if not matches:
            return '验证码为空号'

        captcha_url = BASE_URL + matches[0]
        print(captcha_url)
        try:
            urlretrieve(captcha_url, CAPTCHA_FILE)
        except IOError:
            # Stop here: without a fresh image there is nothing valid to
            # OCR (a stale file from an earlier run would be misleading).
            print('验证码链接错误')
            return '验证码链接错误'

        # NOTE(review): the image is fetched outside the browser session;
        # if the server ties the captcha to the session cookie, the OCR'd
        # text will not match what the browser shows -- confirm.
        with Image.open(CAPTCHA_FILE) as image:
            recognised = pytesseract.image_to_string(image)
        # Drop all whitespace: a trailing newline sent through send_keys
        # would act as the Enter key.
        captcha = ''.join(recognised.split())

        self.driver.find_element(By.NAME, 'phone').send_keys(self.phone)
        self.driver.find_element(By.NAME, 'password').send_keys(self.password)
        time.sleep(2)
        self.driver.find_element(By.NAME, 'code').send_keys(captcha)
        time.sleep(5)
        self.driver.find_element(By.ID, 'login_btn').click()
        return None

    def get_code(self):
        """Log in using the OCR'd captcha.

        Returns:
            None after the form was submitted, or a message string when the
            captcha could not be located or downloaded.
        """
        error = self._login()
        if error is not None:
            return error
        # The message is printed once the form is submitted; the result of
        # the login itself is not checked.
        print('登录成功')
        # NOTE(review): presumably keeps the browser open after login --
        # confirm whether this long pause is still needed.
        time.sleep(55)

    def _get(self, url):
        """GET ``url`` through the shared session with the default headers."""
        return self.s.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    def _fetch_library(self, libtype, pages, verbose=False):
        """Collect the ``data`` payload of every listing page.

        Args:
            libtype: Value of the ``libtype`` query parameter.
            pages: Iterable of page numbers to request.
            verbose: When true, print each URL before requesting it.

        Returns:
            A list with the records of all requested pages, in page order.
        """
        records = []
        for page in pages:
            url = LIST_URL.format(libtype=libtype, page=page)
            if verbose:
                print(url)
            data = json.loads(self._get(url).text)['data']
            # Assumes ``data`` is a list of records (callers index each item
            # by key); anything else is kept as a single entry.
            if isinstance(data, list):
                records.extend(data)
            else:
                records.append(data)
        return records

    def Public_list(self):
        """Return the raw response text of the company directory, page 1."""
        return self._get(LIST_URL.format(libtype=0, page=1)).text

    def Template_model(self):
        """Return the template sample records (libtype 2, pages 1-2)."""
        return self._fetch_library(2, range(1, 3))

    def Government_documents(self):
        """Return the government document records (libtype 5, pages 1-9)."""
        return self._fetch_library(5, range(1, 10))

    def Policy_interpretation(self):
        """Return the policy interpretation records (libtype 4, pages 1-14).

        Each requested URL is printed.
        """
        return self._fetch_library(4, range(1, 15), verbose=True)

    def Other_information(self):
        """Return the other-materials records (libtype 3, pages 1-2)."""
        return self._fetch_library(3, range(1, 3))

    def jiexi_data(self):
        """Log in and open the download link of every other-materials record.

        Returns:
            None after the links were opened, or a message string when the
            login captcha could not be obtained.
        """
        pdf_list = [
            DOWNLOAD_URL.format(target_id=item['id'],
                                file_path=item['pdfpath'])
            for item in self.Other_information()
        ]
        print(pdf_list)

        error = self._login()
        if error is not None:
            return error
        time.sleep(4)

        for pdf_url in pdf_list:
            self.driver.get(pdf_url)
        print('下载完成。。。。')
        # NOTE(review): presumably gives the browser time to finish the
        # downloads before the process exits -- confirm.
        time.sleep(400)


def main():
    """Log in once, then release the browser."""
    r = Login()
    try:
        r.get_code()
    finally:
        r.close()


if __name__ == '__main__':
    main()