食品伙伴网 (foodmate.net) Crawler

A conventional two-stage crawler: the overview script collects detail-page links into MySQL, and the detail script follows each link and downloads the PDF file.

Gitee repository: https://gitee.com/MarkPolaris/food_partnership_network/tree/master
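Both scripts read and write a MySQL table named gly in the app_mark database. The original post does not include the DDL; the sketch below is inferred from the INSERT statement in the overview script, so the column types and the UNIQUE index on hkey are assumptions, not the author's schema.

import pymysql

# Hypothetical one-off setup script: table name and columns come from the
# crawler's INSERT; the types and the UNIQUE index on hkey are assumptions.
DDL = '''
create table if not exists gly (
    id       int auto_increment primary key,
    link     varchar(512) not null,   -- detail-page URL
    hkey     char(32) not null,       -- MD5 of link, used as a dedup key
    tag      char(1) not null,        -- '0' = pending, '1' = processed
    channel  varchar(64),
    sitename varchar(64),
    lasttime datetime,
    unique key uq_hkey (hkey)
) default charset = utf8mb4
'''

con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root',
                      passwd='123456', charset='utf8mb4')
with con.cursor() as cur:
    cur.execute(DDL)
con.commit()
con.close()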

Overview page

import requests
import re
import pymysql
import hashlib
import datetime


class GLY(object):
    def __init__(self):
        # list page of food-additive standards on down.foodmate.net
        self.url = 'http://down.foodmate.net/special/standard/8.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'

    def get_url(self):
        response = requests.get(self.url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        # extract the detail-page links from the <A title=... href="..."> anchors
        urls = re.findall('<A title=.*?href="(.*?)"', html)
        # deduplicate
        urls = set(urls)
        for url in urls:
            # MD5 of the link serves as the dedup key
            hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
            tag = '0'  # '0' = not yet processed by the detail-page crawler
            channel = '食品添加剂标准'
            sitename = '食品伙伴网'
            lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            list_data = [url, hkey, tag, channel, sitename, lasttime]
            self.save_url(list_data)
        print(len(urls))

    def save_url(self, list_data):
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into gly(link, hkey, tag, channel, sitename, lasttime) values (%s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    gly = GLY()
    gly.get_url()
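Re-running the overview script inserts the same links again, since the plain INSERT enforces no uniqueness. A minimal hardening sketch, assuming the UNIQUE index on hkey from the schema sketch above: INSERT IGNORE makes repeated runs idempotent, because MySQL silently skips rows whose hkey already exists.

import pymysql

def save_url_idempotent(con, list_data):
    # same columns as save_url, but INSERT IGNORE skips rows whose hkey
    # already exists (requires a UNIQUE index on hkey)
    sql = ('insert ignore into gly(link, hkey, tag, channel, sitename, lasttime) '
           'values (%s, %s, %s, %s, %s, %s)')
    with con.cursor() as cur:
        cur.execute(sql, list_data)
        inserted = cur.rowcount == 1  # 0 means the link was already stored
    con.commit()
    return inserted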

Detail page

import pymysql
import re
import datetime
import requests
from multiprocessing.dummy import Pool as ThreadPool  # thread pool, not processes


class XLY(object):
    def __init__(self):
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        self.start = datetime.datetime.now()

    def get_urls(self):
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag = "0" and sitename = "食品伙伴网"'
        # mark the fetched rows as processed so a re-run skips them; note this
        # happens before the downloads run, so failed downloads are not retried
        after_sql = 'update gly set tag = "1" where tag = "0" and sitename = "食品伙伴网"'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            cur.execute(after_sql)
        except Exception as e:
            con.rollback()
            print('error~', e)
            results = None
        else:
            con.commit()
        cur.close()
        con.close()
        return results

    def download(self, url):
        url = url[0]  # each row from fetchall() is a one-element tuple
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        # the PDF download link sits in the <a class="telecom" href="..."> anchor
        down_url = re.findall('<a class="telecom" href="(.*?)">', html, re.S)
        try:
            down_url = down_url[0]
            r = requests.get(down_url, headers=self.headers)
            # name the file after the auth token in the download URL
            file_name = 'D:/1_work/python采集/PDF/' + down_url.split('auth=')[-1] + '.pdf'
            with open(file_name, 'wb') as pdf:
                # iter_content() without a chunk_size yields one byte at a time
                for content in r.iter_content():
                    pdf.write(content)
            print(down_url)
        except Exception as e:
            print('error_url:{}; exception: {}'.format(url, e))


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_urls()
    if urls:
        # download with a pool of 20 threads
        pool = ThreadPool(20)
        pool.map(xly.download, urls)
        pool.close()
        pool.join()
    end = datetime.datetime.now()
    print('耗时: {}'.format(end - xly.start))
    # single-threaded alternative for debugging:
    # for url in urls:
    #     xly.download(url)
    #     break
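As written, download() fetches the whole response into memory and then iter_content() hands it back one byte at a time. A streamed variant is gentler on memory for large PDFs and adds a timeout plus a status check; this is a sketch of an alternative, not the original code, and download_pdf is a hypothetical helper name.

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}

def download_pdf(down_url, file_name, timeout=30):
    # stream=True defers reading the body; iter_content then pulls it
    # in 8 KB chunks instead of holding the whole PDF in memory
    with requests.get(down_url, headers=HEADERS, stream=True, timeout=timeout) as r:
        r.raise_for_status()  # fail fast on 4xx/5xx instead of saving an error page
        with open(file_name, 'wb') as pdf:
            for chunk in r.iter_content(chunk_size=8192):
                pdf.write(chunk)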
Original post: https://www.cnblogs.com/MC-Curry/p/10561068.html