Python3.6爬虫+Django2.0+MySQL --数据爬取

1.下载对应版本的python mysql 模块 我的是:pymssql-2.2.0.dev0-cp36-cp36m-win_amd64.whl(注意:pymssql 是 SQL Server 的驱动;下文代码 import 的 MySQLdb 由 mysqlclient 包提供,需要另外安装与 Python 3.6 对应的版本)

2.手动创建table  

-- Target table of the crawler: one row per record of the ugirls data feed.
-- Every feed field is stored as text; `id` is the only generated column.
create table grilsbase
(
    id                    int primary key auto_increment,
    name                  varchar(50),
    height                varchar(50),
    bwh                   varchar(50),
    title                 varchar(100),
    img_upload            varchar(100),
    pc_img_upload         varchar(100),
    resource_id           varchar(50),
    totals                varchar(50),
    recommend_id          varchar(50),
    date                  varchar(50),
    headimg_upload        varchar(50),
    show_datetime         varchar(50),
    client_show_datetime  varchar(50),
    video_duration        varchar(50),
    free_select           varchar(50),
    trial_time            varchar(50),
    viewtimes             varchar(50),
    coop_customselect_654 varchar(50),
    coop_id               varchar(50),
    tag_class             varchar(50),
    tag_name              varchar(50),
    playerid              varchar(50),
    block_detailid        varchar(50),
    type                  varchar(50),
    istop                 varchar(50)
)

3.实现爬虫代码 

   导入模块:requests, os, json, re, MySQLdb, threading

   流程:获取数据=>分析数据=>解析数据=>持久化保存

  1 #coding:utf-8
  2 import  requests
  3 import os
  4 import json
  5 import re
  6 import MySQLdb
  7 import threading
  8 #获取数据url
  9 gilsUrl='http://act.vip.xunlei.com/ugirls/js/ugirlsdata.js'
 10 gilsDetailUrl='http://meitu.xunlei.com/detail.html'
 11 gilsImgUrl='http://data.meitu.xunlei.com/data/image/%s/%s'
 12 executor = threading.BoundedSemaphore(10)
 13 regex=re.compile('/([^/]*?.jpg)$')
 14 regexhead=re.compile('/([^/]*?).jpg$')
 15 class MySQL:
 16     def __init__(self,host,user,pwd,db):
 17         self.host=host
 18         self.user=user
 19         self.db=db
 20         self.pwd=pwd
 21     def GetConnect(self):
 22         if not self.db:
 23             raise(NameError,'没有目标数据库')
 24         self.connect=MySQLdb.connect(host=self.host,user=self.user,password=self.pwd,database=self.db,port=3306,charset='utf8')
 25         cur=self.connect.cursor()
 26         if not cur:
 27             raise(NameError,'数据库访问失败')
 28         else:
 29             return cur
 30     def ExecSql(self,sql):
 31          cur=self.GetConnect()
 32          cur.execute(sql)
 33          self.connect.commit()
 34          self.connect.close()
 35     def ExecQuery(self,sql):
 36         cur=self.GetConnect()
 37         cur.execute(sql)
 38         resList = cur.fetchall()
 39         self.connect.close()
 40         return resList   
 41 
 42 def getGirlsData():
 43     regex=re.compile("var ugirlsData=(.+)")
 44     r=requests.get(gilsUrl)
 45     jsond=regex.findall(r.text)
 46     with open('ugirlsdata.json','w+',encoding='utf-8') as f:
 47          f.write(jsond[0])
 48     #print('写入json成功')
 49     return json.loads(jsond[0])
 50 
 51 
 52 def getImgName(imgurl):
 53     if(imgurl==''):
 54         return ''
 55     m=regex.findall(imgurl)
 56     if m is None:
 57         return ''
 58     else:
 59         return  m[0] if len(m)>0 else ''
 60 
 61 def getImgNameHead(imgurl):
 62     if(imgurl==''):
 63         return ''
 64     m=regexhead.findall(imgurl)
 65     if m is None:
 66         return ''
 67     else:
 68         return  m[0] if len(m)>0 else ''
 69 
 70 def WriteDB(jsdata):
 71     ms = MySQL(host="192.168.0.108", user="lin", pwd="12345678", db="grils")
 72     for data in jsdata:
 73         sql="insert into grilsbase(
 74         name,height,bwh,title,img_upload,pc_img_upload,resource_id,totals,recommend_id,
 75         date,headimg_upload,show_datetime,client_show_datetime,video_duration,free_select,trial_time,
 76         viewtimes,coop_customselect_654,coop_id,tag_class,tag_name,playerid,block_detailid,type,istop)
 77         values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % 
 78         (data['name'],data['height'],data['bwh'],data['title'],getImgName(data.get('img_upload','')),data['pc_img_upload'],data['resource_id'],data["totals"],data["recommend_id"], 
 79         data['date'],getImgName(data.get("headimg_upload",'')),data["show_datetime"],data["client_show_datetime"],data["video_duration"],data["free_select"],data["trial_time"], 
 80         data['viewtimes'],data['coop_customselect_654'],data['coop_id'],data.get('tag_class',''),data.get('tag_name',''),data.get('playerid',''),data['block_detailid'],data['type'],data['istop'])
 81         #print(sql)
 82         ms.ExecSql(sql)
 83         print('完成'+data['name']+'数据更新...')
 84         DownImg(data['name'],data["totals"],data['resource_id'],data["headimg_upload"],data["img_upload"])
 85         
 86 
 87 
 88 def DownImg(name,totals,resource_id,headimg_upload,img_upload):
 89     path=creatFile(resource_id)
 90     if headimg_upload.strip()!='':
 91         #os.remove('./pic/'+resource_id+'/'+getImgName(headimg_upload)+'.jpg')
 92         DownImgRun(headimg_upload,path,getImgNameHead(headimg_upload))
 93     if img_upload.strip()!='':
 94         #os.remove('./pic/'+resource_id+'/'+getImgName(img_upload)+'.jpg')
 95         DownImgRun(img_upload,path,getImgNameHead(img_upload))
 96     #print('正在下载'+name+'图片')
 97     
 98     for i in range(1,int(totals)+1):
 99         url=gilsImgUrl%(resource_id,str(i)+'.jpg')
100         DownImgRun(url,path,i)
101         #t=threading.Thread(target=DownImgRun,args={url,path,i})
102         #t.start()
103         #t.join()
104     
105         
106 
def DownImgRun(url, path, i, timeout=30):
    """Download *url* and save it as ``<path>/<i>.jpg``.

    Args:
        url: image URL to fetch.
        path: existing directory the file is written into.
        i: base name of the file (any value; converted with ``str``).
        timeout: seconds to wait for the server before giving up.

    Returns:
        True if the server answered 200 and the file was written, otherwise
        False (nothing is written for other status codes).
    """
    # Without a timeout a single stalled download would hang the whole crawl.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    with open(os.path.join(path, str(i) + '.jpg'), 'wb') as fimg:
        fimg.write(response.content)
    return True
114             
115 
116 
def creatFile(dirname):
    """Ensure the directory ``./pic/<dirname>`` exists and return its path.

    Args:
        dirname: name of the sub-directory below ``./pic``.

    Returns:
        The relative path ``'./pic/' + dirname``.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = './pic/' + dirname
    # A single call replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)
    return path
124 
125 
if __name__ == '__main__':
    # Fetch the feed, then store every record and download its images.
    WriteDB(getGirlsData())

4.运行效果 和结果

 

原文地址:https://www.cnblogs.com/linsu/p/8606916.html