Python crawler

A small image crawler: it parses a listing page for gallery links, extracts each gallery's <img> URLs with a regular expression, downloads the images to numbered .jpg files, and records every visited gallery URL in a MySQL table.

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
 
from urllib.request import Request, urlopen
import re
import time
import mysql.connector

def saveDownedurl(downedurl):
    """Record a visited gallery URL in the downedurl table."""
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    cursor = conn.cursor()
    # Parameterized query: the URL is passed as a bound value, not interpolated.
    sql = "INSERT INTO downedurl (picurl) VALUES (%s)"
    cursor.execute(sql, [downedurl])
    conn.commit()
    print(cursor.rowcount, "record inserted.")
    conn.close()
    # sql = "INSERT INTO downedurl (picurl) VALUES (url)"
    # cursor.execute(sql)
    # conn.commit() 
    # print(cursor.rowcount, "记录插入成功。")
    # conn.close()
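For reference, the script assumes a local MySQL database named picurl containing a downedurl table with a picurl column; only those names appear in the code, so the column type below is an assumption. A minimal one-time setup sketch:

import mysql.connector

# Create the table the crawler writes to; VARCHAR(512) is an assumed length.
conn = mysql.connector.connect(user='root', password='694521', database='picurl')
cursor = conn.cursor()
cursor.execute(
    "CREATE TABLE IF NOT EXISTS downedurl ("
    " id INT AUTO_INCREMENT PRIMARY KEY,"
    " picurl VARCHAR(512) NOT NULL)"
)
conn.commit()
conn.close()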


def download_pic(pic_url, root_url, down_times):
    """Download one image to <counter>.jpg and return the incremented counter."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        # Many image hosts check the Referer header, so send the gallery page URL.
        'Referer': root_url
    }
    down_path = str(down_times) + '.jpg'
    print(down_path)
    request = Request(pic_url, headers=headers)
    data = urlopen(request).read()
    with open(down_path, 'wb') as f:
        f.write(data)  # the with block closes the file; no explicit close() needed
    return down_times + 1
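download_pic has no error handling, so a single failed or timed-out request crashes the whole crawl. A hedged variant (a sketch, not part of the original script) would catch urllib.error exceptions and skip the offending image:

from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def download_pic_safe(pic_url, root_url, down_times):
    """Like download_pic, but skip images that fail instead of crashing."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        'Referer': root_url
    }
    try:
        data = urlopen(Request(pic_url, headers=headers), timeout=10).read()
    except (HTTPError, URLError) as e:
        print('skipping', pic_url, e)
        return down_times  # counter unchanged; nothing was written
    with open(str(down_times) + '.jpg', 'wb') as f:
        f.write(data)
    return down_times + 1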




def jiexi_rootPic_url(next_rootUrl, down_times):
    """Fetch one gallery page, extract every <img> src, and download each image."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    downtime = down_times
    request_url = Request(next_rootUrl, headers=headers)
    response = urlopen(request_url).read().decode("utf-8")
    pattern = re.compile('<img src="(.*?)"', re.IGNORECASE)
    pic_path = pattern.findall(response)
    for i in pic_path:
        print('download_prepare')
        downtime = download_pic(i, next_rootUrl, downtime)
        print(i)
    time.sleep(2)  # pause between gallery pages to go easy on the server
    return downtime
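The regex above returns each src attribute verbatim, so a relative path such as /pics/1.jpg would fail inside download_pic. If the site ever serves relative URLs, a hypothetical normalization helper (not in the original) using urllib.parse.urljoin would resolve them against the page URL:

from urllib.parse import urljoin

def normalize_src(page_url, src):
    """Resolve a possibly relative img src against the page it came from.
    Absolute URLs pass through unchanged."""
    return urljoin(page_url, src)

# e.g. normalize_src('http://mmff30.com/rnyy123.html', '/pics/1.jpg')
# -> 'http://mmff30.com/pics/1.jpg'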


def jiexi_url(root_url, down_times):
    """Fetch the listing page, find every /rnyy...html gallery link, and crawl each."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    downtime = down_times
    request_url = Request(root_url, headers=headers)
    html = urlopen(request_url).read().decode("utf-8")
    # Escape the dot so ".html" is matched literally.
    pattern = re.compile(r'/rnyy(.*?)\.html', re.IGNORECASE)
    all_next_root = pattern.findall(html)
    for i in all_next_root:
        path = 'http://mmff30.com/rnyy' + i + '.html'
        print(path)
        saveDownedurl(path)
        downtime = jiexi_rootPic_url(path, downtime)
    return downtime
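saveDownedurl writes every visited gallery URL to MySQL, but nothing in the script ever reads the table back. A hedged sketch of the complementary lookup (a hypothetical helper, not in the original) would let a restarted crawl skip galleries it has already recorded:

import mysql.connector

def already_downed(url):
    """Return True if this gallery URL is already recorded in downedurl."""
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    cursor = conn.cursor()
    cursor.execute("SELECT 1 FROM downedurl WHERE picurl = %s LIMIT 1", [url])
    found = cursor.fetchone() is not None
    conn.close()
    return found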




if __name__ == '__main__':
    # Start from listing page rwmy_9_3, numbering downloads from 4000.
    jiexi_url('http://mmff30.com/rwmy_9_3.html', 4000)
Original article: https://www.cnblogs.com/ytCui/p/13055992.html