Python crawler

A small image crawler: it parses a listing page for gallery links, extracts each gallery's <img> URLs with a regular expression, downloads the images to numbered .jpg files, and records every visited gallery URL in a MySQL table.

#!/usr/bin/python3
# -*- coding: UTF-8 -*-
 
from urllib.request import Request, urlopen
import re
import time
import mysql.connector

def saveDownedurl(downedurl):
    """Record a visited gallery URL in the downedurl table."""
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    cursor = conn.cursor()
    # Parameterized query: the URL is passed as a bound value, not interpolated.
    sql = "INSERT INTO downedurl (picurl) VALUES (%s)"
    cursor.execute(sql, [downedurl])
    conn.commit()
    print(cursor.rowcount, "record inserted.")
    conn.close()
    # sql = "INSERT INTO downedurl (picurl) VALUES (url)"
    # cursor.execute(sql)
    # conn.commit() 
    # print(cursor.rowcount, "记录插入成功。")
    # conn.close()
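For reference, the script assumes a local MySQL database named picurl containing a downedurl table with a picurl column; only those names appear in the code, so the column type below is an assumption. A minimal one-time setup sketch:

import mysql.connector

# Create the table the crawler writes to; VARCHAR(512) is an assumed length.
conn = mysql.connector.connect(user='root', password='694521', database='picurl')
cursor = conn.cursor()
cursor.execute(
    "CREATE TABLE IF NOT EXISTS downedurl ("
    " id INT AUTO_INCREMENT PRIMARY KEY,"
    " picurl VARCHAR(512) NOT NULL)"
)
conn.commit()
conn.close()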


def download_pic(pic_url, root_url, down_times):
    """Download one image to <counter>.jpg and return the incremented counter."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        # Many image hosts check the Referer header, so send the gallery page URL.
        'Referer': root_url
    }
    down_path = str(down_times) + '.jpg'
    print(down_path)
    request = Request(pic_url, headers=headers)
    data = urlopen(request).read()
    with open(down_path, 'wb') as f:
        f.write(data)  # the with block closes the file; no explicit close() needed
    return down_times + 1
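download_pic has no error handling, so a single failed or timed-out request crashes the whole crawl. A hedged variant (a sketch, not part of the original script) would catch urllib.error exceptions and skip the offending image:

from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def download_pic_safe(pic_url, root_url, down_times):
    """Like download_pic, but skip images that fail instead of crashing."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        'Referer': root_url
    }
    try:
        data = urlopen(Request(pic_url, headers=headers), timeout=10).read()
    except (HTTPError, URLError) as e:
        print('skipping', pic_url, e)
        return down_times  # counter unchanged; nothing was written
    with open(str(down_times) + '.jpg', 'wb') as f:
        f.write(data)
    return down_times + 1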




def jiexi_rootPic_url(next_rootUrl, down_times):
    """Fetch one gallery page, extract every <img> src, and download each image."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    downtime = down_times
    request_url = Request(next_rootUrl, headers=headers)
    response = urlopen(request_url).read().decode("utf-8")
    pattern = re.compile('<img src="(.*?)"', re.IGNORECASE)
    pic_path = pattern.findall(response)
    for i in pic_path:
        print('download_prepare')
        downtime = download_pic(i, next_rootUrl, downtime)
        print(i)
    time.sleep(2)  # pause between gallery pages to go easy on the server
    return downtime
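The regex above returns each src attribute verbatim, so a relative path such as /pics/1.jpg would fail inside download_pic. If the site ever serves relative URLs, a hypothetical normalization helper (not in the original) using urllib.parse.urljoin would resolve them against the page URL:

from urllib.parse import urljoin

def normalize_src(page_url, src):
    """Resolve a possibly relative img src against the page it came from.
    Absolute URLs pass through unchanged."""
    return urljoin(page_url, src)

# e.g. normalize_src('http://mmff30.com/rnyy123.html', '/pics/1.jpg')
# -> 'http://mmff30.com/pics/1.jpg'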


def jiexi_url(root_url, down_times):
    """Fetch the listing page, find every /rnyy...html gallery link, and crawl each."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    downtime = down_times
    request_url = Request(root_url, headers=headers)
    html = urlopen(request_url).read().decode("utf-8")
    # Escape the dot so ".html" is matched literally.
    pattern = re.compile(r'/rnyy(.*?)\.html', re.IGNORECASE)
    all_next_root = pattern.findall(html)
    for i in all_next_root:
        path = 'http://mmff30.com/rnyy' + i + '.html'
        print(path)
        saveDownedurl(path)
        downtime = jiexi_rootPic_url(path, downtime)
    return downtime
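saveDownedurl writes every visited gallery URL to MySQL, but nothing in the script ever reads the table back. A hedged sketch of the complementary lookup (a hypothetical helper, not in the original) would let a restarted crawl skip galleries it has already recorded:

import mysql.connector

def already_downed(url):
    """Return True if this gallery URL is already recorded in downedurl."""
    conn = mysql.connector.connect(user='root', password='694521', database='picurl')
    cursor = conn.cursor()
    cursor.execute("SELECT 1 FROM downedurl WHERE picurl = %s LIMIT 1", [url])
    found = cursor.fetchone() is not None
    conn.close()
    return found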




if __name__ == '__main__':
    # Start from listing page rwmy_9_3, numbering downloads from 4000.
    jiexi_url('http://mmff30.com/rwmy_9_3.html', 4000)
Original article: https://www.cnblogs.com/ytCui/p/13055992.html