Basic Crawler

Code (the script fetches several listing pages, collects the <img> tags whose src starts with /uploads, and saves each image to a local folder):

# -*- coding: UTF-8 -*-
import os
import urllib.request

import requests
from bs4 import BeautifulSoup

start_url = 'http://www.521609.com/meinvxiaohua/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
saved_path = r'C:\Users\zhoutiax\Desktop\xiaohua'
x = 1  # running counter used to name the saved images


def crawl(url):
    content = requests.get(url, headers=headers, timeout=20).text
    soup = BeautifulSoup(content, "html.parser")
    img_tags = soup.find_all("img")
    global x
    if not os.path.exists(saved_path):
        os.makedirs(saved_path)
    for img_tag in img_tags:
        src = img_tag.get('src', '')
        # only the images hosted under /uploads are wanted
        if src.startswith('/uploads'):
            # url.split('/m')[0] recovers the site root, e.g. http://www.521609.com
            img = url.split('/m')[0] + src
            urllib.request.urlretrieve(img, os.path.join(saved_path, '%d.jpg' % x))
            x += 1


if __name__ == '__main__':
    for page in range(1, 5):  # crawl listing pages 1 through 4
        page_url = start_url + "list12%d.html" % page
        print(page_url)
        crawl(page_url)
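
One weakness of urllib.request.urlretrieve here is that it sends its own default User-Agent and has no timeout, so a slow or picky server can stall the loop. Below is a minimal sketch of a replacement download step using the requests library that the script already imports; the download_image name and the 20-second timeout are illustrative assumptions, not part of the original post:

import requests

def download_image(img_url, dest_path, headers=None, timeout=20):
    # hypothetical helper: stream the image to disk with requests so the
    # same browser-like headers and a timeout apply to image downloads too
    resp = requests.get(img_url, headers=headers, timeout=timeout, stream=True)
    resp.raise_for_status()  # raise on 4xx/5xx instead of saving an error page
    with open(dest_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)

Inside crawl, the urlretrieve call would then become download_image(img, os.path.join(saved_path, '%d.jpg' % x), headers=headers).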

  

Original post: https://www.cnblogs.com/nevermore29/p/9606035.html