Python 爬取西刺代理

爬取 IP 的代码

import requests
import re
import  dauk
from bs4 import BeautifulSoup
import time
def daili():
    """Scrape IPv4 addresses from the "nt" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/nt/``, parses each page
    with BeautifulSoup and searches the text of every ``<td>`` cell for an
    IPv4 address.  Every address found is printed and appended, one per line,
    to ``采集到的IP.txt`` (a path relative to the working directory).

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP,默认为99页')
    # One dotted-quad octet (0-255).  The pattern has to be a raw string with
    # \d and \. escaped; written as plain "d" and "." it looks for a literal
    # letter "d" and lets any character act as the separator.
    octet = r'(25[0-5]|2[0-4]\d|[0-1]\d{2}|[1-9]?\d)'
    ip_pattern = re.compile(r'\.'.join([octet] * 4))
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:58.0) Gecko/20100101 Firefox/48.0'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for b in range(1, 100):
        url = "http://www.xicidaili.com/nt/{}".format(b)
        r = requests.get(url, headers=header)
        gsx = BeautifulSoup(r.content, 'html.parser')
        found = []
        for line in gsx.find_all('td'):
            # findall returns one 4-tuple of octets per address.
            for g in ip_pattern.findall(line.get_text()):
                po = ".".join(g)
                print(po)
                found.append(po)
        if found:
            # Open the file once per page instead of once per address.
            with open('采集到的IP.txt', 'a') as out:
                out.writelines(po + '\n' for po in found)

# Module-level call: daili() runs as soon as this code is executed,
# which includes the moment the module is imported.
daili()


def dailigaoni():
    """Scrape IPv4 addresses from the "nn" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/nn/`` and searches the
    whole response text for IPv4 addresses.  Every address found is printed
    and appended, one per line, to ``采集到的IP.txt``.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP,默认为99页')
    # One dotted-quad octet (0-255).  Raw string with \d and \. escaped;
    # without the backslashes "d" is a literal letter and "." matches anything.
    octet = r'(25[0-5]|2[0-4]\d|[0-1]\d{2}|[1-9]?\d)'
    ip_pattern = re.compile(r'\.'.join([octet] * 4))
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for i in range(1, 100):
        url = "http://www.xicidaili.com/nn/{}".format(i)
        r = requests.get(url, headers=header)
        # Search the decoded body rather than the repr of the raw bytes.
        found = [".".join(g) for g in ip_pattern.findall(r.text)]
        for vks in found:
            print(vks)
        if found:
            # Open the file once per page instead of once per address.
            with open('采集到的IP.txt', 'a') as out:
                out.writelines(vks + '\n' for vks in found)
# Module-level call: dailigaoni() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailigaoni()

def dailihtp():
    """Scrape IPv4 addresses from the "wn" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/wn/`` and searches the
    whole response text for IPv4 addresses.  Every address found is printed
    and appended, one per line, to ``采集到的IP.txt``.

    NOTE(review): the function name suggests HTTP proxies while the URL path
    is "wn"; confirm against the site that this is the intended listing.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP,默认为99页')
    # One dotted-quad octet (0-255).  Raw string with \d and \. escaped;
    # without the backslashes "d" is a literal letter and "." matches anything.
    octet = r'(25[0-5]|2[0-4]\d|[0-1]\d{2}|[1-9]?\d)'
    ip_pattern = re.compile(r'\.'.join([octet] * 4))
    # requests needs a mapping for headers; a string holding a dict literal
    # is not accepted.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for x in range(1, 100):
        url = "http://www.xicidaili.com/wn/{}".format(x)
        r = requests.get(url, headers=header)
        # A str pattern cannot be applied to the bytes in r.content, so the
        # decoded text is searched instead.
        found = [".".join(kl) for kl in ip_pattern.findall(r.text)]
        for kgf in found:
            print(kgf)
        if found:
            with open('采集到的IP.txt', 'a') as out:
                # One address per line; without the newline all addresses
                # run together in the file.
                out.writelines(kgf + '\n' for kgf in found)
# Module-level call: dailihtp() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailihtp()

def dailihttps():
    """Scrape IPv4 addresses from the "wt" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/wt/`` and searches the
    whole response text for IPv4 addresses.  Every address found is printed
    and appended, one per line, to ``采集到的IP.txt``.

    NOTE(review): the function name suggests HTTPS proxies while the URL path
    is "wt"; confirm against the site that this is the intended listing.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬代理IP,默认为99页')
    # One dotted-quad octet (0-255).  Raw string with \d and \. escaped;
    # without the backslashes "d" is a literal letter and "." matches anything.
    octet = r'(25[0-5]|2[0-4]\d|[0-1]\d{2}|[1-9]?\d)'
    ip_pattern = re.compile(r'\.'.join([octet] * 4))
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for s in range(1, 100):
        url = "http://www.xicidaili.com/wt/{}".format(s)
        r = requests.get(url, headers=header)
        # A str pattern cannot be applied to the bytes in r.content, so the
        # decoded text is searched instead.
        found = [".".join(lk) for lk in ip_pattern.findall(r.text)]
        for los in found:
            print(los)
        if found:
            with open('采集到的IP.txt', 'a') as out:
                # One address per line; without the newline all addresses
                # run together in the file.
                out.writelines(los + '\n' for los in found)
# Module-level call: dailihttps() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailihttps()

爬取端口的代码

import requests
import re
from bs4 import BeautifulSoup


def daili():
    """Scrape proxy port numbers from the "nt" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/nt/``, parses each page
    with BeautifulSoup and keeps every ``<td>`` cell whose text is a decimal
    number in the port range 0-65535.  Every port found is printed and
    appended, one per line, to ``采集到的端口.txt.txt``.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP端口,默认为99页')
    # get_text() strips the <td> tags, so the cell text itself is matched
    # (a pattern containing "<td>...</td>" can never match it).  The regex
    # accepts up to five digits without leading zeros; the upper bound of
    # the port range is checked numerically below.
    port_pattern = re.compile(r'0|[1-9]\d{0,4}')
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:58.0) Gecko/20100101 Firefox/48.0'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for b in range(1, 100):
        url = "http://www.xicidaili.com/nt/{}".format(b)
        r = requests.get(url, headers=header)
        gsx = BeautifulSoup(r.content, 'html.parser')
        found = []
        for line in gsx.find_all('td'):
            po = line.get_text().strip()
            if port_pattern.fullmatch(po) and int(po) <= 65535:
                # The port is written as-is; joining its characters with "."
                # would turn "8080" into "8.0.8.0".
                print(po)
                found.append(po)
        if found:
            # NOTE(review): the double ".txt.txt" extension is kept unchanged;
            # confirm whether a single ".txt" was intended.
            with open('采集到的端口.txt.txt', 'a') as out:
                out.writelines(po + '\n' for po in found)


# Module-level call: daili() runs as soon as this code is executed,
# which includes the moment the module is imported.
daili()


def dailigaoni():
    """Scrape proxy port numbers from the "nn" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/nn/`` and extracts the
    content of every ``<td>...</td>`` cell in the response text that is a
    decimal number in the port range 0-65535.  Every port found is printed
    and appended, one per line, to ``采集到的端口.txt.txt``.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP的端口,默认为99页')
    # Up to five digits without leading zeros inside a bare <td> cell; the
    # upper bound of the port range is checked numerically below.  \d must
    # be escaped in a raw string, a plain "d" matches only the letter.
    port_pattern = re.compile(r'<td>(0|[1-9]\d{0,4})</td>')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for i in range(1, 100):
        url = "http://www.xicidaili.com/nn/{}".format(i)
        r = requests.get(url, headers=header)
        # Search the decoded body rather than the repr of the raw bytes.
        # With a single group, findall returns plain strings, so each port is
        # used as-is ("." joining would turn "8080" into "8.0.8.0").
        found = [vks for vks in port_pattern.findall(r.text) if int(vks) <= 65535]
        for vks in found:
            print(vks)
        if found:
            # NOTE(review): the double ".txt.txt" extension is kept unchanged;
            # confirm whether a single ".txt" was intended.
            with open('采集到的端口.txt.txt', 'a') as out:
                out.writelines(vks + '\n' for vks in found)


# Module-level call: dailigaoni() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailigaoni()


def dailihtp():
    """Scrape proxy port numbers from the "wn" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/wn/`` and extracts the
    content of every ``<td>...</td>`` cell in the response text that is a
    decimal number in the port range 0-65535.  Every port found is printed
    and appended, one per line, to ``采集到的端口.txt.txt``.

    NOTE(review): the function name suggests HTTP proxies while the URL path
    is "wn"; confirm against the site that this is the intended listing.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬取代理IP,默认为99页')
    # Up to five digits without leading zeros inside a bare <td> cell; the
    # upper bound of the port range is checked numerically below.  \d must
    # be escaped in a raw string, a plain "d" matches only the letter.
    port_pattern = re.compile(r'<td>(0|[1-9]\d{0,4})</td>')
    # requests needs a mapping for headers; a string holding a dict literal
    # is not accepted.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for x in range(1, 100):
        url = "http://www.xicidaili.com/wn/{}".format(x)
        r = requests.get(url, headers=header)
        # A str pattern cannot be applied to the bytes in r.content, so the
        # decoded text is searched.  Each match is already the complete port
        # string ("." joining would turn "8080" into "8.0.8.0").
        found = [kgf for kgf in port_pattern.findall(r.text) if int(kgf) <= 65535]
        for kgf in found:
            print(kgf)
        if found:
            # NOTE(review): the double ".txt.txt" extension is kept unchanged;
            # confirm whether a single ".txt" was intended.
            with open('采集到的端口.txt.txt', 'a') as out:
                # One port per line; without the newline all ports run
                # together in the file.
                out.writelines(kgf + '\n' for kgf in found)


# Module-level call: dailihtp() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailihtp()


def dailihttps():
    """Scrape proxy port numbers from the "wt" listing of xicidaili.com.

    Requests pages 1-99 of ``http://www.xicidaili.com/wt/`` and extracts the
    content of every ``<td>...</td>`` cell in the response text that is a
    decimal number in the port range 0-65535.  Every port found is printed
    and appended, one per line, to ``采集到的端口.txt``.

    NOTE(review): the function name suggests HTTPS proxies while the URL path
    is "wt"; confirm against the site that this is the intended listing.

    Returns:
        None.  Results are only printed and written to the file.

    Raises:
        Nothing is caught here: errors from ``requests.get`` or from opening
        the output file propagate to the caller.
    """
    print('[+]极速爬代理IP的端口,默认为99页')
    # Up to five digits without leading zeros inside a bare <td> cell; the
    # upper bound of the port range is checked numerically below.  \d must
    # be escaped in a raw string, a plain "d" matches only the letter.
    port_pattern = re.compile(r'<td>(0|[1-9]\d{0,4})</td>')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # range() excludes its stop value, so 100 is needed to reach page 99.
    for s in range(1, 100):
        url = "http://www.xicidaili.com/wt/{}".format(s)
        r = requests.get(url, headers=header)
        # A str pattern cannot be applied to the bytes in r.content, so the
        # decoded text is searched.  Each match is already the complete port
        # string ("." joining would turn "8080" into "8.0.8.0").
        found = [los for los in port_pattern.findall(r.text) if int(los) <= 65535]
        for los in found:
            print(los)
        if found:
            with open('采集到的端口.txt', 'a') as out:
                # One port per line; without the newline all ports run
                # together in the file.
                out.writelines(los + '\n' for los in found)


# Module-level call: dailihttps() runs as soon as this code is executed,
# which includes the moment the module is imported.
dailihttps()

  调用代码

# Start-up banner (ASCII art).  The art is full of backslashes that are not
# valid escape sequences (\_, \|, \`), so it is written as a raw string: the
# printed text is unchanged, but Python no longer warns about invalid escapes.
print(r'''
                   _ooOoo_ 
                  o8888888o 
                  88" . "88 
                  (| -_- |) 
                  O  =  /O 
               ____/`---'\____ 
             .'  \|     |//  `. 
            /  \|||  :  |||//   
           /  _||||| -:- |||||-   
           |   | \  -  /// |   | 
           | \_|  ''---/''  |   | 
             .-\__  `-`  ___/-. / 
         ___`. .'  /--.--  `. . __ 
      ."" '<  `.___\_<|>_/___.'  >'"". 
     | | :  `- \`.;` _ /`;.`/ - ` : | | 
        `-.   \_ __ /__ _/   .-` /  / 
======`-.____`-.___\_____/___.-`____.-'====== 
                   `=---=' 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
            佛祖保佑       永无BUG 
            ''')

# Show the rate-limit warning and the menu in one write, then read the
# user's selection into the module-level name `bk` (consumed by xs()).
print('\n'.join((
    '[!]爬虫速度过快,导致IP被封请更换IP',
    '[*]极速爬取代理IP',
    '1.普通代理IP',
    '2.高匿代理IP',
    '3.http代理IP',
    '4.https代理IP',
)))
bk = input('请选择:')
def xs():
    """Run the scrapers that match the menu choice stored in the global ``bk``.

    Imports ``代理.daili`` and ``代理.dauk`` first.  For the choices '1' to '4'
    it calls the matching function on ``代理.daili.daili``, then the function
    of the same name on ``代理.dauk``, and exits.  'q' exits without calling
    anything; any other value prints a "not found" message and returns.
    """
    import 代理.daili
    import 代理.dauk

    # Menu choice -> name of the scraper function to call on both modules.
    targets = {
        '1': 'daili',
        '2': 'dailigaoni',
        '3': 'dailihtp',
        '4': 'dailihttps',
    }

    if bk == 'q':
        exit()

    func_name = targets.get(bk)
    if func_name is None:
        print('[-]没有找到你要的选项')
        return

    getattr(代理.daili.daili, func_name)()
    getattr(代理.dauk, func_name)()
    exit()
# Dispatch on the choice read into `bk` above.
xs()

 

2018-02-17

原文地址:https://www.cnblogs.com/haq5201314/p/8451683.html