用 Python 查看网站被百度收录的所有网址与标题,进行 SEO 分析

     把 SEO 和 Python 数据分析结合起来,是一个很好的方法。下面是我闲暇时尝试写的一个脚本,用来分析网站被百度收录的网址和标题。

  首先得引入两个 Python 模块,分别是:BeautifulSoup 和 requests

    没有下载这两个模块的可以用以下命令下载:

    pip install beautifulsoup4

 pip install requests
     
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Query the URLs and titles of a site's pages that are indexed by Baidu.

Requests the Baidu result pages of a ``site:`` search one after another and
prints the title and link of every result entry found on each page.
'''
 
import requests
from random import randint
from bs4 import BeautifulSoup
import re
import datetime
import sys
 
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str/unicode conversions of the Chinese text used by this script do
# not raise UnicodeDecodeError.  Python 3 has neither a builtin reload() nor
# sys.setdefaultencoding() and already uses UTF-8, so the hack is skipped
# there instead of crashing with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # brings back sys.setdefaultencoding, which is removed at startup
    sys.setdefaultencoding("utf-8")
 
# HTTP headers sent with every request.  The User-Agent is a desktop Chrome
# string, and X-Forwarded-For carries a random address that is generated once,
# when this statement runs.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    # Four random octets joined with '.', i.e. dotted-decimal IPv4 notation;
    # a ':'-separated value would be neither a valid IPv4 nor IPv6 address.
    "X-Forwarded-For": ".".join(str(randint(1, 255)) for _ in range(4)),
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
}
 
# Record the moment the run starts and show it to the user.
start_time = datetime.datetime.now()
# The '%' formatting happens inside the call so the statement behaves the same
# on Python 2 and Python 3; applying '%' to the result of print(...) raises
# TypeError on Python 3.
print(u'[-] 现在时间:%s' % start_time)
 
# The search URL is split around the value of the ``pn`` query parameter, which
# carries the result offset of the requested page.  Both halves are constant,
# so they are built once, outside the loop.
url_a = 'https://www.baidu.com/s?wd=site%3Azhimo.yuanzhumuban.cc&pn='
url_b = ('&rsv_spt=1&rsv_iqid=0xac952cfa0005be29&issp=1&f=8&rsv_bp=0'
         '&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=1')

# Walk the result pages: offsets 0, 10, 20, ... 740.
for pn in range(0, 750, 10):
    # NOTE: the number printed here is the result offset, not a 1-based page index.
    print(u'第【%s】页' % pn)
    joinUrl = url_a + str(pn) + url_b

    # Download the raw page bytes; the timeout keeps a stalled connection
    # from blocking the whole run indefinitely.
    html_Doc = requests.get(joinUrl, headers=HEADERS, timeout=10).content
    html_Soup = BeautifulSoup(html_Doc, 'html.parser', from_encoding='utf-8')

    # Collect every <h3> element whose class is "t".
    all_H3 = html_Soup.find_all('h3', attrs={'class': 't'})
    print(u'[+] 此页共找到%s条数据!' % len(all_H3))

    for each in all_H3:
        # Read title and link from the parsed <a> element rather than
        # regex-matching the serialized tag: this does not depend on the
        # attribute order and yields the title text without nested markup.
        anchor = each.find('a')
        if anchor is None:
            # A heading without a link has nothing to report; skip it
            # instead of failing on a missing match.
            continue
        title = anchor.get_text(strip=True)
        link = anchor.get('href', '')
        print(u'[-] 标题:%s 链接:%s' % (title, link))

  

原文地址:https://www.cnblogs.com/68xi/p/9295183.html