Python: Crawling Baidu Baike Entries

How to crawl?
  Define the goal: crawl Baidu Baike starting from the entry "python" at the initial URL http://baike.baidu.com/item/Python; collect 1000 records, crawling only the title, the summary, and the URLs that appear in the summary.
  How to crawl: use the Chrome developer tools to inspect the HTML structure and work out how to locate each element.
  How to write it: procedurally or object-oriented (a minimal object-oriented sketch follows the procedural script below).
Environment:
  Python 3.5
  requests library
  BeautifulSoup (bs4) library
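A quick way to confirm the environment before running the script (a minimal check, assuming requests and bs4 were installed with pip):

import sys
import requests
import bs4
print(sys.version)            # expect 3.5.x
print(requests.__version__)   # any recent requests release works
print(bs4.__version__)        # BeautifulSoup 4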
Crawling with the procedural approach
#!/usr/bin/python3
import re
import requests
from bs4 import BeautifulSoup


# Crawl three fields from Baidu Baike: title, summary, and related URLs
# Starting entry: python; initial URL: http://baike.baidu.com/item/Python; target: 1000 records
# Four modules: URL manager, downloader, parser, and data output
# requests and BeautifulSoup implement the downloader and parser;
# two sets implement the URL manager
# URL joining uses the site root: http://baike.baidu.com
# new_urls = set()   # URLs waiting to be crawled
# old_urls = set()   # URLs already crawled
# A URL already in old_urls is never crawled again; unseen URLs are added to
# new_urls, and the manager pops one of them for each crawl step
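# A minimal illustration of the two-set idea, with made-up URLs:
#   old_urls = {'http://baike.baidu.com/item/A'}
#   found    = {'http://baike.baidu.com/item/A', 'http://baike.baidu.com/item/B'}
#   found.difference(old_urls)  ->  only .../item/B is unseen, so only it gets queued
#   new_urls.pop()              ->  hand one queued URL to the downloader
#   old_urls.add(url)           ->  mark it as crawled so it is never queued again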

def url_manager(links):
    # Queue only the URLs that have not been crawled yet
    if links is not None:
        new_urls.update(links.difference(old_urls))


def download_html(url):
    headers = {
              'Referer': 'http://baike.baidu.com/item/Python',
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
    # Download the page; return its text, or None on any request failure
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return None
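# A quick sanity check for the downloader (commented out so the script flow is
# unchanged; it needs network access, and the fetched content is just an assumption):
#   page = download_html('http://baike.baidu.com/item/Python')
#   print(len(page) if page else 'download failed')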


def analysis(page_html, one_url):
    # Parse one page: record '<page_url> : <title>_<summary>' in message
    # and return the related URLs found in the summary
    links = []
    temp_url = 'http://baike.baidu.com'
    soup = BeautifulSoup(page_html, 'html.parser')

    # Extract the title
    title = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1').get_text()

    # Extract the summary; normalize non-breaking spaces and strip surrounding whitespace
    introduction = soup.find('div', class_="lemma-summary").get_text().replace('\xa0', ' ').strip()

    # Collect related URLs: only in-site links (href starting with /item/) inside the summary
    link_labels = soup.find('div', class_="lemma-summary").find_all('a', href=re.compile("^/item/"))

    for link in link_labels:
        new_url = temp_url + link['href']
        links.append(new_url)
    temp = one_url + ' : ' + title + '_' + introduction
    message.append(temp)

    # Deduplicate before handing the links to the URL manager
    return set(links)
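# A standalone demo of the summary-link filter on a made-up HTML fragment:
#   sample = ('<div class="lemma-summary"><a href="/item/Guido">in-site</a>'
#             '<a href="http://example.com">external</a></div>')
#   s = BeautifulSoup(sample, 'html.parser')
#   s.find('div', class_='lemma-summary').find_all('a', href=re.compile('^/item/'))
#   -> matches only the /item/Guido link; the external link is skipped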


def out_data():
    # Print every collected record in crawl order
    for i in message:
        print(i)


if __name__ == '__main__':
    new_urls = set()
    old_urls = set()
    message = []
    start_url = 'http://baike.baidu.com/item/Python'
    # Crawl the start page first and queue the URLs found in its summary
    page_html = download_html(start_url)
    links = analysis(page_html, start_url)
    url_manager(links)
    old_urls.add(start_url)  # mark the start page as crawled

    # Crawl queued URLs until the 1000-record target is reached or the queue empties
    while new_urls and len(message) < 1000:
        url = new_urls.pop()
        old_urls.add(url)  # mark as crawled even if the download or parse fails
        try:
            page_html = download_html(url)
            if page_html:
                urls = analysis(page_html, url)
                url_manager(urls)
        except Exception:
            print('Failed to crawl: ' + url)
    # Print all crawled records
    out_data()
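
A sketch of the object-oriented approach
The intro mentions an object-oriented direction as well; here is a minimal sketch of the same four modules organized as classes. The class and method names are my own illustration, not the original author's code; the selectors simply mirror the procedural version above.

import re
import requests
from bs4 import BeautifulSoup


class UrlManager:
    # Two-set URL manager: new_urls is the queue, old_urls the crawled record
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add(self, links):
        if links:
            self.new_urls.update(links - self.old_urls)

    def pop(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_next(self):
        return bool(self.new_urls)


class Downloader:
    # Thin wrapper around requests; returns None on any request failure
    def get(self, url):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            return None


class Parser:
    # Same selectors as the procedural version: title, summary, /item/ links
    def parse(self, html, page_url, root='http://baike.baidu.com'):
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
        summary_tag = soup.find('div', class_='lemma-summary')
        summary = summary_tag.get_text().replace('\xa0', ' ').strip()
        links = {root + a['href']
                 for a in summary_tag.find_all('a', href=re.compile('^/item/'))}
        return page_url + ' : ' + title + '_' + summary, links


if __name__ == '__main__':
    manager, downloader, parser = UrlManager(), Downloader(), Parser()
    manager.add({'http://baike.baidu.com/item/Python'})
    records = []
    while manager.has_next() and len(records) < 1000:
        url = manager.pop()
        html = downloader.get(url)
        if not html:
            continue
        try:
            record, links = parser.parse(html, url)
        except AttributeError:  # page layout differs from the expected one; skip it
            continue
        records.append(record)
        manager.add(links)
    for record in records:
        print(record)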

  


Original article: https://www.cnblogs.com/2bjiujiu/p/7193785.html