python爬虫

一：爬虫类型

　　定向：指定网站爬取

　　非定向：任意网站爬取

二：简单流程

　　下载网页

　　　　requests模块

　　筛选特定标签

　　　　BeautifulSoup/正则表达式

三：BeautifulSoup模块介绍

1.name属性，标签名称

# Parse the sample document with the lxml parser.
# NOTE(review): `html_doc` is not defined in this snippet — presumably the HTML
# string used throughout the examples; confirm it is defined before this line.
soup=BeautifulSoup(html_doc,features='lxml')
# t1=soup.find(id='xml')
# print(t1.name)
# t1.name='div'  # rename the tag
# print(soup)

2.attrs属性，获取标签的所有属性，类型为字典

# t2=soup.find(name='p')
# print(t2.attrs)#查看该标签的所有属性
# print(t2.attrs.get('id'))
# t2.attrs={'name':'first','id':'irving'}#修改该标签的属性
# print(soup)
# t2.attrs['id']='curry'
# print(soup)

3.children,所有子标签

#children属性，获取所有的孩子标签
# from bs4.element import Tag
# t3=soup.find('div')
# print(list(t3.children))
# for item in t3:
#     if type(item) == Tag:#判断是否是标签
#         print('标签：',item)
#     else:
#         print('文本：',item)

4.descendants属性，获取所有的子子孙孙的标签及其内容

# t4=soup.find('body')
# print(list(t4.descendants))

5.clear,将标签的所有子标签全部清空（保留标签名）

# t6=soup.find('div').clear()
# print(soup)

6.decompose,递归地删除当前标签及其所有子标签

# t7=soup.find('div')
# t7.decompose()
# print(soup)

7.extract,递归地删除当前标签及其所有子标签，并返回被删除的标签

# t8=soup.find('div')
# print(t8.extract())#显示被删除的标签

8. 方法 decode,转换为字符串（含当前标签）；decode_contents（不含当前标签）

# t9=soup.find('div')
# print(type(t9.decode()))
# print(t9.encode())#返回字节模式

9.方法 find,获取匹配的第一个标签

# t10=soup.find('div')
#print(t10)
# t11 = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')#是否在子孙标签中查找
# t12=soup.find(name='a', attrs={'class': 'sister'}, recursive=False, text='Lacie')
# print(t12)

10.方法 find_all()获取匹配的所有标签

# t11=soup.find_all('a')
#print(t11)
# t11=soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# print(t11)
#列表
# t11=soup.find_all(['div','a'])#查找标签为div或a的
# print(t11)
#t11=soup.find_all(class_=['title','story'])#查找class属性为title或story的
#print(t11)
#正则
# import re
# rep=re.compile('^p')
# t12=soup.find_all(rep)
# print(t12)

11.get()方法获取标签属性值

# t13=soup.find('p')
# print(t13.get('class'))

12. has_attr()方法,检查标签是否具有该属性

# t14=soup.find('p')
# print(t14.has_attr('name'))

13.get_text()方法，获取标签文本信息

# Look up the first <p> tag, then print the text it contains.
t15 = soup.find('p')
paragraph_text = t15.get_text()
print(paragraph_text)

14.index方法,检查标签在某标签中的索引位置

# tar=soup.find('div')
# print(tar)
# print(len(tar))
# print(tar.index(tar.find('h1')))
# for i,v in enumerate(tar):
#     print(i,v)

15.is_empty_element属性,是否是自闭合标签

# tar = soup.find('img')
# Report whether the first <div> is an empty-element (self-closing) tag.
tar = soup.find('div')
is_self_closing = tar.is_empty_element
print(is_self_closing)

16.属性,找出当前标签的关联标签

# soup.next 查找下一个标签
# soup.next_element　　
# soup.next_elements　
# soup.next_sibling　
# soup.next_siblings　
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
 
#
# tag.parent
# tag.parents

 1 from bs4.element import Tag
 2 # tar=soup.find(class_='story')
 3 #print(list(tar.next_elements))#找出所有跟自己有关系的标签及其内容包括自己的子标签
 4 # print('
')
 5 # for i in tar.next_elements:
 6 #     if type(i)==Tag:
 7 #         print(i)
 8 #print(list(tar.next_siblings))#找出所有跟自己有关系的标签机器，不包括自己的子标签
 9 #print(list(tar.previous))
10 #print(list(tar.previous_elements))#从上面找出跟自己像相关的标签及其内容,包括body和html标签
11 #print(list(tar.previous_siblings))#从上面找处更自己相关的标签，不包括body,html标签
12 tar=soup.find(id='link3')
13 # print(tar.parent)#找出父亲
14 # print((tar).parent.name)
15 # print(list(tar.parents))#找出父亲及其以上备份的
16 # for i in tar.parents:
17 #     print(i.name)
18 
19 #print(list(tar.next_siblings))#找出所有跟自己有关系的标签机器，不包括自己的子标签
20 #print(list(tar.previous_siblings))#从上面找处更自己相关的标签，不包括body,html标签
21 # print(tar.parent)#找出父亲

实例

17.方法,查找某标签的关联标签

######和上面查找的内容一样，只不过是可以指定查找的下一个上一个的条件
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
 
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
 
# tag.find_parent(...)
# tag.find_parents(...)


# tar=soup.find(class_='story')#查找class为story的标签
# x=tar.find_next('p')#并且查找下一个标签为p的标签
# print(x)

18.select,select_one, CSS选择器语法

# CSS selector examples. Each select() call returns the collection of tags
# matching the selector.

# --- by tag name ---
soup.select("title")  # every <title> tag

# Fixed: the pseudo-class needs a leading ':'. Without it the selector means
# "an <nth-of-type> element nested inside a <p>".
soup.select("p:nth-of-type(3)")  # a <p> that is the third <p> among its siblings

# --- descendants (any depth) ---
soup.select("body a")  # <a> tags anywhere below <body>

soup.select("html head title")  # <title> below <head> below <html>

# --- selector groups ---
tag = soup.select("span,a")  # every <span> or <a>

# --- direct children ---
soup.select("head > title")

soup.select("p > a")

soup.select("p > a:nth-of-type(2)")  # the second <a> directly inside a <p>

soup.select("p > #link1")  # the element with id "link1" directly inside a <p>

soup.select("body > a")  # only <a> tags that are direct children of <body>

# --- siblings ---
soup.select("#link1 ~ .sister")  # all later siblings of #link1 with class "sister"

soup.select("#link1 + .sister")  # only the sibling immediately after #link1

# --- by class ---
soup.select(".sister")

soup.select("[class~=sister]")  # class attribute contains the word "sister"

# --- by id ---
soup.select("#link1")

soup.select("a#link2")  # an <a> whose id is "link2"

# --- by attribute ---
soup.select('a[href]')  # <a> tags that have an href attribute at all

soup.select('a[href="http://example.com/elsie"]')  # exact value

soup.select('a[href^="http://example.com/"]')  # value starts with

soup.select('a[href$="tillie"]')  # value ends with

soup.select('a[href*=".com/el"]')  # value contains

 
 
from bs4.element import Tag
 
def default_candidate_generator(tag):
    """Yield each descendant of *tag* that is a Tag and has an ``href`` attribute.

    Non-Tag descendants and tags without ``href`` are skipped.
    """
    for node in tag.descendants:
        if isinstance(node, Tag) and node.has_attr('href'):
            yield node
 
# Select <a> tags under <body>, drawing candidates only from the generator
# above (descendant tags that carry an href attribute).
# NOTE(review): `_candidate_generator` is a private keyword of older bs4
# releases; newer releases delegate select() to soupsieve and may not honour
# it — confirm against the installed bs4 version.
tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
print(type(tags), tags)
 
from bs4.element import Tag
def default_candidate_generator(tag):
    """Lazily produce the descendants of *tag* that are Tags with an ``href``.

    Anything that is not a Tag, or that has no ``href`` attribute, is left out.
    """
    linked_tags = (
        node
        for node in tag.descendants
        if isinstance(node, Tag) and node.has_attr('href')
    )
    yield from linked_tags
 
# Same selection as before, but `limit=1` asks select() to stop after the
# first match.
# NOTE(review): `_candidate_generator` is a private keyword of older bs4
# releases and may not be honoured by newer ones — confirm against the
# installed bs4 version.
tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
print(type(tags), tags)

19.string属性，获取标签的内容

# Grab the <body> tag; the commented lines below read and overwrite its .string.
tar=soup.find('body')
# print(tar.string)
# tar.string='this is my web'
# print(soup)
# print(list(tar.stripped_strings))  # all text fragments found under this tag

20.append()方法，在当前标签内部追加一个标签

from bs4.element import Tag

# Build a standalone <p id="it"> tag and give it some text.
obj = Tag(name='p', attrs={'id': 'it'})
obj.string = 'i am new element'

# Add the new tag as the last child of the tag currently held in `tar`.
tar.append(obj)
print(soup)

21.insert方法，在当前标签内部指定位置插入一个标签

# Pick the first <div> as the container to insert into.
tar = soup.find('div')

# Build an <input type="button"> tag and set its text.
tag = Tag(name='input', attrs={"type": 'button'})
tag.string = "点击"

# Place the new tag at position 2 among the container's children.
tar.insert(2, tag)
print(soup)

22. insert_after(),insert_before()方法，在当前标签后面或前面插入

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)

23.replace_with 将当前标签替换为指定标签

# Find the first <b> tag, then swap it for a freshly built
# <input type="button"> tag.
tar = soup.find('b')
tag = Tag(name='input', attrs={"type": 'button'})
tar.replace_with(tag)
print(soup)

24.wrap，用指定标签把当前标签包裹起来

tag = Tag(name='input', attrs={"type": 'button'})
tar = soup.find('p')

# Wrap the first <p> in the document's existing <img> tag: the <img> takes the
# place of the <p>, and the <p> becomes its child.
wrapper = soup.find('img')
tar.wrap(wrapper)

# print(list(tar.parents))
# tar.wrap(tag)  # alternative: wrap tar inside the newly created tag
print(soup)

25.unwrap（），去掉当前标签，但保留其包裹的内容和子标签

tar = soup.find('a')
# unwrap() removes only the <a> tag itself; its text and child tags stay in
# the document.
tar.unwrap()
print(soup)

四：获取汽车之家信息

import requests
import uuid
from bs4 import BeautifulSoup
# Download the autohome news index, print each article's link, thumbnail URL
# and title, and save every thumbnail under a random .jpg file name.

# A timeout keeps a stalled connection from hanging the script forever.
response = requests.get(url="https://www.autohome.com.cn/news/", timeout=10)
response.encoding = response.apparent_encoding  # decode with the encoding detected from the body
soup = BeautifulSoup(response.text, features='html.parser')

# Container that holds the article list. find() returns None when the id is
# absent (e.g. the page layout changed), so fall back to an empty list
# instead of raising AttributeError.
target = soup.find(id='auto-channel-lazyload-article')
li_list = target.find_all('li') if target is not None else []

for item in li_list:
    a = item.find('a')
    if a is None:
        # Some <li> entries carry no link; nothing to report for them.
        continue
    print(a.attrs.get('href'))

    # Look the thumbnail and heading up once, and skip entries that lack
    # either one rather than failing on None.
    b = a.find('img')
    heading = a.find('h3')
    img_url = b.attrs.get('src') if b is not None else None
    if heading is None or not img_url:
        continue
    print(img_url)

    c = heading.text  # .text yields a str; without it `heading` is a Tag object
    print(c, type(c))

    # Only prefix a scheme when the src has none (protocol-relative "//host/...");
    # an already absolute URL must be left untouched.
    if not img_url.startswith(('http://', 'https://')):
        img_url = 'http:' + img_url
    img_response = requests.get(url=img_url, timeout=10)
    if not img_response.ok:
        # Do not save an error page as if it were an image.
        continue

    file_name = str(uuid.uuid4()) + '.jpg'
    with open(file_name, 'wb') as f:
        f.write(img_response.content)