Web scraping with bs4

import requests
# res = requests.get('http://httpbin.org/get')
# res1 = res.json()

# Convert the response body to JSON data
# import json
# res1 = json.loads(res.text)  # too cumbersome -- res.json() already does this for you
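# A minimal sketch of the two approaches (httpbin.org/get returns a JSON body, so both should work):
# res = requests.get('http://httpbin.org/get')
# print(res.json()['url'])            # requests parses the JSON for you
# print(json.loads(res.text)['url'])  # same result, done by hand with the json module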

# SSL is the certificate-based security layer -- the "s" in https

# response = requests.get('https://www.12306.cn',
#                         cert=('/path/server.crt',
#                               '/path/key'))
# print(response.status_code)
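# If you only want to skip certificate verification (no client cert), a sketch:
# import urllib3
# urllib3.disable_warnings()  # silence the InsecureRequestWarning
# response = requests.get('https://www.12306.cn', verify=False)
# print(response.status_code)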

# Forward proxy
# Your request goes out through someone else's server to reach the target address
# Paid IP proxies (to verify one works, hit your own service through the proxy and check the client IP on the server side)
# proxies = {
#     # 'http': 'http://egon:123@localhost:9743',  # proxy with auth: user:password goes before the @
#     # 'http': 'http://localhost:9743',
#     'https': 'https://localhost:9743',
#     'http': 'http://124.205.155.148:9090'
# }
# response = requests.get('https://www.12306.cn',
#                         proxies=proxies)
#
# print(response.status_code)
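# A quick way to confirm the proxy is actually used (assumes the proxies dict above is uncommented and reachable;
# httpbin echoes back the client IP it sees):
# response = requests.get('http://httpbin.org/ip', proxies=proxies)
# print(response.json()['origin'])  # should show the proxy's IP, not yours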
# Timeout setting
# import requests
# response = requests.get('https://www.baidu.com',
#                         timeout=0.0001)
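# A timeout that short will almost certainly fail; a sketch of catching it:
# try:
#     response = requests.get('https://www.baidu.com', timeout=0.0001)
# except requests.exceptions.Timeout:
#     print('request timed out')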

# Upload a file
import requests
files = {'file': open('a.jpg', 'rb')}  # a.jpg must exist in the working directory
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
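# httpbin echoes the upload back, so a quick sanity check is possible:
# print(response.json()['files'].keys())  # should include 'file'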

from bs4 import BeautifulSoup
# BeautifulSoup turns HTML page data into an object you can navigate and search
'''
Two core methods: find and find_all.
find:
    -name="tag name"        match by tag name
    -id=..., class_=...     match by attribute and pull that tag out
    -tag.text               get the text inside the tag
    -tag.get(attr name)     get the value of one of the tag's attributes
find_all:
    same arguments as find, but returns every match as a list
'''
url = 'https://www.autohome.com.cn/news/1/#liststart'
res = requests.get(url)
soup = BeautifulSoup(res.text, "lxml")
div = soup.find(id="auto-channel-lazyload-article")
ul = div.find(name='ul')  # find only returns the first ul tag
# ul_list = div.find_all(class_="article")  # find every tag under div whose class is "article"
# print(len(ul_list))
li_list = ul.find_all(name='li')
# print(len(li_list))
for li in li_list:
    h3 = li.find(name='h3')
    if h3:
        title = h3.text  # take the text out of the h3 tag
        print(title)
    a = li.find(name='a')
    if a:
        article_url = a.get('href')  # take the href attribute out of the a tag
        print(article_url)

    img = li.find(name='img')
    if img:
        img_url = img.get('src')
        print(img_url)
    p = li.find(name='p')
    if p:
        content = p.text
        print(content)

# Whatever find returns is itself searchable -- you can keep calling find on the result
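# A small sketch of such chaining (reuses the autohome soup above; tag layout assumed from that page):
# first_li = soup.find(id="auto-channel-lazyload-article").find(name='ul').find(name='li')
# h3 = first_li.find(name='h3')
# print(h3.text if h3 else 'no h3 in this li')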


from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="title" id="bbaa"><b name="xx" age="18">The Dormouse's story</b><b>xxxx</b></p>
<p class="xxx" a="xxx">asdfasdf</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# soup=BeautifulSoup(html_doc,'lxml')
# ress=soup.prettify()  # pretty-print the markup
# soup=BeautifulSoup(ress,'lxml')
# print(ress)

# Traversing the document tree
# print(soup.p.name)
# print(soup.p.attrs)
# print(soup.p.string)
# print(list(soup.p.strings))
# print(soup.p.text)

# print(soup.body.p.text)
# print(soup.body.p.contents)
# print(list(soup.body.p.children))
# print(list(soup.body.p.descendants))
# print(soup.body.p.parent)
# print(list(soup.body.p.parents))
# print(len(list(soup.body.p.parents)))
# print(soup.body.p.previous_sibling)
# print(soup.find(class_="xxx").previous_sibling)
# print(soup.a.next_sibling)
# print(soup.a.previous_sibling)
# print(type(soup.p))
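# Note: .string returns a value only when the tag has exactly one child string,
# .strings yields every string inside the tag, and .text joins them all together.
# A small sketch (assumes the soup built from html_doc above is uncommented):
# print(soup.find(class_="title").string)         # None -- this p has two <b> children
# print(list(soup.find(class_="title").strings))  # ["The Dormouse's story", 'xxxx']
# print(soup.find(class_="title").text)           # "The Dormouse's storyxxxx"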



# Searching the document tree
# Five kinds of filters: string, regex, boolean (True), method, list
import re
# print(soup.find_all(name='b'))


# print(soup.find_all(name=re.compile('^b')))
# print(soup.find_all(id=re.compile('^b')))


# print(soup.find_all(name=['a','b']))
# print(soup.find_all(name=True))

# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# print(soup.find_all(name=has_class_but_no_id))

# CSS selectors
# xpath
# print(soup.select(".title"))
# print(soup.select("#bbaa"))

# print(soup.select('#bbaa b')[0].attrs.get('name'))

# recursive=False: only search direct children, not deeper levels
# limit: stop after finding this many matches
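# A small sketch of both options (assumes the soup built from html_doc above is uncommented):
# print(soup.html.find_all(name='p', recursive=False))  # [] -- the p tags sit under body, not directly under html
# print(soup.find_all(name='a', limit=2))               # only the first two a tags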

# Sibling navigation
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'lxml')
print(sibling_soup.b.next_sibling)      # the <c> tag
print(sibling_soup.c.previous_sibling)  # the <b> tag



from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time

# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000')               # set the browser window size
# chrome_options.add_argument('--disable-gpu')                       # Google's docs suggest this flag to work around a bug
# chrome_options.add_argument('--hide-scrollbars')                   # hide scrollbars; helps on some special pages
# chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images to speed things up
# chrome_options.add_argument('--headless')                          # no visible window; on Linux without a display, startup fails without this
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # manually specify the Chrome binary
# bro=webdriver.PhantomJS()

# bro=webdriver.Chrome(chrome_options=chrome_options)
bro=webdriver.Chrome()
bro.get('https://www.baidu.com')

# print(bro.page_source)
# time.sleep(3)
time.sleep(1)
# grab the search input box
inp=bro.find_element_by_id('kw')
# type text into the box
inp.send_keys("美女")
inp.send_keys(Keys.ENTER)  # press Enter
# alternative: grab the search button (id 'su') and click it instead, as sketched below
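# A sketch of that alternative (assumes Baidu still gives the search button the id 'su'):
# btn = bro.find_element_by_id('su')
# btn.click()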
time.sleep(3)
bro.close()
Original article: https://www.cnblogs.com/yangxinpython/p/11938124.html