python爬虫笔记01

1.urllib库中request,parse的学习

1.1 简单的请求页面获取,并下载到本地 request的使用

from urllib import request

# Fetch the page; urlopen returns a file-like HTTP response object.
resp = request.urlopen('http://www.baidu.com')

# Read the first 10 bytes.
# Result: b'<!DOCTYPE '   (the b prefix marks a bytes object, i.e. a byte stream)
# '<!DOCTYPE ' including the trailing space is exactly ten characters.
# print(resp.read(10))

# Read a single line.
# Result: b'<!DOCTYPE html>\n'
# \n is the newline character.
# An optional argument in the parentheses caps how many bytes are read from
# that line (it is a size limit, not a number of lines).
# print(resp.readline())

# Read everything; the two calls return different types:
#   readlines() -> <class 'list'>
#   read()      -> <class 'bytes'>
# NOTE: the response is a one-shot stream, so whichever call runs second only
# sees what the first one left behind.
# print(type(resp.readlines()))
# print(type(resp.read()))


# Download the page into a local file.
request.urlretrieve('http://www.baidu.com', 'baidu.html')

1.2 parse的使用

1.2.1 解决中文与码的对应问题

例:URL 中的中文会被转换成百分号编码,如 name=%E9%AB%98%E8%BE%BE;本节介绍中文与这种带 % 的编码之间的相互转换

 # Percent-encoding demo: a URL cannot carry raw Chinese text.

from urllib import request
from urllib import parse

# Chinese text becomes percent-encoded, e.g. name=%E9%AB%98%E8%BE%BE&age=21

# urlencode turns a dict into a query string.
params = {'name': '高达', 'age': '21'}
result = parse.urlencode(params)
# Output:
# <class 'str'>
# name=%E9%AB%98%E8%BE%BE&age=21
print(type(result))
print(result)

# Writing Chinese straight into the URL raises an error: what we see is
# Chinese, but what actually travels is its percent-encoded form.
# resp = request.urlopen('https://www.baidu.com/s?wd=刘德华')


# Describe the query as a dict ...
params = {'wd': '刘德华'}
print(params)
# ... and encode it into the form a URL accepts.
qs = parse.urlencode(params)
print(qs)
# With the encoded query the request goes through.
resp = request.urlopen('https://www.baidu.com/s?'+qs)
# print(resp.readlines())
# parse.parse_qs decodes the query string back into a dict.
result = parse.parse_qs(qs)
# Output:
# <class 'dict'>
# {'wd': ['刘德华']}
print(type(result))
print(result)

1.2.2 解析网址

# 解析网址
from urllib import parse


def test():
    """Print how ``urlparse`` and ``urlsplit`` break down two sample URLs.

    Four lines are printed: the type of the ``urlparse`` result, the
    ``urlsplit`` result for a URL whose path carries ``;params``, and then
    both results for the same URL without ``;params``.
    """
    # Sample URLs: the first has a ';sssas' parameter on the last path segment.
    with_params = 'http://www.baidu.com/s;sssas?wd=python&username=abc#1'
    without_params = 'http://www.baidu.com/s?wd=python&username=abc#1'
    # Only referenced by the commented-out experiment at the end.
    long_query_url = 'https://api.bilibili.com/x/v2/reply?callback=jQuery17204184039578913048_1580701507886&jsonp=jsonp&pn=1&type=1&oid=70059391&sort=2&_=1580701510617'

    # Prints <class 'urllib.parse.ParseResult'>
    print(type(parse.urlparse(with_params)))
    # The urlparse result itself would be:
    # ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='sssas',
    #             query='wd=python&username=abc', fragment='1')
    # Its fields can be printed one by one:
    # parsed = parse.urlparse(with_params)
    # print('scheme:', parsed.scheme)
    # print('netloc:', parsed.netloc)
    # print('path: ', parsed.path)
    # print('params: ', parsed.params)
    # print('query: ', parsed.query)
    # print('fragment: ', parsed.fragment)
    print(parse.urlsplit(with_params))

    # urlparse vs urlsplit: the urlparse result has one extra field, params.
    for splitter in (parse.urlparse, parse.urlsplit):
        print(splitter(without_params))
    # print('r3:', parse.urlparse(long_query_url).query)


# Pretty-printing helpers
def myself_urlparse(url):
    """Print every component of *url*, as split by ``parse.urlparse``.

    One ``label value`` line is printed per component, in the order
    scheme, netloc, path, params, query, fragment.
    """
    parts = parse.urlparse(url)
    labelled = (
        ('scheme:', parts.scheme),
        ('netloc:', parts.netloc),
        ('path: ', parts.path),
        ('params: ', parts.params),
        ('query: ', parts.query),
        ('fragment: ', parts.fragment),
    )
    for label, value in labelled:
        print(label, value)


def myself_urlsplit(url):
    """Print every component of *url*, as split by ``parse.urlsplit``.

    One ``label value`` line is printed per component, in the order
    scheme, netloc, path, query, fragment (urlsplit has no params field).
    """
    labels = ('scheme:', 'netloc:', 'path: ', 'query: ', 'fragment: ')
    # The split result is a 5-tuple in exactly the order of the labels above.
    for label, value in zip(labels, parse.urlsplit(url)):
        print(label, value)


# Call test() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    test()

 1.3 使用代理IP

# ip代理
from urllib import request
# Without a proxy the request goes out from the local IP.
url = 'http://www.baidu.com'
resp = request.urlopen(url)
# print(resp.read())

# With a proxy: map the URL scheme to the proxy address, wrap the mapping in
# a ProxyHandler, and build an opener that routes its requests through it.
proxy_by_scheme = {'http': '60.170.234.221:65309'}
handler = request.ProxyHandler(proxy_by_scheme)
opener = request.build_opener(handler)
resp = opener.open(url)
body = resp.read()
print(body)

1.4 CookieJar

cookie信息可以直接写在headers请求头里面,也可以创建CookieJar对象来自动保存和发送

# Cookie handling: log in once, then reuse the stored cookies.
import os
from http.cookiejar import CookieJar
from urllib import parse
from urllib import request

# An opener built around HTTPCookieProcessor(cookiejar): requests made through
# it share the same CookieJar.
cookiejar = CookieJar()
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)
# The simplest alternative is to write the cookie straight into the headers.
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}

# Credentials are read from the environment (BILI_USERNAME / BILI_PASSWORD)
# instead of being committed to the source file.
data = {
    'login-username': os.environ.get('BILI_USERNAME', ''),
    'login-passwd': os.environ.get('BILI_PASSWORD', ''),
}
url = 'https://passport.bilibili.com/login'

# The form body is a login submission, so it is sent as POST (it was GET).
# NOTE(review): whether this endpoint still accepts a plain form login is not
# verifiable from here.
req = request.Request(url, data=parse.urlencode(data).encode('utf-8'), headers=headers, method='POST')
# Cookies from the response are stored in cookiejar automatically.
opener.open(req)

# Visit the personal home page.
url1 = 'https://space.bilibili.com/141417102'

# Reuse the same opener, and open the Request object (not the bare URL) so
# the custom headers are actually sent.
req = request.Request(url1, headers=headers)
resp = opener.open(req)
# Save the page locally.
with open('a.html', 'w', encoding='utf-8') as fp:
    fp.write(resp.read().decode('utf-8'))

2.1 requests的使用 它是对urllib的再次封装,它们使用的主要区别:
requests可以直接构建常用的get和post请求并发起,urllib一般要先构建get或者post请求,然后再发起请求。

import requests

resp = requests.get("https://www.baidu.com/")

# resp.text is the response body as a str, decoded with resp.encoding.
# print(type(resp.text))
# print(resp.text)
# If the text looks garbled, decode the raw bytes yourself instead
# (a str has no .decode() method):
# print(resp.content.decode('utf-8'))

# resp.content is the response body as raw bytes.
# print(type(resp.content))
# print(resp.content)

print(resp.url)
# https://www.baidu.com/
print(resp.encoding)
# ISO-8859-1
print(resp.status_code)
# 200
print(resp.headers)
print(resp.cookies)
import requests

# Send custom request headers along with the search request.

# The query could also be described as a dict:
# params = {
#     'wd': '中国'
# }
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
search_url = "https://www.baidu.com/s?wd=%E4%B8%AD%E5%9B%BD&rsv_spt=1&rsv_iqid=0xc673aab900009518&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=56060048_11_pg&ch=6&rsv_enter=0&rsv_dl=tb&rsv_sug3=4&rsv_sug1=2&rsv_sug7=101&prefixsug=%25E4%25B8%25AD%25E5%259B%25BD%2520&rsp=1&inputT=5214&rsv_sug4=5933"
resp = requests.get(search_url, headers=headers)

# Save the page as UTF-8 text, then report the URL that was requested.
with open('a.html', 'w', encoding='utf-8') as fp:
    page_source = resp.content.decode('utf-8')
    fp.write(page_source)
    print(resp.url)
import requests

# Use a proxy IP. requests picks the proxy by the scheme of the target URL,
# so an https:// request needs an 'https' entry; with only 'http' configured
# the request below bypassed the proxy entirely.
# NOTE(review): assumes the proxy can tunnel HTTPS traffic; confirm.
proxy = {
    'http': 'http://47.100.124.14/',
    'https': 'http://47.100.124.14/',
}
resp = requests.get('https://www.baidu.com', proxies=proxy)
print(resp.content.decode('utf-8'))
import requests
resp = requests.get("https://www.baidu.com/")
# Show the cookies carried by the response, as a plain dict.
print(resp.cookies.get_dict())
# Create a Session object; nothing in this snippet uses it afterwards.
# NOTE(review): presumably meant to show that a Session keeps cookies across
# requests; confirm with the author.
session = requests.Session()
# Handling an untrusted SSL certificate.
# Only relevant when the site is served over https.
# The error shows up because the certificate presented is not a trusted one
# for that site.
# verify=False tells requests to skip certificate verification for this call.
import requests
resp = requests.post("https://www.baidu.com/", verify=False)

3.1 XPath

XPath语法

 补充 :text()是提取该标签下的文本

 图片来自于https://www.w3school.com.cn/xpath/xpath_syntax.asp

import requests
from lxml import html

etree = html.etree
parser = etree.HTMLParser(encoding='utf-8')
# The parsed document gets its own name so the imported `html` module is not
# overwritten.
tree = etree.parse('a.html', parser=parser)
# xpath() returns a list of matching elements.
divs = tree.xpath('//div')
for div in divs:
    # tostring() yields bytes; decode them to print readable markup.
    print(etree.tostring(div, encoding='utf-8').decode('utf-8'))
import requests
from lxml import html
from urllib.parse import urljoin

etree = html.etree
url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
   'Referer': 'https://movie.douban.com/'
}
resp = requests.get(url, headers=headers)
# The body is decoded as GBK rather than UTF-8.
text = resp.content.decode('gbk')
# print(text)
# Separate names for the parsed page and the loop variable keep the imported
# `html` module and the page `url` from being overwritten.
page = etree.HTML(text)
# Inside [] an @attr='value' test selects elements with that attribute value;
# after a / an @attr step returns the attribute's value.
hrefs = page.xpath("//table[@class='tbspan']//a/@href")
# To collect the links instead of printing them:
# full_urls = [urljoin('https://www.dytt8.net', href) for href in hrefs]
for href in hrefs:
    # urljoin resolves the href against the site root; for root-relative
    # links the result is the same as plain concatenation.
    print(urljoin('https://www.dytt8.net', href))
import requests
from lxml import html

etree = html.etree

# Target page and request headers.
url = 'https://movie.douban.com/cinema/nowplaying/langfang/'
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
   'Referer': 'https://movie.douban.com/'
}
resp = requests.get(url, headers=headers)
# resp.text is the decoded str, resp.content the raw bytes.
# print(resp.content.decode('utf-8'))
# print(resp.text)

# Turn the markup into an element tree: <class 'lxml.etree._Element'>
html = etree.HTML(resp.text)
print(type(html))

# First <ul class="lists"> on the page, and the <li> items directly under it.
ul = html.xpath("//ul[@class='lists']")[0]
# print(ul)
# print(etree.tostring(ul,encoding='utf-8').decode('utf-8')
lis = ul.xpath("./li")
# print(etree.tostring(li,encoding='utf-8').decode('utf-8'))

# Title links inside the element whose id is "upcoming".
upcoming_links = html.xpath("//div[@id='upcoming']//a[@data-psource='title']")
for link in upcoming_links:
    # xpath('@attr') returns a list, so each print shows a list.
    print(link.xpath('@href'))
    print(link.xpath('@title'))

# Alternative: walk the <li> items collected above.
# for li in lis:
#     # value of the data-title attribute
#     name = li.xpath("@data-title")
#     print(name)
#     # image link
#     img = li.xpath(".//img/@src")
#     print(img)

3.2 正则表达式

import re
import requests


def parse_page(url):
    """Download *url* and print the titles found in its content blocks.

    A title is the text of the first ``<b>`` element that follows each
    ``<div class="cont">`` opening tag. The list of titles is printed;
    nothing is returned.

    Args:
        url: Address of the page to download.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put them in the query string.
    resp = requests.get(url, headers=headers)
    text = resp.text
    # \s matches the whitespace between "div" and "class"; re.DOTALL lets "."
    # match newlines too, so a title may sit lines below its <div>.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    print(titles)


def main():
    """Run ``parse_page`` on listing pages 1 through 9, in order."""
    # Single-page form of the address:
    # url ='https://www.gushiwen.org/default_1.aspx'
    page_urls = ('https://www.gushiwen.org/default_%s.aspx' % page_no
                 for page_no in range(1, 10))
    for page_url in page_urls:
        parse_page(page_url)


# Call main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/gonT-iL-evoL-I/p/12258171.html