Python Web Scraping Basics: Study Notes

Mirror sources (to speed up module downloads)

Commonly used mirrors in China:
Aliyun http://mirrors.aliyun.com/pypi/simple/

Douban http://pypi.douban.com/simple/

Tsinghua University https://pypi.tuna.tsinghua.edu.cn/simple/

University of Science and Technology of China (USTC) https://pypi.mirrors.ustc.edu.cn/simple/

Example: install lxml through the Tsinghua mirror:

pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
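
To avoid passing -i on every install, pip can also be pointed at a mirror permanently; a minimal sketch using pip's own config command (it writes the setting to pip's config file):

# Make the Tsinghua mirror the default index for all future installs
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/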

1. Scraping basics (Baidu Tieba example)

Basic crawler operations.
A small Baidu Tieba example (using urllib.request and Request).
# Open URLs and build requests
from urllib.request import urlopen, Request
# Encode a dict into URL query parameters
from urllib.parse import urlencode
# Random User-Agent (helps avoid getting the IP banned)
from fake_useragent import UserAgent


# Fetch the HTML
def get_html(url):
    headers = {
        'User-Agent': UserAgent().chrome
    }
    request = Request(url, headers=headers)
    response = urlopen(request)
    # print(response.read().decode())
    return response.read()


# Save the HTML to a local file
def save_html(filename, html_bytes):
    with open(filename, 'wb') as f:
        print('Saving ' + filename)
        f.write(html_bytes)


def main():
    context = input('Enter the keyword to download: ')
    num = input('Enter the number of pages to download: ')
    base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'
    for pn in range(int(num)):
        args = {
            'pn': pn * 50,
            'kw': context
        }
        args = urlencode(args)
        # print(args)
        # print(base_url.format(args))
        filename = 'page_' + str(pn + 1) + '.html'
        print('Downloading ' + filename)
        html_bytes = get_html(base_url.format(args))
        save_html(filename, html_bytes)
        print(filename + ' downloaded')


if __name__ == '__main__':
    main()

2. Sending POST requests

Send a POST request to a site and pass the form parameters.
from urllib.request import urlopen, Request
from urllib.parse import urlencode
from fake_useragent import UserAgent

url = 'http://www.zengqiang.club/admin/login'

form_data = {
    'username': '曾强',
    'password': 'ZQZ981004'
}
# print(urlencode(form_data))
headers = {
    'User-Agent': UserAgent().random
}
# print(headers)

f_data = urlencode(form_data)
request = Request(url, data=f_data.encode(), headers=headers)
response = urlopen(request)
print(response.read().decode())
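
A note on the code above: urlopen sends a GET unless data is supplied, and supplying data switches the request to POST. As a small sketch reusing url, f_data and headers from the script above, the method can also be stated explicitly (Request accepts a method argument on Python 3.3+):

# Same request with the HTTP verb spelled out explicitly
request = Request(url, data=f_data.encode(), headers=headers, method='POST')
response = urlopen(request)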

3. Scraping AJAX requests (Douban movie chart data)

For AJAX data, inspect the request URLs under the browser's Network tab,
work out what each URL parameter means and how it varies,
then request those URLs in a loop to collect the data.
from urllib.request import Request, urlopen
from fake_useragent import UserAgent

base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start={}&limit=20'

i = 0
while True:
    headers = {
        'User-Agent': UserAgent().random
    }
    # Fill the {} placeholder to build the full url (limit=20 per page, so step by 20)
    url = base_url.format(i * 20)
    request = Request(url, headers=headers)
    response = urlopen(request)
    info = response.read().decode()
    if len(info) < 10:
        break
    print(info)
    i += 1

4. Using a proxy

Use a proxy so that heavy crawling does not get your own IP banned;
requests then reach the site from a different IP.
from urllib.request import Request, build_opener
from urllib.request import ProxyHandler
from fake_useragent import UserAgent

url = 'http://httpbin.org/get'

headers = {
    'User-Agent': UserAgent().chrome
}

request = Request(url, headers=headers)

# Two options: (1) a paid proxy with credentials, (2) a free proxy found online
# handler = ProxyHandler({'http':'username:password@ip:port'})
# handler = ProxyHandler({'http':'ip:port'})
handler = ProxyHandler({'http': '39.137.107.98:80'})
# Wrap the handler in a custom opener
opener = build_opener(handler)
# Send the request through the custom opener
response = opener.open(request)
print(response.read().decode())
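
ProxyHandler only routes the URL schemes named in its dict, so the http-only mapping above will not proxy https pages. A hedged sketch with both schemes (the ip:port values are placeholders, not a working proxy):

handler = ProxyHandler({
    'http': 'http://ip:port',    # placeholder, replace with a live proxy
    'https': 'http://ip:port',   # same placeholder for https traffic
})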

5. Using cookies

Use cookies to access pages that require a login.
Two approaches: use the cookie directly in memory, or save it to a file and load it later.
from urllib.request import Request, HTTPCookieProcessor, build_opener
from urllib.parse import urlencode
from fake_useragent import UserAgent

# Log in
login_url = 'http://www.zengqiang.club/admin/login'

headers = {
    'User-Agent': UserAgent().chrome
}

form_date = {
    'username': '曾强',
    'password': 'ZQZ981004'
}
f_date = urlencode(form_date).encode()

request = Request(login_url, headers=headers, data=f_date)
handler = HTTPCookieProcessor()
opener = build_opener(handler)
opener.open(request)

# Logged in; now visit a page that requires the session
url = 'http://www.zengqiang.club/admin/blogs'

request = Request(url, headers=headers)
response = opener.open(request)
print(response.read().decode())

# Second approach: save the cookie to a file, then load it from disk for later requests
from urllib.request import Request, HTTPCookieProcessor, build_opener
from urllib.parse import urlencode
from fake_useragent import UserAgent
from http.cookiejar import MozillaCookieJar


# Log in and save the cookie to a file
def get_cookie():
    login_url = 'http://www.zengqiang.club/admin/login'
    headers = {
        'User-Agent': UserAgent().chrome
    }
    form_date = {
        'username': '曾强',
        'password': 'ZQZ981004'
    }
    f_date = urlencode(form_date).encode()
    request = Request(login_url, headers=headers, data=f_date)
    cookie_jar = MozillaCookieJar()
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    opener.open(request)
    cookie_jar.save('cookie.txt', ignore_expires=True, ignore_discard=True)


# Load the cookie from the file and visit the protected page
def use_cookie():
    url = 'http://www.zengqiang.club/admin/blogs'
    headers = {
        'User-Agent': UserAgent().chrome
    }
    request = Request(url, headers=headers)
    cookie_jar = MozillaCookieJar()
    cookie_jar.load('cookie.txt',ignore_expires=True,ignore_discard=True)
    handler = HTTPCookieProcessor(cookie_jar)
    opener = build_opener(handler)
    response = opener.open(request)
    print(response.read().decode())
if __name__ == '__main__':
    get_cookie()
    use_cookie()

6. Handling URLError

Exception handling with try/except.
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from urllib.error import URLError

url = 'http://www.zengqiang.club/1.html'

headers = {
    'User-Agent': UserAgent().random
}

try:
    request = Request(url, headers=headers)

    response = urlopen(request)

    print(response.read().decode())
except URLError as e:
    if e.args == ():
        print(e.code)
    else:
        print(e.args[0].errno)
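
A slightly cleaner variant, as a sketch reusing url and headers from above: HTTPError is a subclass of URLError that carries the status code, so catching it first avoids poking at e.args:

from urllib.error import URLError, HTTPError

try:
    response = urlopen(Request(url, headers=headers))
    print(response.read().decode())
except HTTPError as e:
    # The server responded with an error status (404, 500, ...)
    print(e.code)
except URLError as e:
    # The server could not be reached at all (DNS failure, connection refused, ...)
    print(e.reason)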

7. Using requests

requests is more convenient than urllib and takes less code.
import requests
from fake_useragent import UserAgent

# GET request
# url = 'https://www.baidu.com/s'
# headers = {
#     'User-Agent': UserAgent().chrome
# }
# params = {
#     'wd': '重庆文理学院'
# }
# response = requests.get(url, headers=headers, params=params)
# response.encoding = 'utf-8'
#
# print(response.url)

# POST request
url = 'http://www.zengqiang.club/admin/login'
form_data = {
    'username': '曾强',
    'password': 'ZQZ981004'
}
headers = {
    'User-Agent': UserAgent().random
}
response = requests.post(url, data=form_data, headers=headers)

print(response.text)

8. Using re (regular expressions)

Memorize the common patterns (a quick reference follows the example below).

import re

str = 'I love you6.6 forever'
print('------- match(): match from the start of the string -------')
m1 = re.match(r'I', str)
m2 = re.match(r'\w', str)
m3 = re.match(r'.', str)
m4 = re.match(r'\D', str)
m5 = re.match(r'\S', str)
m6 = re.match(r'i', str, re.I)
print(m6.group())

print('------- search(): scan the whole string and return the first match -------')
s1 = re.search(r'love', str)
s2 = re.search(r'l\w+', str)
s3 = re.search(r'y\w+.\d', str)
print(s3.group())

print('------- findall(): find all matches -------')
f1 = re.findall(r'o', str)
print(f1)

print('-------- practice --------')
str1 = '<div><a href="http://www.python.com">python官网</a></div>'
t1 = re.findall(r'p\w+[\u4e00-\u9fa5]', str1)
t2 = re.findall(r'<a href="http://www.python.com">(.+)</a>', str1)
t3 = re.findall(r'<a href="(.+)">', str1)
print(t3)

print('------- sub(): replace substrings -------')
su1 = re.sub(r'<div>(.+)</div>', r'<span>\1</span>', str1)
print(su1)
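
As a quick reference for the patterns worth memorizing (a sketch, not an exhaustive list):

import re

# \d digit        \D non-digit
# \w word char    \W non-word char
# \s whitespace   \S non-whitespace
# .  any char     +  one or more    *  zero or more    ?  non-greedy / optional
# (...) capture group, referenced as \1, \2 in re.sub replacements
pattern = re.compile(r'href="(.+?)"')   # compile once, reuse many times
print(pattern.findall('<a href="/blog/1"></a><a href="/blog/2"></a>'))
# ['/blog/1', '/blog/2']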

9. Using re to scrape the blog titles from this site's home page

import requests
from fake_useragent import UserAgent
import re


url = 'http://www.zengqiang.club/'

headers = {
    'User-Agent': UserAgent().random
}

response = requests.get(url, headers=headers)

# print(response.text)
info = response.text
result = re.findall(r'<a href="/blog/\d+" target="_blank" class="m-black m-text-thin">(.+)</a>', info)
print(result)

10. Using bs4 (BeautifulSoup)

Makes it easy to extract the content we need from HTML.
from bs4 import BeautifulSoup
from bs4.element import Comment

# The lxml parser must be installed (see the mirror list at the top):
# pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
str = '''
<title>尚学堂</title>
<div class='info' float='left'>Welcome to SXT</div>
<div class='info' float='right'>
    <span>Good Good Study</span>
    <a href='www.bjsxt.cn'></a>
    <strong><!--没用--></strong>
</div>
'''

soup = BeautifulSoup(str, 'lxml')
print(soup.title)
print(soup.div)
print(soup.div.attrs)
print(soup.div.get('class'))
print(soup.div.get('float'))
print(soup.a['href'])
print(soup.div.string)
print(soup.div.text)
print(soup.strong.string)
print(type(soup.strong.string))

if type(soup.strong.string) == Comment:
    print(soup.strong.string)
    print(soup.strong.prettify())
else:
    print(soup.strong.text)

str1 = '''
<title id="title">尚学堂</title>
<div class='info' id="info" float='left'>Welcome to SXT</div>
<div class='info' float='right'>
    <span>Good Good Study</span>
    <a href='www.bjsxt.cn'></a>
    <strong><!--没用--></strong>
</div>
'''

print('------------find_all()-------------')
soup1 = BeautifulSoup(str1, 'lxml')
print(soup1.find_all('title'))
print(soup1.find_all(id='title'))
print(soup1.find_all(class_='info'))  # class is a Python keyword, hence the trailing underscore
print(soup1.find_all(attrs={'float': 'left'}))

print('------------select() css选择器-------------')
print(soup1.select('title'))
print(soup1.select('#title'))
print(soup1.select('.info'))
print(soup1.select('div > span'))  # spaces are required around the > combinator
print(soup1.select('div span'))
print(soup1.select('div'))
print(soup1.select('div')[1])
print(soup1.select('div')[1].select('span'))
print(soup1.select('title')[0].text)

11. Using XPath

XPath is used to pull the desired content out of HTML.
Scrape book titles and authors from Qidian's monthly ticket ranking.
from lxml import html
import requests
from fake_useragent import UserAgent

url = "https://www.qidian.com/rank/yuepiao?chn=21"
headers = {
    'User-Agent': UserAgent().random
}
response = requests.get(url, headers=headers)

etree = html.etree

e = etree.HTML(response.text)

names = e.xpath('//h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')

# for num in range(len(names)):
#     print(names[num], ":", authors[num])

for name, author in zip(names, authors):
    print(name, ":", author)

# print(names)
# print(authors)

12. Using pyquery

pyquery is used to pull the desired content out of HTML.
Scrape proxy IP data from xicidaili.com.
from pyquery import PyQuery as pq
import requests
from fake_useragent import UserAgent

url = 'https://www.xicidaili.com/nn/'

headers = {
    'User-Agent': UserAgent().chrome
}

response = requests.get(url, headers=headers)

doc = pq(response.text)

strs = doc('#ip_list tr')

for num in range(1, len(strs)):
    ip = strs.eq(num).find('td').eq(1).text()
    port = strs.eq(num).find('td').eq(2).text()
    type = strs.eq(num).find('td').eq(5).text()
    print(ip, ":", port, "----", type)

13. Using json

Mainly conversion between JSON and strings.
import json

str = '{"name":"我的小世界"}'
print(type(str))

# Parse the string into a Python object
obj = json.loads(str)
print(type(obj), ":", obj)

# Serialize the object back to a string
str1 = json.dumps(obj, ensure_ascii=False)
print(type(str1), ":", str1)

# Save the object to a file as JSON
json.dump(obj, open('json.txt', 'w', encoding='utf-8'), ensure_ascii=False)

# Load the data back from the file
str2 = json.load(open('json.txt', encoding='utf-8'))
print(str2)

14. Using jsonpath

Extract the fields we need from JSON data.
from jsonpath import jsonpath
import requests
from fake_useragent import UserAgent
import json

# Online JSON viewer: https://www.json.cn/
url = 'https://www.lagou.com/lbs/getAllCitySearchLabels.json'

headers = {
    'User-Agent': UserAgent().chrome
}

response = requests.get(url, headers=headers)
# Two ways to turn the response into a JSON object
city_names = jsonpath(json.loads(response.text), '$..name')
city_codes = jsonpath(response.json(), '$..code')

for city_name, city_code in zip(city_names, city_codes):
    print(city_name, ":", city_code)

15. Using multiple threads

Multithreading is mainly about improving crawl throughput.
Scrape jokes from duanziwang.com (object-oriented style).
from threading import Thread
from fake_useragent import UserAgent
import requests
from lxml import html
from queue import Queue


# Thread class that downloads pages
class Spider_html(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        headers = {
            'User-Agent': UserAgent().random
        }
        while self.url_queue.empty() == False:
            url = self.url_queue.get()
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)


# Thread class that parses the downloaded pages
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        etree = html.etree
        while self.html_queue.empty() == False:
            e = etree.HTML(self.html_queue.get())
            contents = e.xpath('//div[@class="post-content"]/p/text()')
            # print(contents)
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for content in contents:
                    info = content
                    # One joke per line keeps the output readable
                    f.write(info + '\n')

if __name__ == '__main__':
    # Queue of urls to crawl
    url_queue = Queue()
    # Queue of downloaded html
    html_queue = Queue()
    base_url = 'https://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}'
    for i in range(1, 11):
        new_url = base_url.format(i)
        url_queue.put(new_url)
        # print(new_url)

    # Download the pages
    spider_html_list = []
    # Start three downloader threads
    for i in range(0, 3):
        spider1 = Spider_html(url_queue, html_queue)
        spider_html_list.append(spider1)
        spider1.start()

    for spider_html in spider_html_list:
        spider_html.join()

    # Parse the pages and extract what we need
    parse_list = []
    for i in range(0, 3):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
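
One caveat with the empty()/get() pattern above: with several threads, the queue can be drained between the empty() check and the get() call. A small hedged sketch of a safer accessor (safe_get is an illustrative helper, not part of the original code):

from queue import Queue, Empty

def safe_get(q: Queue, timeout=3):
    # Return None instead of blocking forever once the queue is exhausted
    try:
        return q.get(timeout=timeout)
    except Empty:
        return None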

16. Using Yundama (captcha solving)

Yundama is used to solve the captcha entered at login.
It requires a registered account and a small fee.
http://www.yundama.com/
# Captcha-solving helper class
import http.client, mimetypes, urllib, json, time, requests
from PIL import Image

######################################################################

class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid,
                'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb');
        res = requests.post(url, files=files, data=fields)
        return res.text


######################################################################
def get_code(filename):
    # Username
    username = 'zq666_yh'

    # Password
    password = 'ZQZ981004'

    # Software ID, required for the developer revenue share; obtained from [My Software] in the developer console
    appid = 10039

    # Software key, required for the developer revenue share; obtained from [My Software] in the developer console
    appkey = 'f6248169a3f9857b57e778c52d9f5de2'

    # Image file
    filename = filename

    # Captcha type, e.g. 1004 means 4 alphanumeric characters; pricing differs by type, so fill this in accurately or recognition suffers. All types: http://www.yundama.com/price.html
    codetype = 1005

    # Timeout in seconds
    timeout = 60

    # Sanity check
    if (username == 'username'):
        print('Set the parameters before testing')
    else:
        # Initialize
        yundama = YDMHttp(username, password, appid, appkey)

        # Log in to Yundama
        uid = yundama.login()
        # print('uid: %s' % uid)

        # Check the balance
        balance = yundama.balance()
        # print('balance: %s' % balance)

        # Start recognition: image path, captcha type ID, timeout (seconds); returns the cid and the result
        cid, result = yundama.decode(filename, codetype, timeout)
        # print('cid: %s, result: %s' % (cid, result))
        return result
    ######################################################################


if __name__ == '__main__':
    img = 'yzm1.jpg'
    code = get_code(img)
    print(code)

# Usage of the helper above
import requests
from fake_useragent import UserAgent
from 爬虫学习.ydm_util import get_code


def get_image():
    img_url = 'http://www.yundama.com/index/captcha'
    response = session.get(img_url, headers=headers)
    with open('yzm.jpg', 'wb')as f:
        f.write(response.content)
    code = get_code('yzm.jpg')
    print(code)
    return code


def do_login(code):
    login_url = 'http://www.yundama.com/index/login?'
    f_data = {
        'username': 'zq666_yh',
        'password': 'ZQZ981004',
        'utype': '1',
        'vcode': code
    }
    response = session.get(login_url, headers=headers, params=f_data)
    print(response.text)


# All three steps must run in the same session
if __name__ == '__main__':
    session = requests.Session()
    index_url = 'http://www.yundama.com/'
    headers = {
        'User-Agent': UserAgent().random
    }
    response = session.get(index_url, headers=headers)
    code = get_image()
    do_login(code)


17. Using Selenium

Selenium is mainly used to open a real browser for testing and to drive it through a series of actions.
A browser driver such as chromedriver.exe must first be placed in the Scripts folder of the Python installation (or anywhere on the PATH).
from selenium import webdriver

chrome = webdriver.Chrome()

chrome.get('http://www.zengqiang.club')

# chrome.save_screenshot('zqclub.jpg')

# html = chrome.page_source
# print(html)

id_content = chrome.find_element_by_id('run_time').text
print(id_content)

chrome.find_element_by_name('query').send_keys('爬虫')
chrome.find_element_by_class_name('search').click()
chrome.save_screenshot('爬虫.jpg')
print(chrome.current_url)

# Get the current page's cookies (before closing the window)
print(chrome.get_cookies())

# Close the current tab
chrome.close()

chrome.quit()
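
The find_element_by_* helpers used above were removed in Selenium 4; if a newer Selenium is installed, the equivalent calls look roughly like this sketch:

from selenium import webdriver
from selenium.webdriver.common.by import By

chrome = webdriver.Chrome()
chrome.get('http://www.zengqiang.club')
print(chrome.find_element(By.ID, 'run_time').text)
chrome.find_element(By.NAME, 'query').send_keys('爬虫')
chrome.find_element(By.CLASS_NAME, 'search').click()
chrome.quit()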

18. Exercise: scraping 360kan movie information

Practice the four extraction approaches: xpath, re, bs4 and pyquery.
import requests
from fake_useragent import UserAgent
from lxml import html
from random import randint
from time import sleep
from bs4 import BeautifulSoup
import re
from pyquery import PyQuery


# Fetch the page HTML
def get_html(url):
    headers = {
        'User-Agent': UserAgent().random
    }
    # Pause 2-6 seconds so the requests look more human
    sleep(randint(2, 6))
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        return response.text
    else:
        return None


# Parse the index page (collect the urls of the movie detail pages)
def parse_index(index_html):
    # ------pq--------
    doc = PyQuery(index_html)
    moive_a = doc('ul.list.g-clear a')
    moive_urls = []
    for a in moive_a:
        moive_urls.append(a.attrib['href'])

    # ------re--------
    # moive_urls = re.findall(r'<a class="js-tongjic" href="(.+)">', index_html)

    # ------bs4--------
    # soup = BeautifulSoup(index_html, 'lxml')
    # moive_a = soup.select('ul.list.g-clear a')
    # # print(moive_a)
    # moive_urls = []
    # for a in moive_a:
    #     moive_urls.append(a['href'])

    # ------xpath--------
    # etree = html.etree
    # e = etree.HTML(index_html)
    # moive_urls = e.xpath('//ul[@class="list g-clear"]//a/@href')

    return ['https://www.360kan.com{}'.format(url) for url in moive_urls]


# Parse a movie detail page and extract the fields we need
def parse_info(movie_html):
    # ------pq--------
    doc = PyQuery(movie_html)
    name = doc('h1').text()
    types = doc('p.item > a.cat').text()
    actors = doc('p.item.item-actor > a').text()

    # ------re--------
    # name = re.findall(r'<h1>(.+)</h1>', movie_html)[0]
    # types = re.findall(r'class="cat.+href.+">(.+)</', movie_html)
    # actors = re.findall(r'<a class="name" href=".+">(.+)</a>', movie_html)

    # ------bs4--------
    # soup = BeautifulSoup(movie_html, 'lxml')
    # name = soup.select('h1')[0].text
    # type = soup.select('p.item')[0].select('a')
    # types = []
    # for t in type:
    #     types.append(t.text)
    # actor = soup.select('p.item.item-actor')[0].select('a')
    # actors = []
    # for a in actor:
    #     actors.append(a.text)

    # ------xpath--------
    # etree = html.etree
    # e = etree.HTML(movie_html)
    # name = e.xpath('//h1/text()')[0]
    # types = e.xpath('//p[@class="item"][1]/a/text()')
    # actors = e.xpath('//p[@class="item item-actor"]/a/text()')
    return {
        'name': name,
        'types': types,
        'actors': actors
    }


# Main: walk through the movie urls and print the scraped data
def main():
    index_url = 'https://www.360kan.com/dianying/list.php?year=all&area=all&act=all&cat=all'
    index_html = get_html(index_url)
    moive_urls = parse_index(index_html)
    print(moive_urls)
    for url in moive_urls:
        moive_html = get_html(url)
        moive = parse_info(moive_html)
        print(moive)


if __name__ == '__main__':
    main()

19. Exercise: scraping the streamers currently live on Huya

Scraped with Selenium.
from selenium import webdriver
from time import sleep

driver = webdriver.Chrome()

url = 'https://www.huya.com/g/2356'

driver.get(url)

num = 1
while True:
    print('Page', str(num), '------------')
    num += 1
    sleep(5)
    html = driver.page_source
    titles = driver.find_elements_by_xpath('//a[@class="title new-clickstat j_live-card"]')
    anthors = driver.find_elements_by_xpath('//i[@class="nick"]')
    audiences = driver.find_elements_by_xpath('//i[@class="js-num"]')

    for title, anthor, audience in zip(titles, anthors, audiences):
        print(title.text, '---', anthor.text, '---', audience.text)
    if html.find('laypage_next') != -1:
        driver.find_element_by_xpath('//a[@class="laypage_next"]').click()
    else:
        break

20. Using Selenium to scroll the page

Some pages only show all of their content after scrolling.
Scrape product information from JD.
from selenium import webdriver
from time import sleep

url = 'https://search.jd.com/Search?keyword=iqoo&enc=utf-8&pvid=1c71f2514c724500b5c4e7f4dc58c1f2'

driver = webdriver.Chrome()
driver.get(url)

js = 'document.documentElement.scrollTop=100000'
driver.execute_script(js)

sleep(3)
html = driver.page_source

names = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//a/em')
prices = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//strong/i')
print(len(names))

for name, price in zip(names, prices):
    print(name.text, ':', price.text)
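
A single jump to the bottom can outrun JD's lazy loading, so fewer items render than expected; scrolling in steps gives the page time to load. A hedged sketch of that variant, reusing driver and sleep from above:

# Scroll down in several steps, pausing so lazily loaded items can render
for step in range(1, 6):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight * arguments[0] / 5)', step)
    sleep(1)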

21. Exercise: scraping every photo set on the Tuchong home page

import requests
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver


def get_group_urls():
    driver = webdriver.Chrome()
    index_url = 'https://tuchong.com/'
    driver.get(index_url)
    index_html = driver.page_source
    # print(index_html)
    etree = html.etree
    e = etree.HTML(index_html)
    group_urls = e.xpath('//div[@class="post-item"]/a[1]/@href')
    return group_urls


def get_group_html(group_urls):
    etree = html.etree
    headers = {'User-Agent': UserAgent().random}
    group_num = 1
    for url in group_urls:
        group_name = 'group' + str(group_num)
        group_num += 1
        response = requests.get(url, headers=headers)
        e = etree.HTML(response.text)
        # print(response.text)
        img_urls = e.xpath('//article[@class="post-content"]//img[@class="multi-photo-image"]/@src')
        print(img_urls)
        for img_url in img_urls:
            img_name = img_url[img_url.rfind('/') + 1:]
            save_img(img_url, group_name, img_name)


def save_img(img_url, group_name, img_name):
    headers = {'User-Agent': UserAgent().random}
    response = requests.get(img_url, headers=headers)
    with open('img/' + group_name + '-' + img_name, 'wb') as f:
        f.write(response.content)


def main():
    group_urls = get_group_urls()
    get_group_html(group_urls)


if __name__ == '__main__':
    main()

22. Double-color-ball lottery example (saving data to a database)

Connect to the database
and insert the scraped data.
import requests
from fake_useragent import UserAgent
from lxml import html
import pymysql


def get_html(url):
    headers = {
        'User-Agent': UserAgent().random
    }
    response = requests.get(url, headers=headers)
    return response.text


def save_mysql(trs, date_time):
    client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', charset='utf8',
                             db='python')
    print('Database connection established')
    cursor = client.cursor()
    sql = 'insert into double_ball values(0,%s,%s,%s)'

    for tr, time in zip(trs, date_time):
        # Extract the red balls
        red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
        # Extract the blue ball
        blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
        print("Draw " + time + " - red balls: " + red_ball + ", blue ball: " + blue_ball)
        cursor.execute(sql, [time, red_ball, blue_ball])
        client.commit()

    cursor.close()
    client.close()
    print('Data saved')


def main():
    url = 'https://datachart.500.com/ssq/'
    html_ = get_html(url)
    etree = html.etree
    e = etree.HTML(html_)
    data_time = e.xpath('//tbody[@id="tdata"]/tr/td[@align="center"]/text()')
    trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
    save_mysql(trs, data_time)


if __name__ == '__main__':
    main()
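
The insert statement above assumes a double_ball table with an auto-increment id plus three text columns. The column names below are assumptions for illustration, not taken from the original post; a minimal pymysql sketch that creates a compatible table:

import pymysql

client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004',
                         charset='utf8', db='python')
cursor = client.cursor()
# Column names are illustrative; only the column count and order matter for the insert above
cursor.execute('''
    CREATE TABLE IF NOT EXISTS double_ball (
        id INT PRIMARY KEY AUTO_INCREMENT,
        draw_no VARCHAR(20),
        red_ball VARCHAR(30),
        blue_ball VARCHAR(10)
    )
''')
client.commit()
cursor.close()
client.close()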

23. A cleaner way to write a crawler (structured, class-based)

Separate every responsibility into its own class.
The result is more professional and easier to maintain.
import requests
from fake_useragent import UserAgent
from lxml import etree


# URL manager
class URLManager(object):
    def __init__(self):
        self.new_url = []
        self.old_url = []

    # Get one url to crawl
    def get_new_url(self):
        url = self.new_url.pop()
        self.old_url.append(url)
        return url

    # Add one url
    def add_new_url(self, url):
        if url not in self.new_url and url and url not in self.old_url:
            self.new_url.append(url)

    # Add multiple urls
    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    # Check whether there are still urls to crawl
    def has_new_url(self):
        return self.get_new_url_size() > 0

    # Get the number of urls still to crawl
    def get_new_url_size(self):
        return len(self.new_url)

    # Get the number of urls already crawled
    def get_old_url_size(self):
        return len(self.old_url)


# Downloader: fetches a page
class Downloader:
    def download(self, url):
        response = requests.get(url, headers={"User-Agent": UserAgent().random})
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        else:
            return None


# Parser: extracts the data and the next-page urls
class Parser:
    def parse(self, html):
        e = etree.HTML(html)
        datas = self.parse_info(e)
        #datas = [span.xpath('string(.)') for span in e.xpath('//div[@class="content"]/span[1]')]
        urls = self.parse_urls(e)
        #urls = [ 'https://www.qiushibaike.com{}'.format(url) for url in e.xpath('//ul[@class="pagination"]/li/a/@href')]
        return datas, urls

    def parse_info(self, e):
        spans = e.xpath('//div[@class="content"]/span[1]')
        datas = []
        for span in spans:
            datas.append(span.xpath('string(.)'))
        return datas

    def parse_urls(self, e):
        base_url = 'https://www.qiushibaike.com{}'
        urls = []
        for url in e.xpath('//ul[@class="pagination"]/li/a/@href'):
            urls.append(base_url.format(url))
        return urls


# Data output: saves the extracted data
class DataOutPut:
    def save(self, datas):
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            for data in datas:
                f.write(data)


# Scheduler: wires the pieces together
class DiaoDu:
    def __init__(self):
        self.downloader = Downloader()
        self.url_manager = URLManager()
        self.parser = Parser()
        self.data_saver = DataOutPut()

    def run(self, url):
        self.url_manager.add_new_url(url)
        while self.url_manager.has_new_url():
            url = self.url_manager.get_new_url()
            html = self.downloader.download(url)
            data, urls = self.parser.parse(html)
            self.data_saver.save(data)
            self.url_manager.add_new_urls(urls)


if __name__ == '__main__':
    diao_du = DiaoDu()
    diao_du.run('https://www.qiushibaike.com/text/page/1/')

Resources

Link: https://pan.baidu.com/s/10e8PphvR7Um0-WPAylw8Yw

Extraction code: h8i8

Copyright notice: this is an original post by the author; please include a link to the original when reposting.
Original post: https://www.cnblogs.com/zq98/p/15027998.html