python爬虫相关安装与应用

1、mysql数据库用于存储大量数据。

2、Navicat for MySQL以图形和表格等形式管理数据库工具。

3、编程语言python3与环境配置

4、PyCharm集成开发环境(社区版)不需要激活

5、Python包管理器Anaconda3(爬虫主要用到两个包requests,pymysql)与环境配置(网上可找安装教程).

链接:https://pan.baidu.com/s/1Zef6oPmtNZ4sWBXyAMBSgA
提取码:am9q

应用:

1、正则表达式提取猫眼top100电影中的电影名称、主演和上映时间

import pymysql
import requests
import re

def get_text(url):
    """Fetch *url* and return the response body as decoded text.

    Raises requests.HTTPError for non-2xx status codes and
    requests.Timeout if the server does not answer in time.
    """
    # BUG FIX: without a timeout a dead server hangs the crawl forever.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    # Prefer the encoding sniffed from the body over the (often
    # missing/wrong) HTTP header charset.
    r.encoding = r.apparent_encoding
    return r.text

def parse_html(url, list):
    """Scrape one Maoyan Top-100 page; append (title, stars, date) tuples.

    The parameter name ``list`` shadows the builtin; it is kept only for
    backward compatibility with existing callers.
    """
    demo = get_text(url)
    # BUG FIX: the original pattern lost its backslashes ("s*?", "d{4}")
    # and matched the literal letters "s"/"d" instead of whitespace and
    # digits, so findall never matched anything. Raw string restores it.
    pattern = re.compile(
        r'class="name".*?title="(.*?)".*?:(.*?)\s*?</p>.*?:(\d{4}-\d{2}-\d{2})',
        re.S)
    list.extend(re.findall(pattern, demo))
    return list

# Crawl all 10 pages of the Maoyan Top-100 board (10 entries per page).
# Renamed from "list" to avoid shadowing the builtin.
movies = []
for page in range(10):
    page_url = 'https://maoyan.com/board/4?offset=' + str(10 * page)
    movies = parse_html(page_url, movies)

for movie in movies:
    print(movie)
# len() replaces the manual counter loop.
print("一共有" + str(len(movies)) + "条数据!")

2、正则表达式提取西南大学讲座信息

import requests
import pymysql
import re
import os

def get_text(url):
    """Download *url* and return its body decoded with the sniffed encoding.

    Raises requests.HTTPError on bad status and requests.Timeout on a
    slow/unresponsive server.
    """
    # BUG FIX: add a timeout so the crawler cannot block indefinitely.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    # apparent_encoding is derived from the content itself.
    r.encoding = r.apparent_encoding
    return r.text


def parse_html(url, list):
    """Scrape one lecture-list page; append (date, title) tuples to *list*.

    ``list`` shadows the builtin but is kept for caller compatibility.
    """
    demo = get_text(url)
    # BUG FIX: "[(.*?)]" was a character class in the original; the
    # square brackets in the page markup must be escaped (\[ ... \])
    # to be matched literally.
    pattern = re.compile(
        r'<li><span class="fr">\[(.*?)\].*?&nbsp;&nbsp;(.*?)</a>', re.S)
    list.extend(re.findall(pattern, demo))
    return list


# Crawl the lecture index page plus pages 2-4.
# Renamed from "list" to avoid shadowing the builtin.
lectures = []
lectures = parse_html(
    'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', lectures)
for page in range(2, 5):
    page_url = ('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_'
                + str(page) + '.html')
    lectures = parse_html(page_url, lectures)

for lecture in lectures:
    print(lecture)
# len() replaces the manual counter loop.
print("一共有" + str(len(lectures)) + "条数据!")

3、爬取图片

import requests
import pymysql
import re
import os

def get_text(url):
    """Return the decoded body of *url*.

    Raises requests.HTTPError for non-2xx responses and
    requests.Timeout when the server stalls.
    """
    # BUG FIX: a request without a timeout can hang the script forever.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    # Decode using the encoding detected from the payload.
    r.encoding = r.apparent_encoding
    return r.text


def parse_html(url, list):
    """Scrape one lecture-list page; append (date, title) tuples to *list*.

    ``list`` shadows the builtin; the name is kept so keyword callers
    keep working.
    """
    demo = get_text(url)
    # BUG FIX: escape the literal square brackets — unescaped "[(.*?)]"
    # is parsed as a character class and never matches the markup.
    pattern = re.compile(
        r'<li><span class="fr">\[(.*?)\].*?&nbsp;&nbsp;(.*?)</a>', re.S)
    list.extend(re.findall(pattern, demo))
    return list


# Crawl the lecture index page plus pages 2-4.
# Renamed from "list" to avoid shadowing the builtin.
results = []
results = parse_html(
    'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', results)
for page in range(2, 5):
    page_url = ('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_'
                + str(page) + '.html')
    results = parse_html(page_url, results)

for item in results:
    print(item)
# len() replaces the manual counter loop.
print("一共有" + str(len(results)) + "条数据!")
import pymysql
import requests
from hashlib import md5
import re
import os

# db = pymysql.connect('localhost', 'root', '1458555801', 'world')
# print("数据库连接成功!")
# print("---------------------------------------------------")
# r = requests.get("https://python123.io/ws/demo.html")
# print(r.text)

# r = requests.get("https://python123.io/ws/demo.html")
# print(r)
# # 提取网页文本内容
# print(r.text)
# # 提取网页编码方式
# print(r.encoding)
# print(r.apparent_encoding)
# r.encoding = r.apparent_encoding
# # 打印状态码
# print(r.status_code)
# # 捕获异常
# print(r.raise_for_status())

def get_text(url):
    """Fetch *url* and return its text, decoded with the sniffed charset.

    Raises requests.HTTPError on a bad status code and requests.Timeout
    if the host does not respond in time.
    """
    # BUG FIX: always set a timeout on network calls.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    # apparent_encoding inspects the body, which beats trusting headers.
    r.encoding = r.apparent_encoding
    return r.text

# print(get_text('https://python123.io/ws/demo.html'))

# demo = get_text('https://python123.io/ws/demo.html')
# result = re.search('Th.*?ge', demo)
# print(result)
# print(result.group())
# result2 = re.search('http.*?001', demo)
# print(result2.group())
# result3 = re.findall('<p.*?</p>', demo, re.S)
# print(result3)

def parse_html(url, list):
    """Scrape one lecture-list page; append (date, title) tuples to *list*.

    ``list`` shadows the builtin; kept for backward compatibility.
    """
    demo = get_text(url)
    # Compile the pattern once so it can be reused across calls.
    # ".*?"                  : non-greedy match of anything (re.S spans newlines)
    # (\d{4}-\d{2}-\d{2})    : capture an ISO-style date
    # BUG FIX: restored the backslashes the original lost —
    # "<spansclass" should be "<span\sclass" and "d{4}" should be
    # "\d{4}"; without them the pattern can never match.
    pattern = re.compile(
        r'<li><span\sclass="fr">\[(\d{4}-\d{2}-\d{2})\].*?&nbsp;&nbsp;(.*?)</a></li>',
        re.S)
    list.extend(re.findall(pattern, demo))
    return list

# Crawl the lecture index page plus pages 2-4.
# Renamed from "list" to avoid shadowing the builtin.
entries = []
entries = parse_html(
    'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', entries)
for page in range(2, 5):
    # e.g. http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_2.html
    page_url = ('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_'
                + str(page) + '.html')
    entries = parse_html(page_url, entries)

for entry in entries:
    print(entry)
# len() replaces the manual counter loop.
print("一共有" + str(len(entries)) + "条数据!")

# def download_image(url):
#     r = requests.get(url)
#     r.raise_for_status()
#     save_image(r.content)
#
# def save_image(content):
#     file_path = '{0}/{1}.{2}'.format('C:/Users/Think/Desktop/image', md5(content).hexdigest(), 'jpg')
#     if not os.path.exists(file_path):
#         with open(file_path, 'wb') as f:
#             f.write(content)
#             f.close()

# for i in list:
#     download_image(i)
# print("下载成功")
原文地址:https://www.cnblogs.com/nonames/p/11144193.html