Python 基础语法+简单地爬取百度贴吧内容

Python笔记

1、Python3和Pycharm2018的安装

2、Python3基础语法

2.1.1、数据类型

2.1.1.1、数据类型：数字（整数和浮点数）

整数：int类型

浮点数：float类型。

2.1.1.2、数据类型：字符类型

2.1.1.3、数据类型：布尔类型

True：真

False：假

2.1.1.3、数据类型：列表(list)

>>> l =['aaa','bbb','ccc']
>>> l[0]
'aaa'
>>> l.append('ddd') # 追加元素
>>> l
['aaa', 'bbb', 'ccc', 'ddd']
>>> l.pop()   #数组尾部移除元素
'ddd'
>>> l
['aaa', 'bbb', 'ccc']
>>> l.pop(1)  #指定索引位置移除元素
'bbb'
>>> l
['aaa', 'ccc']
>>>

2.1.1.4、数据类型：元组(tuple)

元组和list类似，但是元组一旦创建就不能修改。

2.1.1.5、数据类型：字典(dictionary)

类似java的map

>>> m = {'name':'test1','age':18}
>>> m
{'name': 'test1', 'age': 18}
>>> m['name']
'test1'
>>> m['name']='test2'
>>> m
{'name': 'test2', 'age': 18}
>>> m[name]     # 注意：name没有加引号，被当作未定义的变量，所以这里报的是NameError；键不存在时（如 m['xxx']）报的是KeyError
Traceback (most recent call last):
  File "<pyshell#17>", line 1, in <module>
    m[name]
NameError: name 'name' is not defined
>>>

2.1.1.6、数据类型：集合(set)

set和dict类似，也是一组key的集合，但不存储value，key不能重复.

>>> s = set([1,2,3])
>>> s
{1, 2, 3}
>>> list = list(s)    # 将set类型转换为list类型（注意：变量名list会覆盖内置函数list，实际代码中应换一个变量名）
>>> list
[1, 2, 3]
>>> s.add(4)
>>> s
{1, 2, 3, 4}
>>> s.remove(2)
>>> s
{1, 3, 4}
>>>

2.1.1.7、数据类型：None

类似于java的null

2.1.2、变量

python是动态类型语言，变量的数据类型可以随赋值而改变；java是静态类型语言，变量的数据类型在声明后不能改变。

// java
int a=1;  //a的数据类型是int类型
a="abc";  //报错

# python
a=1       # 当前a的数据类型是整数
a=1.0     # 浮点型
a='abc'   # 字符型

常量

python习惯性以大写定义变量，认为是一个常量，但是python中没有办法去约束你不可以修改这个值。

变量：不可变和可变

python3里的6个基本数据类型：

不可变数据（3个）：数字、字符串类型、元组
可变数据（3个）：列表、字典、集合

>>> a='abc'
>>> b=a
>>> a='123'
>>> a
'123'
>>> b
'abc'

2.1.3、运算符

2.1.3.1、算术运算符

+、-、*、/、//（整除）、%（取余）、**（幂）

>>> 7/2
3.5
>>> 6/2
3.0
>>> 7//2
3
>>> 7%2
1
>>> 7.0/2   
3.5
>>> 7.0//2  # 整除结果的类型取决于两个操作数：只要有一个是浮点数，结果就是浮点数
3.0
>>> 5 ** 2  # 用** 表示计算幂乘方
25
>>> a='123'
>>> b = a*3  # 将a的值复制3次
>>> b
'123123123'
>>>

2.1.3.2、关系运算符

2.1.3.3、赋值运算符

2.1.3.4、逻辑运算符

and：且

or：或

not：取反

2.1.3.5、位运算符

2.1.3.6、成员运算符

>>> list = [1,2,3,4,5]
>>> a=2
>>> a in list    # 判断a是否在指定序列中
True
>>> b=10
>>> b in list
False
>>> b not in list    # 使用not in 判断是否不在指定的序列中
True

2.1.4、流程语句

2.1.4.1、条件判断

# Conditional example: the branches are tested from top to bottom and only
# the first branch whose condition is true is executed.
if 5 > 3:
    print(1)
elif 5 > 7:
    print(2)
# ... zero or more elif branches may be written
else:   # the else branch may be omitted as well
    pass  # placeholder: a branch body needs a statement, a comment alone is a syntax error

2.1.4.2、循环

>>> a =1
>>> while a<=10:
	print(a)
	a++              # python不支持
	
SyntaxError: invalid syntax
>>> while a<=10:
	print(a)
	a+=1	
1
2
3
4
5
6
7
8
9
10

>>> for n in range(5):
	print(n)

	
0
1
2
3
4
>>> for n in range(1,10):
	print(n)

	
1
2
3
4
5
6
7
8
9

3、爬百度贴吧

1、获取第一页的html

from urllib import request, parse

# 获取每一页的html
def loadPage(url):
    """Download the page at ``url`` and print its raw and decoded content.

    Args:
        url: Address of the page to fetch.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # 1. Build the request object
    req = request.Request(url)
    # 2. Open the connection; the ``with`` block closes the response even
    #    when reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # 3. Read the whole response body
        html = response.read()
    print(type(html))   # the body is bytes, not str
    print('html:', html)
    # 4. Decode the bytes into text
    content = html.decode('utf-8')
    print('content:', content)


if __name__ == '__main__':
    # Script entry point: fetch the first page of the forum and print it.
    loadPage('https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%8E%E6%AF%85')

2、将下载的内容写入到文件中

from urllib import request, parse

# 获取每一页的html
def loadPage(url):
    """Download the page at ``url`` and return its content as text.

    The raw bytes and the decoded text are also printed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # 1. Build the request object
    req = request.Request(url)
    # 2. Open the connection; the ``with`` block closes the response even
    #    when reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # 3. Read the whole response body
        html = response.read()
    print(type(html))   # the body is bytes, not str
    print('html:', html)
    # 4. Decode the bytes into text
    content = html.decode('utf-8')
    print('content:', content)
    return content

# 将下载的内容写入到文件中
def writeFile(html, filename):
    """Save ``html`` to ``filename`` as UTF-8 text, replacing any existing file.

    Args:
        html: Text to write.
        filename: Path of the file to create or overwrite.
    """
    print('正在保存：' + filename)
    # open() reads with mode='r' and writes with mode='w'; the ``with`` block
    # closes the file even when write() raises
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)


if __name__ == '__main__':
    # Download the first page of the forum ...
    page_text = loadPage('https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%8E%E6%AF%85')
    # ... and store it in a local file
    writeFile(page_text, 'D:/course/tmp/test1.html')
    print('程序结束...')

3、设置起始页和终止页

int()：将str转换为int类型

float()：将str转换为float类型

str()：将其他类型的数据转换为str字符串类型

from urllib import request, parse

# 获取每一页的html
def loadPage(url):
    """Download the page at ``url`` and return its content as text.

    The raw bytes and the decoded text are also printed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # 1. Build the request object
    req = request.Request(url)
    # 2. Open the connection; the ``with`` block closes the response even
    #    when reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # 3. Read the whole response body
        html = response.read()
    print(type(html))   # the body is bytes, not str
    print('html:', html)
    # 4. Decode the bytes into text
    content = html.decode('utf-8')
    print('content:', content)
    return content

# 将下载的内容写入到文件中
def writeFile(html, filename):
    """Save ``html`` to ``filename`` as UTF-8 text, replacing any existing file.

    Args:
        html: Text to write.
        filename: Path of the file to create or overwrite.
    """
    print('正在保存：' + filename)
    # open() reads with mode='r' and writes with mode='w'; the ``with`` block
    # closes the file even when write() raises
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)

# 设置起始页和终止页
def tiebaSpider(url, beginPage, endPage):
    """Download forum pages ``beginPage`` to ``endPage`` (inclusive), one file per page.

    Args:
        url: Forum address; ``&pn=<offset>`` is appended to it for each page.
        beginPage: Number of the first page to download.
        endPage: Number of the last page to download.
    """
    for page_no in range(beginPage, endPage + 1):
        # The pn offset grows by 50 per page: page 1 -> 0, page 2 -> 50, ...
        offset = 50 * (page_no - 1)
        page_url = ''.join([url, '&pn=', str(offset)])
        print('new_url %s' % page_url)
        # Name of the local file that receives this page
        target = ''.join(['第', str(page_no), '页.html'])
        # Fetch the page and write it to disk
        writeFile(loadPage(page_url), target)


if __name__ == '__main__':
    # input() always returns a str, so both page numbers are converted to int
    beginPage = int(input('请输入起始页：'))
    endPage = int(input('请输入终止页：'))
    print('beginPage: %d , endPage：%d' %(beginPage, endPage))
    # Download every page in the requested range
    tiebaSpider('https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%8E%E6%AF%85', beginPage, endPage)
    print('程序结束...')

4、爬取贴吧图片

4.1、先要获得下载图片的链接地址

# 获得图片的链接地址
def loadPage2(url):
    """Print the image links found in the post at ``url``.

    The page is downloaded, decoded as UTF-8, parsed with lxml and queried
    for the ``src`` attribute of every ``<img>`` directly under the
    post-content ``<div>`` whose id is hard-coded in the XPath rule below.

    Args:
        url: Address of the post page.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Build the request object
    req = request.Request(url)
    # Open the connection; the ``with`` block closes the response even when
    # reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # Read the page content
        html = response.read()
    # Decode the bytes into text
    html = html.decode('utf-8')
    # Build a DOM tree from the text
    content = etree.HTML(html)
    print(type(content))  # <class 'lxml.etree._Element'>
    # Select the data that matches the XPath rule
    link_list = content.xpath('//div[@id="post_content_102822382749"]/img/@src')
    # link_list = content.xpath('//div[@id="post_content_102822382749"]/a/text()')
    for link in link_list:
        print(link)   # address of one image


if __name__ == '__main__':
    # Script entry point: list the image links of one post.
    loadPage2('https://tieba.baidu.com/p/4946204416')

4.2、将图片下载到本地保存

from urllib import request, parse
from lxml import etree

# 获取每一页的html
def loadPage(url):
    """Download the resource at ``url`` and return its raw bytes.

    The body is returned undecoded, so it can be written to a file opened
    in binary mode (for example an image).

    Args:
        url: Address of the resource to fetch.

    Returns:
        The response body as bytes.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    # 1. Build the request object
    req = request.Request(url)
    # 2. Open the connection; the ``with`` block closes the response even
    #    when reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # 3. Read the whole response body
        html = response.read()
    print(type(html))   # the body is bytes, not str
    print('html:', html)  # for an image these bytes are the image itself, so no decoding is done
    return html

# 获得图片的链接地址
def loadPage2(url):
    """Download every image of the post at ``url`` into ``D:/course/tmp/``.

    The page is downloaded, decoded as UTF-8, parsed with lxml and queried
    for the ``src`` attribute of every ``<img>`` directly under the
    post-content ``<div>`` whose id is hard-coded in the XPath rule below.
    Each image is saved under the last 20 characters of its link.

    Args:
        url: Address of the post page.

    Raises:
        urllib.error.URLError: If a request cannot be completed.
        UnicodeDecodeError: If the page is not valid UTF-8.
        OSError: If an image file cannot be written.
    """
    # Build the request object
    req = request.Request(url)
    # Open the connection; the ``with`` block closes the response even when
    # reading fails, so the connection is not leaked
    with request.urlopen(req) as response:
        # Read the page content
        html = response.read()
    # Decode the bytes into text
    html = html.decode('utf-8')
    # Build a DOM tree from the text
    content = etree.HTML(html)
    print(type(content))  # <class 'lxml.etree._Element'>
    # Select the data that matches the XPath rule
    link_list = content.xpath('//div[@id="post_content_102822382749"]/img/@src')
    # link_list = content.xpath('//div[@id="post_content_102822382749"]/a/text()')
    for link in link_list:
        print(link)   # address of one image
        filename = link[-20:]
        print('正在保存文件：' + filename)
        # Download first, so a failed request does not leave an empty file behind
        html1 = loadPage(link)
        # mode='w' writes str; binary content needs mode='wb' and takes no
        # encoding argument. ``with`` closes the file without a manual f.close()
        with open('D:/course/tmp/'+filename, 'wb') as f:
            f.write(html1)


if __name__ == '__main__':
    # Script entry point: download the images of one post.
    loadPage2('https://tieba.baidu.com/p/4946204416')