第一个爬虫和测试

1.测试球赛.

from random import random
def printIntro():
    """Print a short introduction describing what the program simulates."""
    print("这个程序模拟两个选手A和B的某种竞技比赛")
    print("程序运行需要A和B的能力值（以0到1之间的小数表示）")
def getInputs():
    """Prompt for the simulation parameters.

    Returns:
        A tuple ``(a, b, n)``: the ability values of players A and B as
        floats, and the number of matches to simulate as an int.

    Raises:
        ValueError: if an entry cannot be parsed as a number.
    """
    # float()/int() replace the original eval(): eval executes whatever the
    # user types, and could hand a non-integer match count to range().
    a = float(input("请输入选手A的能力值(0-1): "))
    b = float(input("请输入选手B的能力值(0-1): "))
    n = int(input("模拟比赛的场次: "))
    return a, b, n
def simNGames(n, probA, probB, bestOf=7):
    """Simulate n matches and count how many each player wins.

    A match is a best-of-``bestOf`` series (best of seven by default): games
    are played with simOneGame until one player has won more than half of
    them, and only the match winner is credited. The original loop credited
    every single game, so the totals added up to 7 * n instead of n.

    Args:
        n: number of matches to simulate.
        probA: ability value of player A, passed on to simOneGame.
        probB: ability value of player B, passed on to simOneGame.
        bestOf: maximum number of games per match.

    Returns:
        A tuple ``(winsA, winsB)`` of match wins; the two values sum to n.
    """
    gamesToWin = bestOf // 2 + 1
    winsA, winsB = 0, 0
    for _ in range(n):
        gamesA, gamesB = 0, 0
        # Stop as soon as the match is decided instead of always playing
        # all the games.
        while gamesA < gamesToWin and gamesB < gamesToWin:
            scoreA, scoreB = simOneGame(probA, probB)
            if scoreA > scoreB:
                gamesA += 1
            else:
                gamesB += 1
        if gamesA > gamesB:
            winsA += 1
        else:
            winsB += 1
    return winsA, winsB
# Exercise the error path of simNGames: it is called with a single argument
# although n, probA and probB are all required, so the call raises and the
# message below is printed.
try:
    simNGames(0.55)
except Exception:  # not a bare except, so Ctrl-C and SystemExit still propagate
    print("simNGames Error")

def gameOver(a, b):
    """Return True when a regular game has ended.

    A regular game is over as soon as either score equals 11.
    """
    return a == 11 or b == 11
def gameOver2(a, b):
    """Return True when a deuce game has ended.

    The deuce game is over once the scores differ by two or more points.
    The result is a plain bool (the original returned the score tuple or
    None and relied on their truthiness).
    """
    return abs(a - b) >= 2
def simOneGame(probA, probB):
    """Simulate one game and return the final score ``(scoreA, scoreB)``.

    Player A serves first. Only the serving player can score: when the random
    draw is below the server's probability the server gets a point, otherwise
    the serve passes to the opponent. If the score reaches 10:10 the rest of
    the game is delegated to simtwoGame2 and its result is returned.

    Args:
        probA: probability that A scores while serving.
        probB: probability that B scores while serving.
    """
    scoreA, scoreB = 0, 0
    serving = "A"
    while not gameOver(scoreA, scoreB):
        # At 10:10 the game is decided by the win-by-two rule instead.
        if scoreA == 10 and scoreB == 10:
            return simtwoGame2(probA, probB)
        if serving == "A":
            if random() < probA:
                scoreA += 1
            else:
                serving = "B"
        else:
            if random() < probB:
                scoreB += 1
            else:
                serving = "A"
    return scoreA, scoreB

# Exercise the error path of simOneGame: it is called with one argument
# although probA and probB are both required, so the call raises and the
# message below is printed.
try:
    simOneGame(0.54)
except Exception:  # not a bare except, so Ctrl-C and SystemExit still propagate
    # The message now names the function that is actually being checked.
    print("simOneGame Error")

def simtwoGame2(probA, probB):
    """Simulate the deuce phase of a game and return ``(scoreA, scoreB)``.

    Play starts from 10:10 with A serving and uses the same serve rule as a
    regular game (only the server can score; a lost rally passes the serve).
    It continues until gameOver2 reports that one side leads by two points.

    Args:
        probA: probability that A scores while serving.
        probB: probability that B scores while serving.
    """
    scoreA, scoreB = 10, 10
    serving = "A"
    while not gameOver2(scoreA, scoreB):
        if serving == "A":
            if random() < probA:
                scoreA += 1
            else:
                serving = "B"
        else:
            if random() < probB:
                scoreB += 1
            else:
                serving = "A"
    return scoreA, scoreB

# Smoke test: run one deuce game and report if it raises.
try:
    simtwoGame2(0.44, 0.66)
except Exception:  # not a bare except, so Ctrl-C and SystemExit still propagate
    # The message now names the function that is actually being checked.
    print("simtwoGame2 Error")

def printSummary(winsA, winsB):
    """Print the number of simulated matches and each player's share of wins.

    Args:
        winsA: matches won by player A.
        winsB: matches won by player B.
    """
    n = winsA + winsB
    # With zero simulated matches the original division raised
    # ZeroDivisionError; report a 0.0% share for both players instead.
    shareA = winsA / n if n else 0.0
    shareB = winsB / n if n else 0.0
    print("竞技分析开始，共模拟{}场比赛".format(n))
    print("选手A获胜{}场比赛，占比{:0.1%}".format(winsA, shareA))
    print("选手B获胜{}场比赛，占比{:0.1%}".format(winsB, shareB))
def main():
    """Run the simulation: print the intro, read the inputs, simulate, report."""
    printIntro()
    probA, probB, n = getInputs()
    winsA, winsB = simNGames(n, probA, probB)
    printSummary(winsA, winsB)


# Only start the interactive simulation when executed as a script.
if __name__ == "__main__":
    main()

--------------------------------------------------------------------------

这个程序模拟两个选手A和B的某种竞技比赛
程序运行需要A和B的能力值（以0到1之间的小数表示）

请输入选手A的能力值(0-1):

----------------------------------------------------------------------------------------------------------------------------------------------------------

2.用requests库的get()函数访问百度20次，打印返回状态和text属性的内容，并计算text属性和content属性所返回网页内容的长度。

import requests
# Fetch the Baidu home page 20 times and report on every response.
# range(20) replaces the original `in (0, 20)`, which iterated over a
# two-element tuple and therefore sent only two requests.
for i in range(20):
    # A timeout keeps the loop from hanging forever on a stalled connection.
    r = requests.get("https://www.baidu.com/", timeout=30)
    print(r.status_code)
    print(r.text)
    print(type(r.text))
    print(type(r.content))
    # Length of the decoded text and of the raw bytes.
    print(len(r.text))
    print(len(r.content))

-------------------------------------------------------------------------------------

200
<class 'str'>

<class 'bytes'>
2443

------------------------------------------------------------------------------------------------------------------------------

3.html库简单计算

----------------------------------------------原代码------------------------

<!DOCTYPE html>
<!-- Sample page used as input for the BeautifulSoup exercise.
     NOTE(review): the <table> is placed after </body>, so it is not part of
     the body element printed in the results. The title says "runboo.com",
     possibly a typo for "runoob.com"; confirm before changing it. -->
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runboo.com) 26 </title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p >
</body>
<table border="1">
<tr>
<td>row 1,cell 1</td>
<td>row 1,cell 2</td>
</tr>
<tr>
<td>row 2,cell 1</td>
<td>row 2,cell 2</td>
</tr>
</table>
</html>

-------------------------------------------------------------------------------------------------------------------

(首先把上面的原代码保存到一个html文件中；这里的'GB2312'是百度的编码，如果打开文件时报错，就换成'utf-8')

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
path = 'C:/Users/Administrator/AppData/Roaming/Heinote/text.html'
# Read the saved page; the with-block closes the file even if decoding fails.
with open(path, 'r', encoding='GB2312') as htmlfile:
    htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, "html.parser")
print(soup.head, "41")
print(soup.body)
print(soup.find_all(id="first"))
r = soup.text
# Collect every run of characters in the range U+4E00..U+9FA5 (Chinese
# ideographs). The backslashes of the \u escapes were missing, which turned
# the character class into a set of ASCII letters and digits.
pattern = re.findall(r'[\u4e00-\u9fa5]+', r)
print(pattern)

------------------------------------------------结果-------------------------------------------------------------

<head>

<meta charset="utf-8"/>

<title>菜鸟教程(runboo.com) 26 </title>
</head> 41
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
</body>
[<p id="first">我的第一个段落。</p>]
['菜鸟教程', '我的第一个标题', '我的第一个段落']

----------------------------------------------------结果--------------------------------------------------------------------------------

4.爬取2016年的大学排名

网站：'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'

import requests
from bs4 import BeautifulSoup

# Rows collected by fillUnivList: one list of cell values per table row.
allUniv = []
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are turned into ""; a bare except would
        # also have hidden programming errors and KeyboardInterrupt.
        return ""
def fillUnivList(soup):
    """Append one entry per data row of *soup* to the global allUniv list.

    Every <tr> that contains at least one <td> contributes a list holding
    the ``.string`` of each of its cells; rows without <td> cells are skipped.
    """
    for tr in soup.find_all('tr'):
        ltd = tr.find_all('td')
        if not ltd:
            continue
        allUniv.append([td.string for td in ltd])
def printUnivList(num):
    """Print a header line followed by the first *num* rows of allUniv.

    Columns 0, 1, 2, 3 and 11 of each row are shown; column 3 is parsed as a
    number and printed with one decimal place. If fewer than *num* rows were
    collected, only the available rows are printed.
    """
    # chr(12288) is U+3000, the full-width space, used as the fill character
    # so that columns containing Chinese text line up.
    print("{1:^2}{2:{0}^10}{3:{0}^6}{4:{0}^4}{5:{0}^10}".format(chr(12288),"排名","学校名称","省市","总分","年费"))
    # Slicing (instead of indexing range(num)) avoids an IndexError when the
    # page could not be fetched or has fewer rows than requested.
    for u in allUniv[:max(num, 0)]:
        # float() instead of eval(): the cell text comes from a web page and
        # must not be executed as code.
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8.1f}{5:{0}^11}".format(chr(12288),u[0],u[1],u[2],float(u[3]),u[11]))
def main():
    """Download the 2016 ranking page, collect its rows and print the first ten."""
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(10)


# Only start scraping when executed as a script.
if __name__ == "__main__":
    main()

----------------------------------------------------------------

排名　　　学校名称　　　　　省市　　　总分　　　　　年费　　　　
1 　　　清华大学　　　　北京市　　　95.9　　　　　1187　　　　
2 　　　北京大学　　　　北京市　　　82.6　　　　　　799　　　　
3 　　　浙江大学　　　　浙江省　　　80.0　　　　　　833　　　　
4 　　上海交通大学　　　上海市　　　78.7　　　　　　909　　　　
5 　　　复旦大学　　　　上海市　　　70.9　　　　　　534　　　　
6 　　　南京大学　　　　江苏省　　　66.1　　　　　　239　　　　
7 　中国科学技术大学　　安徽省　　　65.5　　　　　　228　　　　
8 　哈尔滨工业大学　　黑龙江省　　　63.5　　　　　　379　　　　
9 　　华中科技大学　　　湖北省　　　62.9　　　　　　482　　　　
10 　　　中山大学　　　　广东省　　　62.1　　　　　　207