Assignment 2

Task 1

1. Weather experiment code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weather.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wcity varchar(16),wdate varchar(16),wweather varchar(64),wtemp varchar(32),constraint pk_weather primary key(wcity,wdate))")
        except Exception:
            self.cursor.execute("delete from weathers")  # table already exists, so just clear the old rows

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wcity,wdate,wweather,wtemp) values(?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {                                       # pose as a normal browser; the codes of the four cities are recorded in advance
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
        self.citycode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastcity(self, city):
        if city not in self.citycode.keys():                   # check that the city is one we know about, in case the name was mistyped
            print(city + " code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.citycode[city] + ".shtml"
        try:                                                   # open the weather page for the given city and start scraping
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, 'html.parser')
            lis = soup.select("ul[class='t clearfix'] li")     # inspecting the page source shows the forecast lives inside this ul
            for li in lis:
                try:
                    date_ = li.select('h1')[0].text            # use select to drill down to each day's date, weather and temperature
                    weather_ = li.select('p[class="wea"]')[0].text
                    temp_ = li.select('p[class="tem"] span')[0].text + '℃/' + li.select("p[class='tem'] i")[0].text
                    print(city, date_, weather_, temp_)
                    self.db.insert(city, date_, weather_, temp_)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastcity(city)
        self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print('completed')

Result:

2. Reflections

In the weather-forecast code, the part that was hard to understand at first was the definition of the two classes and how their methods work together; after puzzling over it on my own, the role of each parameter became clear.
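
To make the select() chains in forecastcity easier to follow, here is a minimal standalone sketch that runs the same selectors against a hand-written HTML fragment shaped like the weather page (the fragment itself is invented for illustration):

from bs4 import BeautifulSoup

# invented fragment mimicking the structure of www.weather.com.cn forecast pages
html = """
<ul class="t clearfix">
  <li><h1>7日(今天)</h1><p class="wea">多云</p>
      <p class="tem"><span>20</span><i>12℃</i></p></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
for li in soup.select("ul[class='t clearfix'] li"):        # one li per forecast day
    date_ = li.select('h1')[0].text                        # e.g. 7日(今天)
    weather_ = li.select('p[class="wea"]')[0].text         # weather description
    temp_ = li.select('p[class="tem"] span')[0].text + '℃/' + li.select("p[class='tem'] i")[0].text
    print(date_, weather_, temp_)                          # -> 7日(今天) 多云 20℃/12℃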

Task 2

1. Stock scraping

import requests
from bs4 import BeautifulSoup
import re



def getHtmlText(url):
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
            'Cookie': 'qgqp_b_id=54fe349b4e3056799d45a271cb903df3; st_si=24637404931419; st_pvi=32580036674154; st_sp=2019-11-12%2016%3A29%3A38; st_inirUrl=; st_sn=1; st_psi=2019111216485270-113200301321-3411409195; st_asi=delete'
            }
    try:
        r = requests.get(url, timeout=30, headers=head)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception:
        return ""


recordfile = 'Data.txt'
url = 'http://51.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408349318807687469_1574045112932&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1574045112933'
head = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
    }
Codelist = []
DealData = [['股票代码', '今开', '最高', '最低', '昨收', '成交量', '成交额', '总市值', '流通市值', '振幅', '换手率', '市净率', '市盈率']]
r = requests.get(url, timeout=30, headers=head)
r.raise_for_status()
r.encoding = 'utf-8'
html = r.text
# print(html)
soup = str(BeautifulSoup(html, "html.parser"))
regex = re.compile(r'.f12...\d{6}.')                # matches the stock-code field, e.g. "f12":"600000"
listpatterns = regex.findall(soup)
for listpattern in listpatterns:
    numpattern = re.compile(r'\d{6}')               # pull out the six-digit code itself
    Codelist.append(numpattern.findall(listpattern)[0])
# print(Codelist)
total = len(Codelist)
CodeList = Codelist[:50]                            # only scrape the first 50 stocks
finished = 0
for code in CodeList:
    finished = finished + 1
    finishedco = (finished / total) * 100
    print("total : {0}   finished : {1}    completion : {2}%".format(total, finished, finishedco))
    dealDataList = []
    dataUrl = 'http://info.stcn.com/dc/stock/index.jsp?stockcode=' + code
    dataHtml = getHtmlText(dataUrl)
    soup = BeautifulSoup(dataHtml, "html.parser")
    dealDataList.append(code)
    for i in range(1, 4):
        classStr = 'sj_r_' + str(i)
        divdata = soup.find_all('div', {'class': classStr})
        if len(divdata) == 0:
            dealDataList.append('该股票暂时没有交易数据!')
            break
        dealData = str(divdata[0])
        dealPattern = re.compile(r'\d+\.\d+[\u4e00-\u9fa5]|\d+.+.%|\d+\.\d+')   # values like 12.5亿, 3.2%, 10.50
        listdeal = dealPattern.findall(dealData)
        for j in range(0, 4):
            dealDataList.append(listdeal[j])
    DealData.append(dealDataList)
file = open(recordfile, 'a+')
for i in range(len(DealData)):
    if i == 0:
        s = str(DealData[i]).replace('[', '').replace(']', '')
        s = s.replace("'", '').replace(',', ' \t') + '\n'
    else:
        s = str(DealData[i]).replace('[', '').replace(']', '')
        s = s.replace("'", '').replace(',', '\t') + '\n'
    file.write(s)
file.close()
print(len(DealData))

Result:

2. Reflections

Because this was the first time I had to scrape data that is fetched dynamically, it took a great deal of time, more than any other task. Much of the code was adapted from related examples on CSDN, and only after my own modifications did it produce the final result.
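
Since the list API used above returns JSONP (JSON wrapped in a jQuery callback), an alternative to the regex approach is to strip the wrapper and parse the body with the json module. A minimal sketch, reusing the url and head defined in the script above, and assuming the usual shape of this response, where the rows sit under data.diff with f12 as the code and f14 as the name:

import json
import requests

resp = requests.get(url, timeout=30, headers=head).text
# the body looks like jQuery112408...({...}); keep only what is between the outermost parentheses
payload = resp[resp.index('(') + 1:resp.rindex(')')]
data = json.loads(payload)
diff = data['data']['diff']
rows = diff.values() if isinstance(diff, dict) else diff   # diff may be a list or an index-keyed dict
for stock in rows:
    print(stock['f12'], stock['f14'])                      # six-digit code and stock name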

Task 3

1. Scraping a stock of my own choice

import requests
url = 'http://46.push2his.eastmoney.com/api/qt/stock/kline/get?cb=jQuery112406437068490950477_1602146854442&secid=1.600115&ut=fa5fd1943c7b386f172d6893dbfba10b&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5%2Cf6&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58%2Cf59%2Cf60%2Cf61&klt=101&fqt=0&end=20500101&lmt=120&_=1602146854482'
head = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
r = requests.get(url, timeout=30, headers=head)
r.raise_for_status()
r.encoding = 'utf-8'
html = r.text
msg = html[html.rindex('"', 0, -10):]   # jump back to the last quoted k-line record at the end of the response
result = msg.split(",")                 # split that record into its comma-separated fields
print("股票代码号  股票名称  今日开  今日最高  今日最低")
print("600115    " + "东方航空  " + result[2] + "   " + result[3] + "    " + result[4])

Result:

2. Reflections

I only got this one working after asking Lin for advice. Using rindex to pull the relevant data straight from the tail of the response, then splitting it into a list, lets the fields be printed directly.
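
To show the rindex trick in isolation, here is a tiny sketch on an invented stand-in for the tail of the k-line response (the real payload ends the same way: the newest record is the last quoted string before the closing brackets):

# invented stand-in for the tail of the response
html = '...,"2020-10-07,4.40,4.47,4.52,4.38,123456"]}});'
# search html[:-10] so the closing quote and trailing brackets are skipped,
# then keep everything from the opening quote of the last record onward
msg = html[html.rindex('"', 0, -10):]
result = msg.split(",")
print(result)   # ['"2020-10-07', '4.40', '4.47', '4.52', '4.38', '123456"]}});']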
