python简单爬虫(股票信息)

 1 
 2 import requests
 3 from bs4 import BeautifulSoup
 4 import traceback
 5 import re
 6 
 7 def getHTMLText(url, code="utf-8"):   #获取股票页面信息
 8     try:
 9         r = requests.get(url)
10         r.raise_for_status()
11         r.encoding = code
12         return r.text
13     except:
14         return ""
15 
def getStockList(lst, stockURL):
    """Scrape *stockURL* and append every stock code found to *lst* in place.

    A code is "sh" or "sz" followed by six digits, extracted from the href
    of each anchor tag on the listing page.
    """
    html = getHTMLText(stockURL, "GB2312")   # eastmoney list page is GB2312
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # BUG FIX: the original pattern "[s][hz]d{6}" lost its backslash
            # (HTML-escape damage) and matched a literal 'd', so no code was
            # ever captured. \d{6} matches the six-digit stock number.
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # KeyError: anchor without an href; IndexError: href without a
            # stock code. Both are expected — skip and keep scanning.
            continue
26 
def getStockInfo(lst, stockURL, fpath):
    """Fetch each stock's detail page and append one dict-per-line record to *fpath*.

    Parameters:
        lst:      list of stock codes (e.g. "sh600000") from getStockList.
        stockURL: base URL of the detail pages; the code + ".html" is appended.
        fpath:    output text file, opened in append mode, UTF-8.
    Side effects:
        Writes to *fpath* and prints a carriage-return progress line to stdout.
    """
    count = 0
    total = len(lst)            # hoisted: invariant across the loop
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue        # fetch failed; finally still counts it
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})

            # <dt> tags carry the field names, <dd> tags the values, in order.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key, val in zip(keyList, valueList):
                infoDict[key.text] = val.text

            with open(fpath, 'a', encoding='utf-8') as f:
                # The pasted source had this '\n' literal broken across lines;
                # restored: one dict repr per line.
                f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort per stock: report the parse failure (traceback is
            # imported for exactly this) instead of swallowing it silently.
            traceback.print_exc()
        finally:
            # Count every stock — including empty-html skips, which the
            # original missed, so its progress could never reach 100%.
            count = count + 1
            print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")
57 
def main():
    """Entry point: gather the stock-code list, then scrape and store details."""
    list_page_url = 'http://quote.eastmoney.com/stocklist.html'
    detail_base_url = 'https://gupiao.baidu.com/stock/'
    out_path = 'C:/BaiduStockInfo.txt'
    codes = []
    getStockList(codes, list_page_url)
    getStockInfo(codes, detail_base_url, out_path)
原文地址:https://www.cnblogs.com/ouzai/p/13048595.html