# Obaiwan (噢百万) results scraper

import requests
import re
from lxml import etree

# Obaiwan scraper: endpoint and the row-matching pattern.

# Results page that is queried for each year.
url = 'http://www.obaiwan.com/hk49/results/'

# One entry per line of the <tr> markup the pattern has to match.  Cells
# written with (.+?) are captured; the others are matched but discarded.
# The differing amounts of whitespace inside the <td> tags are significant.
_row_lines = (
    [
        '<tr >',
        '<td  >.+?</td>',        # skipped cell
        '<td  >(.+?)</td>',      # group 1
        '<td >(.+?)</td>',       # group 2
    ]
    + ['<td ><b style=".+?">(.+?)</b></td>'] * 6   # groups 3-8 (bold cells)
    + ['<td  >.+?</td>']                           # skipped cell
    + ['<td >.+?</td>'] * 6                        # six more skipped cells
    + [
        '<td ><b style=".+?">(.+?)</b></td>',      # group 9 (bold cell)
        '</tr>',
    ]
)

# Compiled once at import time; yields 9-tuples from findall().
p = re.compile('\n'.join(_row_lines))

# Fetch the results table for every year, pull the captured cells out of each
# row with the precompiled pattern `p`, and dump them to history.txt as
# comma-separated lines (one line per matched table row).
#
# All rows are gathered before the output file is opened, so a failed request
# no longer leaves a truncated history.txt (or an unclosed handle) behind.
lines = []

for year in range(2003, 2016):
    # NOTE(review): the 'submit' value is already percent-encoded; requests
    # form-encodes dict values itself, so it is presumably encoded a second
    # time on the wire. Left unchanged - confirm whether the server reads it.
    data = {'qinum': year, 'submit': '%CC%E1%BD%BB%B2%E9%D1%AF'}
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.post(url, data=data, timeout=30)
    # Force the decoding used for r.text instead of relying on detection.
    r.encoding = 'gb2312'
    # Each match is a tuple of captured cell texts -> one CSV-style line.
    lines.extend(','.join(row) + '\n' for row in p.findall(r.text))

res = ''.join(lines)

with open('history.txt', 'w') as f:
    f.write(res)
# Original article (原文地址): https://www.cnblogs.com/hhh5460/p/4402470.html