python简单爬数据(这两个成功了)

这两个做的比较早,也比较幸运,成功做出来了,把代码扔在这里

老师叫我从两个网站上弄点数据,数据不多,但是要分月份,小时,经度,纬度,虽然不用排列组合还是挺麻烦的

人生苦短,我用Python

大半年前只看过一遍语法,一直没有实践过;这次要爬的网页比较简单,正好拿来练练手


代码里面已经包含了目标网址,就不具体介绍了,保存下来的是网页,还需要一个脚本来处理一下,这个比较简单也就不放了。

1

#!/usr/bin/env python3

import requests
import time
import sys


import os  # needed only to build the output path below

# -------- Configuration: which single parameter to sweep.
# One of: 'hour', 'month', 'latitude', 'longitude'
sel = 'longitude'
# --------

# web_url is the form page (kept for reference); request_url is the CGI
# endpoint that receives the POST.
web_url = r'https://omniweb.gsfc.nasa.gov/vitmo/iri2012_vitmo.html'  # IRI2012
request_url = r'https://omniweb.gsfc.nasa.gov/cgi/vitmo/vitmo_model.cgi'

# The raw responses are written into the directory given by sys.path[0]
# (the script's own directory when run as `python script.py`).
# os.path.join replaces the original "'\data_...'" concatenation: '\d' is an
# invalid escape sequence, and a backslash is a path separator only on
# Windows, so elsewhere it ended up as part of the file name.
# A timestamped name can be had with time.strftime("%Y%m%d%H%M%S") instead.
filepath = os.path.join(sys.path[0], 'data_iri2012_raw_' + sel + '.txt')
print(filepath)
# Deliberately left open: the download loop below writes to fid and closes it.
fid = open(filepath, 'w', encoding='utf-8')

# Request headers copied from a browser session that submitted the form
# (POST /cgi/vitmo/vitmo_model.cgi HTTP/1.1), minus the ones that must not
# be hard-coded:
#   * 'Content-Length' and 'Host' are derived by the HTTP library from the
#     actual body and URL; the captured '452' only matched one specific
#     browser request, not the bodies this script sends.
#   * 'br' is not advertised in 'Accept-Encoding', because a brotli-encoded
#     reply can only be decoded when an optional brotli package is installed;
#     gzip/deflate are always handled.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://omniweb.gsfc.nasa.gov/vitmo/iri2012_vitmo.html',
    # NOTE(review): analytics cookies captured from the original browser
    # session; presumably not required by the server - confirm before removing.
    'Cookie': '_ga=GA1.4.167527256.1494290145; _gid=GA1.4.2137494148.1494290145; _gat_GSA_ENOR0=1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Form fields posted to the model CGI. Every value is sent as a string;
# 'vars' is a list, so it is encoded as a repeated form field.
payload = {
    'model': 'iri_2012',
    'year': '2016',
    'month': '12',
    'day': '01',
    'time_flag': '1',
    'hour': '8',
    'geo_flag': '0.',
    'latitude': '50.',
    'longitude': '40.',
    'height': '100.',
    'profile': '1',
    'start': '100.',   # profile start
    'stop': '1000.',   # profile stop
    'step': '50.',     # profile step size
    'sun_n': '',
    'ion_n': '',
    'radio_f': '',
    # Fixed: the key used to be 'radio_f81 ' (trailing space), which posts
    # the field under a different name than intended.
    # NOTE(review): confirm the exact field name against the form's HTML.
    'radio_f81': '',
    'htec_max': '',
    'ne_top': '0.',
    'imap': '0.',
    'ffof2': '0.',
    'ib0': '2.',
    'probab': '0.',
    'fauroralb': '1.',
    'ffoE': '1.',
    'dreg': '0.',
    'tset': '0.',
    'icomp': '0.',
    'nmf2': '0.',
    'hmf2': '0.',
    'user_nme': '0.',
    'user_hme': '0.',
    'format': '0',
    # Author's note: Ne, Tn, Ti, Te - electron density and neutral, ion and
    # electron temperature.
    'vars': ['17', '19', '20', '21'],
    'linestyle': 'solid',
    'charsize': '',
    'symbol': '2',
    'symsize': '',
    'yscale': 'Linear',
    'xscale': 'Linear',
    'imagex': '640',
    'imagey': '480',
}

# Baseline for the sweep: these are the values actually used for every field
# that is not being swept. Edit here rather than in the literal above.
payload.update({
    'year': '2016',
    'month': '12',
    'day': '01',
    'hour': '8',
    'longitude': '120',
    'latitude': '60',
    'start': '60',
    'stop': '1000',
    'step': '1',
})

# Candidate values for each sweepable form field, keyed by field name:
#   hour      1..24
#   month     1..12
#   latitude  -90..90 in steps of 10
#   longitude 0..350 in steps of 10
dic = {
    'hour': range(1, 25),
    'month': range(1, 13),
    'latitude': range(-90, 100, 10),
    'longitude': range(0, 360, 10),
}
hours = dic['hour']
months = dic['month']
latitudes = dic['latitude']
longitudes = dic['longitude']

# The field chosen by `sel` is the one overwritten on each iteration of the
# download loop; `items` are the values it takes.
itemname = sel
items = dic[itemname]

# Running number of requests attempted so far.
count = 0

# Download one response per swept value and append it to the output file.
# Each response is framed by a '#=====' header line and a '-----' footer line
# that carry the swept value, so a post-processing script can split the file.
#
# Changes from the original loop:
#   * the marker strings are single-line literals with explicit '\n'
#     (they were split across lines, which is a syntax error);
#   * every request has a timeout, so a stalled connection cannot hang forever;
#   * HTTP error statuses raise, so an error page is not saved as data;
#   * failures are retried a bounded number of times with a pause, instead of
#     hammering the server in a tight infinite loop;
#   * only request errors are retried (a failing fid.write is not);
#   * the output file is closed even if the loop is interrupted.
REQUEST_TIMEOUT = 60  # seconds allowed per request
MAX_ATTEMPTS = 5      # tries per swept value before giving up on it
RETRY_PAUSE = 5       # seconds to wait between tries

failed = []  # swept values for which no response could be saved

try:
    for item in items:
        payload[itemname] = str(item)
        fid.write('\n#===================== ' + str(item) + ' =====================\n')
        for attempt in range(1, MAX_ATTEMPTS + 1):
            print('\n===================== ' + str(item) + ' =====================\n')
            count += 1  # counts attempts, not successful downloads
            print('count : ' + str(count))
            try:
                r = requests.post(request_url, data=payload, headers=headers,
                                  timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
            except requests.RequestException as e:
                print(e)
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_PAUSE)
            else:
                fid.write(r.text)
                break
        else:
            # All attempts failed: keep the (empty) framed section and move on.
            failed.append(item)
            print('giving up on ' + itemname + ' = ' + str(item))
        fid.write('\n--------------------- ' + str(item) + ' ---------------------\n')
finally:
    fid.close()

if failed:
    print('no data saved for ' + itemname + ' values: ' + ', '.join(str(v) for v in failed))

2

#!/usr/bin/env python3

import requests
import time
import sys


import os  # needed only to build the output path below

# -------- Configuration: which single parameter to sweep.
# One of: 'hour', 'month', 'latitude', 'longitude'
sel = 'longitude'
# --------

# web_url is the form page (kept for reference); request_url is the CGI
# endpoint that receives the POST.
web_url = r'https://ccmc.gsfc.nasa.gov/modelweb/models/nrlmsise00.php'
request_url = r'https://ccmc.gsfc.nasa.gov/cgi-bin/modelweb/models/vitmo_model.cgi'

# The raw responses are written into the directory given by sys.path[0]
# (the script's own directory when run as `python script.py`).
# os.path.join replaces the original "'\data_...'" concatenation: '\d' is an
# invalid escape sequence, and a backslash is a path separator only on
# Windows, so elsewhere it ended up as part of the file name.
# A timestamped name can be had with time.strftime("%Y%m%d%H%M%S") instead.
filepath = os.path.join(sys.path[0], 'data_nrmlsise_raw_' + sel + '.txt')
print(filepath)
# Deliberately left open: the download loop below writes to fid and closes it.
fid = open(filepath, 'w', encoding='utf-8')


# Request headers copied from a browser session that submitted the form
# (POST /cgi-bin/modelweb/models/vitmo_model.cgi HTTP/1.1), minus the ones
# that must not be hard-coded:
#   * 'Content-Length' and 'Host' are derived by the HTTP library from the
#     actual body and URL; the captured '296' only matched one specific
#     browser request, not the bodies this script sends.
#   * 'br' is not advertised in 'Accept-Encoding', because a brotli-encoded
#     reply can only be decoded when an optional brotli package is installed;
#     gzip/deflate are always handled.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://ccmc.gsfc.nasa.gov/modelweb/models/nrlmsise00.php',
    # NOTE(review): analytics cookies captured from the original browser
    # session; presumably not required by the server - confirm before removing.
    'Cookie': '__utma=35212851.490003371.1494462808.1494462808.1494462808.1; __utmb=35212851.12.10.1494462808; __utmc=35212851; __utmz=35212851.1494462808.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

# Form fields posted to the model CGI. Every value is sent as a string;
# 'vars' is a list, so it is encoded as a repeated form field.
payload = dict(
    model='nrlmsise',
    year='2016',
    month='12',
    day='01',
    time_flag='1',
    hour='8',
    geo_flag='0.',
    latitude='60',
    longitude='120',
    height='100.',
    profile='1',
    start='60.',
    stop='1000.',
    step='10.',
    f10_7='',
    f10_7_3='',
    ap='',
    format='0',
    # Author's note: O, N2, O2 - atomic oxygen, molecular nitrogen,
    # molecular oxygen.
    vars=['08', '09', '10'],
    linestyle='solid',
    charsize='1.0',
    symbol='2',
    symsize='1.0',
    yscale='Lin',
    xscale='Lin',
    imagex='640',
    imagey='480',
)

# Baseline for the sweep: these overrides are the values actually used for
# every field that is not being swept.
payload.update(
    year='2016',
    month='12',
    day='01',
    hour='8',
    longitude='120',
    latitude='60',
    start='60',
    stop='1000',
    step='1',
)

# Candidate values for each sweepable form field, keyed by field name:
#   hour      1..24
#   month     1..12
#   latitude  -90..90 in steps of 10
#   longitude 0..350 in steps of 10
dic = {
    'hour': range(1, 25),
    'month': range(1, 13),
    'latitude': range(-90, 100, 10),
    'longitude': range(0, 360, 10),
}
hours = dic['hour']
months = dic['month']
latitudes = dic['latitude']
longitudes = dic['longitude']

# The field chosen by `sel` is the one overwritten on each iteration of the
# download loop; `items` are the values it takes.
itemname = sel
items = dic[itemname]

# Running number of requests attempted so far.
count = 0

# Download one response per swept value and append it to the output file.
# Each response is framed by a '#=====' header line and a '-----' footer line
# that carry the swept value, so a post-processing script can split the file.
#
# Changes from the original loop:
#   * the marker strings are single-line literals with explicit '\n'
#     (they were split across lines, which is a syntax error);
#   * every request has a timeout, so a stalled connection cannot hang forever;
#   * HTTP error statuses raise, so an error page is not saved as data;
#   * failures are retried a bounded number of times with a pause, instead of
#     hammering the server in a tight infinite loop;
#   * only request errors are retried (a failing fid.write is not);
#   * the output file is closed even if the loop is interrupted.
REQUEST_TIMEOUT = 60  # seconds allowed per request
MAX_ATTEMPTS = 5      # tries per swept value before giving up on it
RETRY_PAUSE = 5       # seconds to wait between tries

failed = []  # swept values for which no response could be saved

try:
    for item in items:
        payload[itemname] = str(item)
        fid.write('\n#===================== ' + str(item) + ' =====================\n')
        for attempt in range(1, MAX_ATTEMPTS + 1):
            print('\n===================== ' + str(item) + ' =====================\n')
            count += 1  # counts attempts, not successful downloads
            print('count : ' + str(count))
            try:
                r = requests.post(request_url, data=payload, headers=headers,
                                  timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
            except requests.RequestException as e:
                print(e)
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_PAUSE)
            else:
                fid.write(r.text)
                break
        else:
            # All attempts failed: keep the (empty) framed section and move on.
            failed.append(item)
            print('giving up on ' + itemname + ' = ' + str(item))
        fid.write('\n--------------------- ' + str(item) + ' ---------------------\n')
finally:
    fid.close()

if failed:
    print('no data saved for ' + itemname + ' values: ' + ', '.join(str(v) for v in failed))
原文地址:https://www.cnblogs.com/ippfcox/p/6947165.html