spider autohome (1)

Code:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import urllib
import time
def getHtml(url):
    """Download *url* and return its body re-encoded as UTF-8.

    The raw response bytes are decoded as GB2312 and then encoded to
    UTF-8, so the caller always receives UTF-8 text regardless of how
    the page was served.

    url -- address of the page to fetch.

    Returns the page content as a UTF-8 encoded string.

    Raises IOError if the page cannot be fetched, and UnicodeDecodeError
    if the body contains bytes that are not valid GB2312 (the decode is
    strict).
    """
    page = urllib.urlopen(url)
    try:
        raw = page.read()
    finally:
        # Always release the connection, even when read() fails; the
        # response object is not otherwise closed until garbage collection.
        page.close()
    uni_str = raw.decode('gb2312')
    return uni_str.encode('utf-8')

def getInfo(html):
    """Pull the payload of every ``config = {...};`` assignment out of *html*.

    html -- page source to search.

    Returns a list with one string per occurrence, each holding the text
    found between the braces (the braces themselves are not included).
    The match is non-greedy, so it stops at the first ``};`` after the
    opening brace, and it never spans a line break. An empty list is
    returned when the page contains no such assignment.
    """
    pattern = re.compile(r'config = {(.+?)};')
    return pattern.findall(html)

def getEachCar(config_lists):
    """Group the configuration values found in *config_lists* by car.

    Every ``{"specid":<digits>,"value":<literal>}`` item found in the
    config strings is collected, and the values belonging to the same
    specid are concatenated in the order they appear.

    config_lists -- iterable of config payload strings, as returned by
                    getInfo() (the text between the outer braces).

    Returns a list with one string per specid, ordered by the first
    appearance of that specid.  Each string has the form
    ``|v1|v2|...|vn|``: every value is delimited by '|' on both sides.
    A ``null`` value contributes an empty field so that the remaining
    fields keep their position.

    Raises ValueError or SyntaxError when a value is not a valid literal,
    and TypeError when a value is not a string.
    """
    # Local import: only this function needs ast.
    import ast

    # \d+ accepts a specid of any length.
    item_re = re.compile(r'\{"specid":(\d+),"value":(.+?)\}')

    values_by_spec = {}
    spec_order = []  # specids in order of first appearance
    for sp in config_lists:
        config_str = '{' + sp + '}'
        for specid, raw_value in item_re.findall(config_str):
            raw_value = raw_value.strip()
            if raw_value == "null":
                # Only a bare null is treated specially, so the text
                # "null" inside a string value is left untouched.
                value = ""
            else:
                # SECURITY: this text comes from a scraped page, so it must
                # never be passed to eval().  literal_eval only accepts
                # plain literals (strings, numbers, ...).
                value = ast.literal_eval(raw_value)
            if specid not in values_by_spec:
                values_by_spec[specid] = []
                spec_order.append(specid)
            values_by_spec[specid].append(value)

    return ["|" + "|".join(values_by_spec[specid]) + "|"
            for specid in spec_order]
if __name__ == '__main__':
    # Script entry point: fetch one spec page, extract its config payloads
    # and print one '|'-separated line per car.
    # Another spec page that can be used for a manual run:
    #   http://car.autohome.com.cn/config/spec/21308.html#pvareaid=100679
    spec_url = "http://car.autohome.com.cn/config/spec/18239.html"
    for car_line in getEachCar(getInfo(getHtml(spec_url))):
        print(car_line)

Result:

Can we drop this masquerade
原文地址:https://www.cnblogs.com/landpack/p/4555554.html