爬取美团网数据

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests

from lib.re_util import ReUtil

# Listing page that this script downloads.
base_url = 'http://ns.meituan.com/meishi/b25710/'

# Raw "Cookie" header value in "k1=v1; k2=v2" form; empty means no cookies.
cookies_str = ''

# Parse the header string into the mapping passed to requests via `cookies=`.
cookies_dict = {}
for cookie in cookies_str.split(";"):
    # An empty cookies_str splits into [''] and a trailing ';' leaves a blank
    # piece; neither contains '=', so skip them instead of failing to unpack.
    if "=" not in cookie:
        continue
    k, v = cookie.split("=", 1)
    cookies_dict[k.strip()] = v.strip()

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
}

# Download the listing page. requests applies no timeout by default, so an
# explicit one keeps the script from hanging forever on a stalled connection.
page = requests.get(
    url=base_url,
    cookies=cookies_dict,
    headers=headers,
    timeout=10,
)

def get_element_from_html(raw_html):
    """Find the text between ``"poiInfos":`` and ``},"comHeader"`` in a page.

    Args:
        raw_html: Page source to search.

    Returns:
        The list returned by ``findall`` on the regex built by
        ``ReUtil.get_regex``; it is empty when the markers are not present.
    """
    regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
    result = regex.findall(raw_html)
    # Only print when something matched: indexing result[0] on a page without
    # the markers would raise IndexError.
    if result:
        # Index 1 is the group captured right after the begin marker.
        print(result[0][1])
    return result

# Run the extraction on the downloaded page text; the function prints what it captured.
get_element_from_html(page.text)

ReUtil：这个工具其实已经够用了，但还是建议使用 XPath 这类正规的方法来解析 HTML。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re


class ReUtil:
    """Helpers for building simple "begin ... contain ... end" regular expressions."""

    # Regex metacharacters that get a backslash prefix so they match literally.
    need_escape = {
        '\\': True,
        '^': True,
        '$': True,
        '.': True,
        '*': True,
        '+': True,
        '?': True,
        '{': True,
        '}': True,
        '(': True,
        ')': True,
        '[': True,
        ']': True,
        '|': True,
    }
    # Cache of compiled patterns, keyed by the generated pattern string.
    exits = {}

    @classmethod
    def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
        """Return a compiled, cached regex for text framed by literal markers.

        Each argument may be a single value, a list of alternatives, or None.
        The pattern has five capture groups, in order: the begin marker, the
        text after it, the ``must_contain`` marker, the text after that, and
        the end marker. It is compiled with ``re.DOTALL`` so the text groups
        may span lines.

        Args:
            begin_with: Literal string(s) the match starts with.
            must_contain: Literal string(s) that appear in the middle.
            end_with: Literal string(s) the match ends with.

        Returns:
            The compiled pattern; the same object is returned for repeated
            calls that generate the same pattern string.
        """
        begin_with = cls.conver_to_list(begin_with)
        must_contain = cls.conver_to_list(must_contain)
        end_with = cls.conver_to_list(end_with)

        # NOTE: '(.*)?' is a greedy '.*' inside an optional group, so each text
        # group extends as far as the following markers allow.
        pattern = ''.join([
            cls.list_to_restring(begin_with),
            '(.*)?',
            cls.list_to_restring(must_contain),
            '(.*)?',
            cls.list_to_restring(end_with),
        ])

        cached = cls.exits.get(pattern)
        if cached is not None:
            return cached
        regex_obj = re.compile(pattern, re.DOTALL)
        cls.exits[pattern] = regex_obj
        return regex_obj

    @classmethod
    def list_to_restring(cls, args: list) -> 'str':
        """Build one capture group matching any of ``args``, ignoring case.

        Characters listed in ``need_escape`` are backslash-escaped so every
        entry is matched literally. An empty list yields a group that matches
        the empty string.

        Args:
            args: Literal strings to use as alternatives.

        Returns:
            A pattern fragment of the form ``((?i:alt1|alt2|...))``.
        """
        alternatives = []
        for arg in args:
            alternatives.append(''.join(
                '\\' + ch if ch in cls.need_escape else ch for ch in arg
            ))
        # A scoped (?i:...) group is used because a bare (?i) is only accepted
        # at the very start of a pattern on current Python versions; the scoped
        # form is non-capturing, so the outer parentheses stay the only group.
        return '((?i:' + '|'.join(alternatives) + '))'

    @classmethod
    def conver_to_list(cls, value) -> 'list':
        """Normalize ``value`` to a list.

        Falsy values become ``[]``, a list is returned unchanged, and anything
        else is wrapped in a one-element list.
        """
        if not value:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @classmethod
    def get_all_number_to_list(cls, string):
        """Return every number in ``string`` as text.

        A number is a run of digits, optionally followed by a decimal point
        and more digits.
        """
        return re.findall(r'\d+\.?\d*', string)