大众点评字体加密2

import re
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup

# Request headers shared by the download helpers below.
# NOTE(review): the Cookie value is a hard-coded browser session copied from
# the author's machine; replace it with your own (ideally loaded from an
# environment variable or config file) rather than committing live credentials.
headers = {
    'Cookie': 'did_close_tag=; __mta=53728023.1611284104076.1611284104076.1611284104076.1; _lxsdk_cuid=17692643552c8-0f6aedeca7c4d3-3e604809-1fa400-17692643552c8; _hc.v=b32e7d08-accb-d713-b51b-e9f3f91b5edf.1609120457; s_ViewType=10; ua=dpuser_9207645725; fspop=test; cy=160; cye=zhengzhou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; aburl=1; cityInfo=%7B%22cityId%22%3A160%2C%22cityName%22%3A%22%E9%83%91%E5%B7%9E%22%2C%22provinceId%22%3A0%2C%22parentCityId%22%3A0%2C%22cityOrderId%22%3A0%2C%22isActiveCity%22%3Afalse%2C%22cityEnName%22%3A%22zhengzhou%22%2C%22cityPyName%22%3Anull%2C%22cityAreaCode%22%3Anull%2C%22cityAbbrCode%22%3Anull%2C%22isOverseasCity%22%3Afalse%2C%22isScenery%22%3Afalse%2C%22TuanGouFlag%22%3A0%2C%22cityLevel%22%3A0%2C%22appHotLevel%22%3A0%2C%22gLat%22%3A0%2C%22gLng%22%3A0%2C%22directURL%22%3Anull%2C%22standardEnName%22%3Anull%7D; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1611198575; ll=7fd06e815b796be3df069dec7836c3df; ctu=86aab2ab6c4756757274f60b38f21950d48955d7ee9953f6a5bf87aae9d763a6; uamo=15824770183; uuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; iuuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _lxsdk=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _ga=GA1.2.1895823352.1611212247; _gid=GA1.2.769218074.1611212247; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1609120457,1610938415,1611198481,1611284068; Hm_lpvt_dbeeb675516927da776beeb1d9802bd4=1611284104; Hm_lvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; Hm_lpvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; lgtoken=0151ee32a-ad64-45a6-a153-c86aff4f5e61; dper=b2c6d5e2034c2abae310adab2004594fbefa99da8183294a9df2da90639ea0cf955b9cd7b4b84ff2dbce1b799ec44c8d671a35eaae9ffec36642947e930bf49a5424e4880315715ea0c2aee51c0ba850f547ad046acf78b7142d64e6bab72319; dplet=4731b96715688d19463cb7d344f97171; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1611284182; _lxsdk_s=1772803b3fb-22f-c42-633%7C%7C411',
    # The original value began with "MMozilla", a typo for the standard
    # "Mozilla/5.0" product token.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def get_font_dict():
    """
    Fetch the svgmtsi stylesheet and map each CSS class to its offsets.

    Every ``selector{...}`` rule in the downloaded CSS becomes one entry:
    the key is the selector with all ``.`` characters removed, the value is
    the list of non-zero integers found in the rule body. A rule that yields
    a single number is padded with a leading 0 so that callers can read the
    value as an ``[a, b]`` pair.

    :return: dict mapping class name -> list of int offsets
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/9188e234976e64ada3d82aaf266d6f52.css'
    r = requests.get(url, headers=headers)
    # One match per CSS rule: everything up to and including the first "}".
    font_list = re.findall(r'.*?\{.*?\}', r.text)
    font_dict = {}
    for font in font_list:
        parts = font.split('{')
        class_font = parts[0].replace('.', '')
        # The pattern must be r'\d+': a bare 'd+' matches the letter "d" and
        # makes the int() conversion below raise ValueError.
        pianyi_list = re.findall(r'\d+', parts[1])
        # Zero values are discarded (presumably this drops the ".0" fraction
        # of values such as "14.0px" -- confirm against the stylesheet).
        num_list = [int(x) for x in pianyi_list if int(x) != 0]
        if len(num_list) == 1:
            num_list.insert(0, 0)
        font_dict[class_font] = num_list
    return font_dict


def get_font_place():
    """
    Fetch the SVG glyph sheet and index its text rows by ``y`` coordinate.

    Each ``<text>`` element of the downloaded SVG contributes one entry, so
    that a CSS offset can later be resolved to the row of characters it
    points into.

    :return: dict mapping the ``y`` attribute (as a string) of each
        ``<text>`` element to that element's text content
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/fa364e43fd811d8b108e34479d0cc4a0.svg'
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would put the dict into the query
    # string and send no custom headers at all.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    return {node['y']: node.text for node in soup.find_all('text')}


# Added to the CSS "b" offset to obtain the ``y`` key of the SVG row that
# holds the glyph (empirical value taken from the original author's notes).
_ROW_OFFSET = 23
# Width of one glyph; the CSS "a" offset divided by this is the column index.
_GLYPH_WIDTH = 14


def _decode_glyph(class_name, font_dict, place_dict):
    """Return the character a svgmtsi class stands for, or None if unknown."""
    doc_values = font_dict.get(class_name)
    if not doc_values or len(doc_values) < 2:
        return None
    text = place_dict.get(str(doc_values[1] + _ROW_OFFSET))
    if text is None:
        return None
    text_index = doc_values[0] // _GLYPH_WIDTH
    if text_index >= len(text):
        return None
    return text[text_index]


def main():
    """
    Decode the obfuscated review text in ``1.html`` and print each review.

    ``1.html`` is a saved Dianping detail page. For every empty
    ``<svgmtsi class="..."></svgmtsi>`` tag, the class is looked up in the
    CSS offsets ``[a, b]``: ``b + 23`` selects the SVG text row and
    ``a / 14`` (the glyph width) is the index of the character in that row.
    The tag is then replaced by that character. Tags that cannot be resolved
    are left untouched. Finally the review texts are extracted from the
    decoded HTML and printed.
    """
    font_dict = get_font_dict()
    place_dict = get_font_place()
    with open('1.html', 'r', encoding='utf8') as f:
        html = f.read()
    result = re.findall('(<svgmtsi class=".*?"></svgmtsi>)', html)
    # str.replace already substitutes every occurrence, so each distinct tag
    # only needs to be processed once (dict.fromkeys keeps first-seen order).
    for svgmtsi in dict.fromkeys(result):
        string = _decode_glyph(pq(svgmtsi).attr('class'), font_dict, place_dict)
        if string is None:
            continue
        html = html.replace(svgmtsi, string)
    h_doc = pq(html)
    for comment in h_doc('.reviews-items li').items():
        comment_text1 = comment('.review-truncated-words').text()
        comment_text2 = comment('.review-words.Hide').text()
        comment_text = comment_text1 + comment_text2
        # Strip the expand/collapse link labels and line breaks.
        comment_text = comment_text.replace('展开评价', '').replace('收起评价', '').replace('\n', '')
        if comment_text:
            print(comment_text)
            print('==========================================')


if __name__ == '__main__':
    main()

 

以上就是大众点评评论字体加密的解析方法。

这只是一个简单的 demo,没有完整的爬取代码,只有解析字体加密的部分。反正我解决不了 IP + Cookie 的反爬,好气哟!

原文地址:https://www.cnblogs.com/lqn404/p/14313769.html