# 去哪儿网酒店数据启动程序更新

import re
import json
from odps import ODPS
from threading import Thread
import threading
from urllib import parse
import datetime
from lxml import etree

import random 
import requests
import time

from models import *

# def write_txt(html_data):
#     f = open("a.txt", 'a+')
#     f.write(html_data)
#     f.write(" ")
#     f.close()

# Base URL of the hotel listing pages; save_hotel_url_to_redis appends the city
# pinyin slug and the query string to it.
domain_hotel = "https://hotel.qunar.com/cn/"
# Endpoint queried by save_city_list; the city slug is appended as the value of
# the trailing `city=` parameter.
district_url = "https://hotel.qunar.com/napi/seo?path=%2Fseo%2Fnav&city="
# NOTE(review): the functions visible in this file do not read this module-level
# name; get_hotel_comment defines its own local copy of the same URL.
hotel_comment = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1" # URL for fetching hotel review data; the hotelSeq value has to be replaced
# Example of a complete hotel listing URL:
#domain_hotel = "https://hotel.qunar.com/cn/sanya/?fromDate=2020-08-03&toDate=2020-08-04&cityName=%E4%B8%89%E4%BA%9A"

# Fetch the city data and store it, driven by the JSON city list saved beforehand
def _store_city_row_if_new(already_stored, district_name, district_spell,
                           city_name, city_spell):
    """Insert one List_City row unless ``already_stored`` is truthy.

    ``already_stored`` is the result of a ``List_City.select().where(...)``
    lookup for an equivalent row; a truthy value means the row exists already.
    """
    if already_stored:
        return
    catalogue = List_City()
    catalogue.district_name = district_name  # district name
    catalogue.district_spell = district_spell  # district pinyin / id
    catalogue.city_name = city_name  # city name
    catalogue.city_spell = city_spell  # city pinyin
    catalogue.create_time = datetime.datetime.now()  # crawl time
    catalogue.save(force_insert=True)


def save_city_list():
    """Populate List_City from ``cityList.json`` and the district endpoint.

    For every city entry in ``cityList.json`` the district endpoint
    (``district_url`` + city slug) is requested.  When the first element of
    the reply is the "<city>行政区酒店" group of type "city", one row per
    listed district is stored; otherwise a single row is stored that uses the
    city itself as its district.  Rows that already exist are left untouched.

    A city whose reply cannot be parsed or stored is reported on stdout and
    skipped, so one bad city does not abort the whole run.
    """
    # Read the list first so the file is not kept open during the HTTP calls.
    with open('cityList.json', 'r', encoding='utf8') as fp:
        json_data = json.load(fp)

    for data in json_data:
        for data_0 in data:
            for data_value in data_0['value']:
                district_url_0 = district_url + str(data_value['url'])
                response = requests.request("GET", district_url_0)
                try:
                    # Parsing is inside the guard: a non-JSON reply is treated
                    # like an empty one instead of crashing the run.
                    json_city = json.loads(response.text)
                    first_entry = json_city['data'][0]
                    has_districts = (
                        first_entry["name"] == data_value['name'] + "行政区酒店"
                        and first_entry['type'] == "city"
                    )
                    if has_districts:
                        for item in first_entry['list']:
                            # Keep the text before "酒店", then before the first space.
                            data_i = item["name"].split("酒店")[0].split(" ")[0]
                            _store_city_row_if_new(
                                List_City.select().where(List_City.district_spell == item['id']),
                                district_name=data_i,
                                district_spell=item['id'],
                                city_name=data_value['name'],
                                city_spell=data_value['url'],
                            )
                    else:
                        _store_city_row_if_new(
                            List_City.select().where(List_City.city_name == data_value['name']),
                            district_name=data_value['name'],
                            district_spell=data_value['url'],
                            city_name=data_value['name'],
                            city_spell=data_value['url'],
                        )
                except Exception:
                    # Deliberately best-effort per city, but no longer a bare
                    # ``except`` so Ctrl-C / SystemExit still stop the program.
                    print("非大陆数据或者城市酒店数据为空")
                    print(district_url_0)

# Build the hotel crawl tasks from the rows stored in List_City
def save_hotel_url_to_redis():
    """Push one hotel-list task per List_City row onto 'qunar.com:hotel_url'.

    Each task is the listing URL for today -> tomorrow with the percent-quoted
    city name, followed by a single space and the district name.
    """
    for row in List_City.select():
        name_of_city = row.city_name
        slug = row.city_spell
        district = row.district_name  # district name
        check_in = datetime.datetime.now().strftime('%Y-%m-%d')
        check_out = (datetime.datetime.now() + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        pieces = [
            domain_hotel, slug,
            '/?fromDate=', check_in,
            '&toDate=', check_out,
            '&cityName=', parse.quote(name_of_city),
            " ", str(district),
        ]
        r.lpush('qunar.com:hotel_url', "".join(pieces))  # crawl task for hotel data


# Build the ticket crawl URLs from the rows stored in List_City
def save_ticket_url_to_redis():
    """Push one ticket URL per List_City row onto 'qunar.com:ticket_url'.

    The percent-encoded sample district inside ``tickect_url`` is swapped for
    the percent-quoted district name of each row.
    """
    for row in List_City.select():
        district = row.district_name  # district name
        ticket_link = tickect_url.replace('%E5%A6%82%E7%9A%8B%E5%B8%82', str(parse.quote(district)))
        r.lpush('qunar.com:ticket_url', ticket_link)  # crawl URL for ticket data

# URL prefix of the vacation-package list pages. "%E5%8D%97%E9%80%9A" is the
# percent-encoded form of "南通"; save_vacation_url_to_redis appends a quoted
# city name plus the "_all?..." suffix.
domain_vacation = "https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_"
# Example of a complete URL:
#'https://dujia.qunar.com/pdqk/list_%E5%8D%97%E9%80%9A_%E8%8B%8F%E5%B7%9E_all?ti=3&tm=l01_all_search_newc'
# Build the vacation-package URLs from the rows stored in List_City
def save_vacation_url_to_redis():
    """Push one vacation list URL per List_City row onto 'qunar.com:vacation_url'."""
    for row in List_City.select():
        quoted_city = parse.quote(row.city_name)
        vacation_link = domain_vacation + quoted_city + '_all?ti=3&tm=l01_all_search_newc'
        r.lpush('qunar.com:vacation_url', vacation_link)  # URL of the vacation products

def _split_hotel_task(task):
    """Split a queued hotel task into (page_url, city_spell, city_name, district_name).

    Tasks are produced by save_hotel_url_to_redis in the form
    ``.../cn/<city_spell>/?fromDate=..&toDate=..&cityName=<quoted city> <district>``.

    Raises:
        ValueError: if the task does not have that shape.
    """
    page_url = task.split(" ")[0]
    # Stop at the first "/" or "?" so only the slug is captured, not the query.
    spell_match = re.search(r"/cn/([^/?]+)", page_url)
    # Searched in page_url so the " <district>" suffix is not part of the name.
    name_match = re.search(r"cityName=([^&]*)", page_url)
    # The city name is percent-quoted, so the first CJK run is the district.
    district_match = re.search(r"([\u2E80-\u9FFF]+)", task)
    if not (spell_match and name_match and district_match):
        raise ValueError(f"malformed hotel task: {task!r}")
    return page_url, spell_match.group(1), name_match.group(1), district_match.group(1)


def get_nodes_json():
    """Pop one hotel task from Redis and crawl every result page of it.

    The task supplies the city slug, the quoted city name and the district
    name.  These are substituted into the ``payload`` template and into the
    cookie of a copy of ``headers``.  The list API is then asked for the total
    hotel count, and ``process_response_data`` is called once per page of 20.

    Raises:
        LookupError: if the 'qunar.com:hotel_url' queue is empty (previously
            this surfaced as a TypeError from ``re.search(None)``).
        ValueError: if the popped task is malformed.
    """
    page_size = 20

    task = r.lpop('qunar.com:hotel_url')
    if task is None:
        raise LookupError("qunar.com:hotel_url queue is empty")
    if isinstance(task, bytes):
        # Redis clients return bytes unless configured to decode responses.
        task = task.decode("utf-8")
    url, city_spell, city_name, district_name = _split_hotel_task(task)

    today = datetime.datetime.now()
    from_date = today.strftime('%Y-%m-%d')
    to_date = (today + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    # Fill the placeholders of the JSON payload template.
    payload_data = payload
    payload_data = payload_data.replace('"cityUrl":" "', '"cityUrl":"' + city_spell + '"')
    payload_data = payload_data.replace('"大兴区"', '"' + district_name + '"')  # district name
    payload_data = payload_data.replace('"fromDate":" "', '"fromDate":"' + from_date + '"')
    payload_data = payload_data.replace('"toDate":" "', '"toDate":"' + to_date + '"')

    # Work on a copy: mutating the shared module-level dict would leak one
    # task's cookie/referer into every later call and every other thread.
    headers_data = dict(headers)
    cookie_data = headers_data['cookie']
    # NOTE(review): these insert the value right after "<key>="; confirm this
    # matches the placeholder used in the cookie template of `headers`.
    cookie_data = cookie_data.replace("cityUrl=", "cityUrl=" + city_spell)
    cookie_data = cookie_data.replace("cityName=", "cityName=" + city_name)
    cookie_data = cookie_data.replace("checkInDate=", "checkInDate=" + from_date)
    cookie_data = cookie_data.replace("checkOutDate=", "checkOutDate=" + to_date)
    headers_data['cookie'] = cookie_data
    headers_data['referer'] = url

    # Ask for the total number of hotels; retry until the API answers code 0.
    hotel_number = 0
    while True:
        response = requests.request(
            "POST", url_hotel_api, headers=headers_data, data=payload_data.encode("utf-8")
        )
        if response.status_code == 200:
            try:
                json_data = json.loads(response.text)
            except ValueError:
                json_data = None  # e.g. an HTML error page instead of JSON
            if json_data and json_data['bstatus']['code'] == 0:
                hotel_number = json_data['data']['tcount']
                break
        # Pause between attempts instead of hammering the API in a tight loop.
        time.sleep(random.randint(1, 3))

    print(hotel_number)
    remaining = hotel_number
    start_num = 0
    before_num = 0
    while remaining > 0:
        print(before_num, start_num, remaining, "before_num", "start_num", "hotel_number")
        # Move the paging offset from the previous page to the current one.
        payload_data = payload_data.replace(
            '"start":' + str(before_num), '"start":' + str(start_num)
        )
        process_response_data(
            headers_data, payload_data.encode("utf-8"), remaining, district_name
        )
        before_num = start_num
        start_num += page_size
        remaining -= page_size

# Fetch the detailed review counts of one hotel
def get_hotel_comment(hotel_id):
    """Return ``[negativeCount, neutralCount, positiveCount]`` for a hotel.

    The review-list endpoint is requested with ``hotel_id`` as ``hotelSeq``
    and the three counters are read from ``data.ratingStat`` of the reply.
    """
    template = "https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=nantong_5058&page=1"
    response = requests.request("GET", template.replace("nantong_5058", hotel_id))
    rating_stat = json.loads(response.text)["data"]["ratingStat"]
    return [rating_stat[key] for key in ("negativeCount", "neutralCount", "positiveCount")]
    

def _save_hotel_row(data_hotel, hotel_city, district_name):
    """Store one hotel entry of the list API reply as a Hotel_data row."""
    hotel_data = Hotel_data()
    hotel_data.hotel_district = district_name
    hotel_data.hotel_city = hotel_city
    hotel_data.hotel_name = data_hotel['name']
    hotel_data.hotel_level = data_hotel['dangciText']
    hotel_data.hotel_score = data_hotel['score']
    hotel_data.hotel_price = float(data_hotel['price'])
    hotel_data.hotel_commentCount = data_hotel['commentCount']

    # One extra request per hotel for the negative/neutral/positive split.
    negativeCount, neutralCount, positiveCount = get_hotel_comment(data_hotel["seqNo"])
    hotel_data.hotel_negativeCount = negativeCount
    hotel_data.hotel_neutralCount = neutralCount
    hotel_data.hotel_positiveCount = positiveCount

    hotel_data.hotel_Number = data_hotel['phoneNumber']
    hotel_data.hotel_LocationInfo = data_hotel['locationInfo']
    hotel_data.hotel_image = data_hotel["imageid"]
    hotel_data.create_time = datetime.datetime.now()  # crawl time

    hotel_data.save(force_insert=True)


def process_response_data(headers_data, payload_data, hotel_number, district_name):  # handle the response of one result page
    """Request one page of the hotel list API and store every hotel on it.

    Args:
        headers_data: request headers (cookie and referer already filled in).
        payload_data: encoded request body selecting the page.
        hotel_number: hotels still to be fetched, this page included.
        district_name: district name stored with every row.

    The request is repeated until a complete page arrives.  A page is
    complete when it holds ``min(hotel_number, 20)`` hotels; a short page
    triggers a 120 s pause and a retry.  Status -1000 is retried after 3 s.
    Any other status counts as a failed attempt, and after 18 of them the
    counter is reset following a 120 s pause.  A non-200 or non-JSON reply is
    retried after a short random pause.
    """
    page_size = 20
    max_attempts = 20
    backoff_seconds = 120
    # Also covers hotel_number == 20, which the former "> 20" / "< 20" pair
    # of checks left unverified.
    expected_count = min(hotel_number, page_size)

    connect_times = max_attempts  # remaining attempts before the long pause
    while True:
        response = requests.request("POST", url_hotel_api, headers=headers_data, data=payload_data)

        json_data = None
        if response.status_code == 200:
            try:
                json_data = json.loads(response.text)
            except ValueError:
                pass  # not JSON (e.g. an HTML block page); handled below
        if json_data is None:
            print("网页请求错误")
            # Pause before retrying instead of spinning on a failing endpoint.
            time.sleep(random.randint(1, 3))
            continue

        status = json_data['bstatus']['code']
        if status == 0:
            time.sleep(random.randint(0, 2))  # random pause between pages
            hotel_city = json_data['data']['cityName']  # city the hotels belong to
            hotels = json_data['data']['hotels']
            print(len(hotels), "hotels的数量")
            if len(hotels) != expected_count:
                connect_times = max_attempts
                print(f"当前hotel_number={hotel_number}")
                print("获取残缺数据,数据不完整,跳出此处获取,重新抓取")
                print("休眠120s")
                time.sleep(backoff_seconds)
                continue
            for data_hotel in hotels:
                _save_hotel_row(data_hotel, hotel_city, district_name)
            return

        if status == -1000:
            print("搜索条件修改")
            time.sleep(3)
            continue

        print(f"第{max_attempts - connect_times + 1}次尝试连接")
        connect_times -= 1
        if max_attempts - connect_times + 1 > 18:
            connect_times = max_attempts
            # The message reports the pause that is actually applied; it used
            # to say 900s while sleeping 120s.
            print(f"连接次数达到上线,休眠{backoff_seconds}s")
            time.sleep(backoff_seconds)

class parse_qunar_url_Thread(Thread):
    """Worker thread that keeps processing queued hotel tasks."""

    def run(self):
        """Call get_nodes_json() over and over; each call handles one task."""
        while True:
            get_nodes_json()

if __name__ == "__main__":
    # Prepare the tables, then seed the city list and the three Redis queues.
    create_tables()
    save_city_list()
    save_hotel_url_to_redis()
    save_vacation_url_to_redis()
    save_ticket_url_to_redis()
    # Start 100 worker threads that consume the hotel queue.
    for _ in range(100):
        parse_qunar_url_Thread().start()
# 两年大概看此博客blog.codingnow.com/aee/
# 原文地址:https://www.cnblogs.com/dog-and-cat/p/13536708.html