# Qunar.com (去哪儿网) ticket data

import datetime
import json
import logging
import random
import re
import threading
import time
from threading import Thread
from urllib import parse

import requests
from lxml import etree
from odps import ODPS

from models import *

def write_txt(html_data):
    """Append ``html_data`` followed by a single space to ``a.txt``.

    The file is opened in the current working directory and created if it
    does not exist yet.

    Args:
        html_data: Text to append.
    """
    # ``with`` closes the handle even when a write fails; UTF-8 is pinned so
    # non-ASCII text is written the same way on every platform.
    with open("a.txt", 'a+', encoding="utf-8") as f:
        f.write(html_data)
        f.write(" ")

def get_nodes_json():
    """Pop one ticket URL from the queue and scrape every result page for it.

    One entry is taken from the ``qunar.com:ticket_url`` list of ``r``. The
    city segment found in that URL is substituted into the ``vacation_url``
    template, the first response is used to read the total number of
    results (``data.list.numFound``), and ``process_response_data`` is then
    called once per page of 60 results.

    Returns immediately when the queue yields nothing or when no city name
    can be extracted from the URL. The initial request is repeated, with a
    short pause, until the server answers HTTP 200 with a JSON ``status``
    of 0.
    """
    log = logging.getLogger(__name__)
    retry_delay = 1  # seconds to wait between failed attempts
    request_timeout = 30  # seconds before a hanging request is abandoned

    url = r.lpop('qunar.com:ticket_url')
    if not url:
        return
    if isinstance(url, bytes):
        # The queue client may hand back raw bytes; the regex needs text.
        url = url.decode('utf-8')

    # The city is the segment that follows the URL-encoded "南通_" prefix.
    # NOTE(review): the trailing "?" makes the last "l" optional; a literal
    # "\?" was probably intended - confirm against real queue entries.
    city_match = re.search(r"%E5%8D%97%E9%80%9A_(.*)_all?", url)
    if city_match is None:
        log.warning("no city name found in %s", url)
        return
    city_name = city_match.group(1)

    # A separate local name is used on purpose: assigning to ``vacation_url``
    # inside this function would turn it into a local variable and make the
    # read below fail with UnboundLocalError.
    search_url = vacation_url.replace("%E4%B8%8A%E6%B5%B7", city_name)
    payload_data = vacation_payload

    # Copy the shared header template so adding the referer does not leak
    # into other threads that use the same template.
    headers_data = dict(vacation_headers)
    headers_data['referer'] = url

    while True:
        json_data = None
        try:
            response = requests.request(
                "GET", search_url, headers=headers_data,
                data=payload_data, timeout=request_timeout)
        except requests.RequestException:
            log.exception("request failed for %s", search_url)
        else:
            if response.status_code == 200:
                try:
                    # Parse as JSON; the body is untrusted network data and
                    # must never be passed to eval().
                    json_data = json.loads(response.text)
                except ValueError:
                    log.warning("non-JSON response from %s", search_url)
        if isinstance(json_data, dict) and json_data.get('status') == 0:
            vacation_number = json_data['data']['list']['numFound']
            break
        time.sleep(retry_delay)

    # Walk the result set in pages of 60 by rewriting the "&lm=<start>%2C60"
    # query parameter of the search URL.
    for start_num in range(0, vacation_number, 60):
        page_url, replaced = re.subn(
            r"&lm=\d+%2C60", "&lm=" + str(start_num) + "%2C60", search_url)
        process_response_data(headers_data, payload_data, page_url)
        if not replaced:
            # Without a paging parameter every request would return the
            # same page, so one fetch is all that makes sense.
            log.warning("no paging parameter in %s", search_url)
            break

def process_response_data(headers_data, payload_data, vacation_url):
    """Fetch one result page and store every vacation product listed on it.

    The request is repeated, with a short pause, until the server answers
    HTTP 200 with a JSON body whose ``status`` field is 0. Each entry of
    ``data.list.results`` is saved as a ``Vacation_Product`` row, and its
    supplier URL is pushed onto the ``qunar.com:store_url`` list of ``r``.
    An entry that cannot be stored is logged and skipped.

    Args:
        headers_data: HTTP headers sent with the request.
        payload_data: Request body handed through to ``requests``.
        vacation_url: URL of the result page to fetch.
    """
    log = logging.getLogger(__name__)
    retry_delay = 1  # seconds to wait between failed attempts
    request_timeout = 30  # seconds before a hanging request is abandoned

    while True:
        json_data = None
        try:
            response = requests.request(
                "GET", vacation_url, headers=headers_data,
                data=payload_data, timeout=request_timeout)
        except requests.RequestException:
            log.exception("request failed for %s", vacation_url)
        else:
            if response.status_code == 200:
                try:
                    json_data = json.loads(response.text)
                except ValueError:
                    log.warning("non-JSON response from %s", vacation_url)
        if isinstance(json_data, dict) and json_data.get('status') == 0:
            break
        time.sleep(retry_delay)

    try:
        product_city = json_data['data']['qdata']['realQuery']  # destination city
    except (KeyError, TypeError):
        # The field is optional here; keep the placeholder value.
        product_city = "NULL"

    try:
        results = json_data['data']['list']['results']
    except (KeyError, TypeError):
        log.warning("no result list in response from %s", vacation_url)
        return

    for data in results:
        try:
            details = data['details']
            supplier = data['summary']['supplier']

            vacation_data = Vacation_Product()
            # Round-trip through UTF-16 so surrogate pairs in the title
            # become regular code points.
            vacation_data.product_title = data['title'].encode('utf-16','surrogatepass').decode('utf-16')  # product name
            vacation_data.product_city = product_city
            vacation_data.product_price = data['accuratePrice']  # price
            vacation_data.product_score = data['productScore']  # rating
            vacation_data.product_reviews = data['reviews']  # number of reviews
            vacation_data.product_soldCount = data['soldCount']  # units sold
            vacation_data.tripTime = details['tripTime']  # trip duration
            vacation_data.hotel_night = details['hotelNight']  # hotel nights
            vacation_data.traffic_tool = details['traffic']  # means of transport
            vacation_data.supplier_name = supplier['name']  # supplier name
            # Only the text before the first dot of the supplier URL is stored.
            vacation_data.supplier_url = supplier['url'].split('.')[0]
            vacation_data.create_time = datetime.datetime.now()  # crawl time
            # Queue the supplier's full URL.
            r.lpush('qunar.com:store_url', "https:" + supplier['url'])
            vacation_data.save(force_insert=True)
        except Exception:
            # One bad record must not stop the rest of the page, but it
            # should not disappear silently either.
            log.exception("skipping product from %s that could not be stored",
                          vacation_url)

class parse_qunar_url_Thread(Thread):
    """Worker thread that calls ``get_nodes_json`` over and over."""

    def run(self):
        """Process queued URLs forever; this method never returns normally."""
        while True:
            # Each call handles at most one queued URL, including saving
            # the records scraped from it.
            get_nodes_json()


if __name__ == "__main__":
    # Launch ten scraper threads; each one runs get_nodes_json in a loop.
    worker_count = 10
    for _ in range(worker_count):
        parse_qunar_url_Thread().start()
    
# See also this blog (followed for about two years): blog.codingnow.com/aee/
# Original article: https://www.cnblogs.com/dog-and-cat/p/13536698.html