4. 12306火车购票系统 (数据制作)

使用环境

文件目录结构

文件目录结构

数据表结构 models.py

from django.db import models
from django.contrib.auth.models import AbstractUser
import time
"""
学习到的知识:
1) 一个表需要多次关联同一个表时,需要用 related_name 给每个外键指定不同的反向关联名字(反向过滤查询用的名字可以用 related_query_name 单独指定)
2) 索引的使用:db_index=True

"""


# Create your models here.

# 用户表
class UserInfo(AbstractUser):
    """Site user: Django's ``AbstractUser`` extended with passenger details."""

    gender = models.CharField(verbose_name='性别', max_length=32)  # gender
    phone = models.CharField(verbose_name='手机号', max_length=32)  # mobile phone number
    name = models.CharField(verbose_name='姓名', max_length=32)  # real name
    ID_number = models.CharField(verbose_name='身份证号', max_length=32)  # ID card number

    class Meta:
        verbose_name_plural = '用户表'

    def __str__(self):
        # Rendered as "<pk>----<username>".
        return '{}----{}'.format(self.pk, self.username)




# 车站表
class Station(models.Model):
    """A railway station and the city it belongs to."""

    # Plain (non-auto) integer primary key: whoever creates rows supplies the id.
    id = models.IntegerField(primary_key=True)
    station_name = models.CharField('车站名称', max_length=32)  # station name
    english = models.CharField('英文编码', max_length=32, db_index=True)  # English code (indexed)
    spell = models.CharField('拼音', max_length=32)  # pinyin spelling
    spell_brief = models.CharField('拼音简', max_length=32)  # abbreviated pinyin
    # on_delete is mandatory from Django 2.0 on; CASCADE is the value older
    # versions applied implicitly, so existing behaviour is unchanged.
    city = models.ForeignKey(verbose_name='关联城市', to='City', db_index=True,
                             on_delete=models.CASCADE)  # owning city

    def __str__(self):
        return str(self.station_name)

    class Meta:
        verbose_name_plural = '车站表'


# 城市
class City(models.Model):
    """A city that one or more stations belong to."""

    city_name = models.CharField(verbose_name='城市列表', max_length=32)  # city name

    class Meta:
        verbose_name_plural = '城市'

    def __str__(self):
        return '{}'.format(self.city_name)

# 列车表
class Train(models.Model):
    """A train service: its number, terminal stations and the stations it calls at."""

    train_size = models.CharField('列车号', max_length=32, db_index=True)  # train number (indexed)
    train_coding = models.CharField('列车编码', max_length=32)  # train code
    # Two foreign keys point at the same model, so each needs its own
    # related_name (the name of the reverse accessor on Station); otherwise
    # the two reverse accessors would clash.
    # on_delete is mandatory from Django 2.0 on; CASCADE is the value older
    # versions applied implicitly, so existing behaviour is unchanged.
    start_stand = models.ForeignKey(verbose_name='起始站', to='Station', related_name='related_start_stand',
                                    on_delete=models.CASCADE)  # departure terminal
    terminus = models.ForeignKey(verbose_name='终点站', to='Station', related_name='related_terminus',
                                 on_delete=models.CASCADE)  # arrival terminal
    depart = models.CharField('始发时间', max_length=32)  # departure time at the first station
    arrive = models.CharField('到达时间', max_length=32)  # arrival time at the last station
    # NOTE(review): the default is the int 7 on a CharField; left untouched.
    coach_num = models.CharField('车厢数', max_length=32, default=7)  # number of coaches
    # Stations served, with per-stop details stored on the Station2Train through model.
    station = models.ManyToManyField(verbose_name='关联列车进站时间表', to='Station', through='Station2Train',
                                     through_fields=('train', 'station'))

    def __str__(self):
        return str(self.train_size)

    class Meta:
        verbose_name_plural = '列车表'

# 列车进站时间表 车站——列车多对多表
class Station2Train(models.Model):
    """Through table of the Train <-> Station relation: one row per stop of a train."""

    # on_delete is mandatory from Django 2.0 on; CASCADE is the value older
    # versions applied implicitly, so existing behaviour is unchanged.
    station = models.ForeignKey(verbose_name='关联车站表', to='Station', on_delete=models.CASCADE)  # the station
    train = models.ForeignKey(verbose_name='关联列车表', to='Train', on_delete=models.CASCADE)  # the train
    station_next = models.CharField('站次(这趟车第几次经过)', max_length=32)  # stop sequence number
    arrive_time = models.CharField('到达时间', max_length=32)  # arrival time
    depart_time = models.CharField('出发时间', max_length=32)  # departure time
    distance = models.CharField('和上一站的距离', max_length=32)  # distance from the previous stop
    is_state = models.CharField('是起终停', max_length=32)  # first / last / intermediate stop marker

    def __str__(self):
        # NOTE(review): the text starts with the literal word "station", not the
        # station's name -- possibly unintended; kept as-is to preserve output.
        return 'station的到达时间:{}  出发时间{}'.format(self.arrive_time, self.depart_time)

    class Meta:
        # A train may reference a given station only once.
        unique_together = ("station", "train")
        verbose_name_plural = '列车进站时间表'



# 车座表
class Seat(models.Model):
    """A single seat on a train, with its class and sale status."""

    # Seat classes: (stored integer, display label).
    choices = (
        (1, '商务座'),
        (2, '一等座'),
        (3, '二等座'),
        (4, '高级软卧'),
        (5, '高级硬卧'),
        (6, '硬座'),
        (7, '无座')
    )
    # on_delete is mandatory from Django 2.0 on; CASCADE is the value older
    # versions applied implicitly, so existing behaviour is unchanged.
    train = models.ForeignKey(verbose_name='关联列车表', to='Train', null=True, db_index=True,
                              on_delete=models.CASCADE)  # owning train
    coach_size = models.CharField('车厢号', max_length=32)  # coach number
    seat_type = models.IntegerField('座位类型', choices=choices)  # seat class, see ``choices``
    seat_size = models.CharField('座位号', max_length=32)  # seat number
    is_sell = models.CharField('出售情况', max_length=64, null=True, db_index=True)  # sale status (indexed)

    def __str__(self):
        return str(self.seat_type)

    class Meta:
        verbose_name_plural = '车座表'



# 邮箱验证码
class EmailVerifyRecord(models.Model):  # e-mail verification code
    """A verification code that was e-mailed for registration or password recovery."""

    code = models.CharField(verbose_name="验证码", max_length=20)
    email = models.EmailField(verbose_name="邮箱", max_length=50)
    send_type = models.CharField(
        max_length=10,
        choices=(
            ('register', "注册"),
            ('forget', "找回密码"),
        ),
    )
    # auto_now: refreshed on every save of the record.
    send_time = models.DateTimeField(auto_now=True)

    class Meta:
        verbose_name = "邮箱验证码"
        verbose_name_plural = verbose_name



# 车票表
class Ticket(models.Model):
    """A purchased ticket: train, seat, buyer, price and journey details."""

    ticket = models.CharField('车票号', max_length=32, primary_key=True)  # ticket number (primary key)
    # on_delete is mandatory from Django 2.0 on; CASCADE is the value older
    # versions applied implicitly, so existing behaviour is unchanged.
    train_size = models.ForeignKey(verbose_name='关联列车号', to='Train', on_delete=models.CASCADE)  # the train
    coach_size = models.CharField('车厢号', max_length=32)  # coach number
    seat_size = models.CharField('座位号', max_length=32)  # seat number
    user = models.ForeignKey(verbose_name='关联用户表', to='UserInfo', on_delete=models.CASCADE)  # the buyer
    price = models.CharField('价格', max_length=32)  # price
    pay_type = models.CharField('支付方式', max_length=32)  # payment method
    depart_time = models.CharField('出发时间', max_length=32)  # departure time
    arrive_time = models.CharField('到达时间', max_length=32)  # arrival time
    depart_stand = models.CharField('出发站', max_length=32)  # departure station
    arrive_stand = models.CharField('到达站', max_length=32)  # arrival station
    buy_time = models.CharField('购买时间', max_length=32)  # purchase time
    is_quit = models.CharField('是否退票', max_length=32)  # whether the ticket was refunded

    def __str__(self):
        return str(self.ticket)

    class Meta:
        verbose_name_plural = '车票表'


# 爬虫IP表
class IP(models.Model):
    """A proxy address used by the crawler."""

    ip = models.CharField(verbose_name='IP地址', max_length=32)  # IP address
    port = models.CharField(verbose_name='端口号', max_length=32)  # port
    expire_time = models.CharField(verbose_name='过期时间', max_length=32)  # expiry time
    city = models.CharField(verbose_name='地区', max_length=32)  # region

    class Meta:
        verbose_name_plural = '爬虫IP表'

    def __str__(self):
        # Rendered as "<ip>:<port>".
        return '{}:{}'.format(self.ip, self.port)

爬取所需代理ip ip.py

(爬取可能失效 2019-7-4,自己可以先学一下 requests 库)

import requests

from app01 import models


class Get_IP():
    """Fetch a proxy from the Zhima proxy API and keep it in the ``IP`` table as row id=1."""

    # Seconds to wait for the proxy API; without a timeout requests can block forever.
    REQUEST_TIMEOUT = 10

    def select_ip(self):
        """Return the stored proxy row (``models.IP`` with id=1), or ``None`` if there is none."""
        return models.IP.objects.filter(id=1).first()

    def zhimaruanjian(self, url=None):
        """Request one new proxy and store it as ``IP`` row id=1.

        Uses the Zhima proxy service, http://webapi.http.zhimacangku.com
        (endpoint ``/getip?``).

        :param url: full API URL; when empty, the built-in URL is used (its
            ``pack`` value is a placeholder that must be replaced).
        :return: ``True`` when the API answers with a non-zero ``code``;
            ``None`` after the new proxy has been stored.
        :raises requests.exceptions.RequestException: when the API cannot be
            reached, including when ``REQUEST_TIMEOUT`` expires.
        """
        if not url:
            url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&pack=自己的pack号&ts=1&ys=0&cs=1&lb=1&sb=0&pb=4&mr=1&regions='

        payload = requests.get(url, timeout=self.REQUEST_TIMEOUT).json()
        if payload['code'] != 0:
            return True

        proxy = payload['data'][0]
        ip = proxy['ip']
        port = proxy['port']
        print('获取到新ip %s' % (str(ip) + ':' + str(port)))
        # Always write to id=1, the only row select_ip() reads. A plain
        # insert would receive an auto-generated id that is not guaranteed
        # to be 1, leaving the new proxy invisible to select_ip().
        models.IP.objects.update_or_create(
            id=1,
            defaults={
                'ip': ip,
                'port': port,
                'expire_time': proxy['expire_time'],
                'city': proxy['city'],
            },
        )
if __name__ == '__main__':
    # Run directly: fetch one proxy and store it.
    Get_IP().zhimaruanjian()

数据的定制爬取

(截至2019-7-4,数据统计:爬取车站总数2863个,涉及城市1260个,车站停靠数86037个,自制座位数5244727条(列车数 X 每列车7节车厢 X 每车厢100座位))

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from django.core.wsgi import get_wsgi_application

# Point Django at the project settings (only if the variable is not already set).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cp12306.settings")

# Building the WSGI application initialises Django; this has to happen
# before the project's models are imported below.
application = get_wsgi_application()

from app01 import models
import xlrd
import json
import time
import requests
import random
from lxml import etree
from django.db.models import Count
from app01.myfile.ip import Get_IP
from concurrent.futures import ThreadPoolExecutor  # thread pool executor
# Rebinds the name: from here on ``Get_IP`` is an instance of the imported
# class, not the class itself.
Get_IP = Get_IP()
"""
用到的知识点:
1) bulk_create 批量添加数据
2) xlrd 操作Excel表
3)  list(set(city_name_list)) 列表 利用set的自动去重功能
4) 分组和聚合函数使用:
        train_list = models.Train.objects.annotate(a = Count('station2train__train')).values('train_size','a')

"""


# 添加城市
def city():
    """Load the distinct city names from the station workbook into the ``City`` table.

    Reads column index 5 of the first sheet of ``火车站信息表.xlsx``, removes
    duplicates and bulk-inserts one ``City`` per name with ids 1..n. Any
    error raised by the insert is reported as "already exists".
    """
    data = xlrd.open_workbook('火车站信息表.xlsx')  # open the Excel workbook
    city_name_list = data.sheets()[0].col_values(5)  # the city-name column
    # dict.fromkeys de-duplicates while keeping first-seen order, so the ids
    # assigned below are reproducible between runs (iterating a set is not).
    unique_names = dict.fromkeys(city_name_list)
    query_list = [
        models.City(id=city_id, city_name=city_name)
        for city_id, city_name in enumerate(unique_names, 1)
    ]
    try:
        models.City.objects.bulk_create(query_list)
    except Exception:
        print('城市列表已经存在!')
    else:
        # Report success only after the insert really succeeded.
        print('城市列表添加完成!')


# 添加车站
def station():
    """Load every row of the station workbook into the ``Station`` table.

    Each row of the first sheet of ``火车站信息表.xlsx`` becomes one
    ``Station`` whose id is the 1-based row number; column 5 (the city name)
    is resolved to a ``City`` id through the rows already in the database.
    Any error raised by the insert is reported as "already exists".
    """
    # Map city name -> city id.
    city_dic = {
        row.get('city_name'): row.get('id')
        for row in models.City.objects.all().values('id', 'city_name')
    }

    sheet = xlrd.open_workbook('火车站信息表.xlsx').sheets()[0]  # open the Excel workbook
    query_list = []
    for i in range(sheet.nrows):
        data_col = sheet.row_values(i)  # one spreadsheet row
        query_list.append(models.Station(
            id=i + 1,
            station_name=data_col[1],
            english=data_col[2],
            spell=data_col[3],
            spell_brief=data_col[4],
            city_id=city_dic.get(data_col[5]),
        ))

    try:
        models.Station.objects.bulk_create(query_list)
        print('车站列表添加完成!')
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops the script.
        print('车站列表已经存在!')

#
# # 添加座位类型
# def seat_type():
#     seat_list = ['商务座', '一等座', '二等座', '高级软卧', '高级硬卧', '硬座', '无座']
#     query_list = []
#     for id, seat_type in enumerate(seat_list, 1):
#         query_list.append(models.Seat_Type(id=id, seat_type=seat_type))
#
#     # 批量插入数据库之bulk_create()
#     try:
#         models.Seat_Type.objects.bulk_create(query_list)
#         print('座位类型添加完成!')
#     except:
#         print('座位类型已经存在!')

# 爬取列车数据
def _current_proxies():
    """Build the ``proxies`` mapping for ``requests`` from the stored proxy row."""
    ip_obj = Get_IP.select_ip()
    ip = ip_obj.ip + ":" + ip_obj.port
    proxies = {
        'http': ip,
        'https': ip
    }
    print(proxies)
    return proxies


def _fetch_train_page(url, headers):
    """Download ``url`` through the stored proxy.

    Returns the response, or ``None`` when a known failure (dead proxy, read
    timeout, redirect loop that retries could not get around) has been
    reported. Any other exception propagates to the caller.
    """
    def get_url():
        r = requests.get(url=url, headers=headers, proxies=_current_proxies(), timeout=10)
        print(r.status_code)
        return r

    try:
        return get_url()
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ProxyError):
        # A tuple is required here: ``except A or B`` only ever catches A.
        print('代理ip无效')
        Get_IP.zhimaruanjian()
    except requests.exceptions.ReadTimeout:
        print('读取超时')
    except requests.exceptions.TooManyRedirects:
        print('ip被限制')
        # Retry with a fresh proxy, and once more if that proxy is dead too.
        try:
            Get_IP.zhimaruanjian()
            return get_url()
        except requests.exceptions.ProxyError:
            try:
                Get_IP.zhimaruanjian()
                return get_url()
            except Exception:
                print('未知错误!')
    return None


def _parse_train_page(html, station_dic):
    """Extract the train summary and its stop list from a timetable page.

    Returns the result dict described in ``pa``, or ``False`` when the page
    does not have the expected structure or names an unknown station.
    """
    try:
        selector = etree.HTML(html)
        endpoints = selector.xpath("//dd[@class='line']/ul/li/a/text()")
        times = selector.xpath("//dd[@class='line']/ul/li/text()")
        func_dic = {
            'start_stand': station_dic[endpoints[0]],  # first station id
            'terminus': station_dic[endpoints[1]],  # last station id
            'depart': times[0],  # departure time
            'arrive': times[1].split('(')[0],  # arrival time
            'data': {},
        }
        rows = selector.xpath("//div[@class='list']/table//tr")
    except Exception:
        return False

    for each in rows:
        numbers = each.xpath("./td[5]/text()")
        if not numbers:
            # Rows without text in the fifth cell carry no stop data.
            continue
        numbers = numbers[0]
        try:
            station_next = each.xpath("./td[1]/text()")[0]  # stop sequence number
            station = station_dic[each.xpath("./td[2]/a/text()")[0]]  # station id
            arrive_time = each.xpath("./td[3]/text()")[0]  # arrival time
            depart_time = each.xpath("./td[4]/text()")[0]  # departure time
            distance = 0 if numbers == '--' else int(numbers)  # '--' counts as 0
        except (KeyError, IndexError, ValueError):
            # Unknown station name, missing cell or non-numeric distance:
            # give up on this train instead of aborting the whole crawl.
            return False
        if station == func_dic['terminus']:
            is_state = '终'  # last stop
        elif arrive_time == '--':
            is_state = '起'  # first stop (no arrival time)
        else:
            is_state = '暂'  # intermediate stop
        func_dic['data'][station_next] = [station, arrive_time, depart_time, distance, is_state]
    return func_dic


# Crawl the data of one train
def pa(station_dic, train_size):
    """Scrape the timetable of one train from checi.114piaowu.com.

    :param station_dic: mapping of station name -> station id.
    :param train_size: train number, appended to the site URL.
    :return: ``False`` on any download or parse failure, otherwise a dict
        ``{'start_stand': id, 'terminus': id, 'depart': str, 'arrive': str,
        'data': {stop_no: [station_id, arrive_time, depart_time, distance,
        is_state]}}``.
    """
    url = 'http://checi.114piaowu.com/{}'.format(train_size)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'CLIENT_SOURCE=baidu_www.baidu.com; CLIENT_FIRST_ENTER=pc_shike; tostation=%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82; UM_distinctid=16bb7da72431fe-07a5e239d1d8e6-37677e02-1aeaa0-16bb7da7244977; JSESSIONID=DF0894D3C3B6127C656BF6ADF714674E; fromstation=%E9%98%BF%E5%B0%94%E5%B1%B1; CLIENT_LAST_ENTER=pc_checi',
        'Host': 'checi.114piaowu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    try:
        r = _fetch_train_page(url, headers)
    except Exception:
        # Anything unforeseen (including no stored proxy row): get a new
        # proxy and report failure for this train.
        print('未知错误!')
        Get_IP.zhimaruanjian()
        return False
    if r is None:
        return False
    return _parse_train_page(r.text, station_dic)

# 保存车列表
def train_save(dic):
    """Store one scraped train together with all of its stops.

    :param dic: the dict produced by ``pa`` plus ``train_size`` and
        ``train_coding``; ``dic['data']`` maps the stop number to
        ``[station_id, arrive_time, depart_time, distance, is_state]``.

    The train row and its stop rows are written in one transaction, so a
    failure cannot leave a train behind with only part of its stops.
    """
    from django.db import transaction  # local import keeps this block self-contained

    train_size = dic.get('train_size')  # train number
    with transaction.atomic():
        train_obj = models.Train.objects.create(
            train_size=train_size,
            train_coding=dic.get('train_coding'),  # train code
            # The dict already holds station ids, so set the FK columns
            # directly instead of fetching each Station first.
            start_stand_id=dic.get('start_stand'),  # first station
            terminus_id=dic.get('terminus'),  # last station
            depart=dic.get('depart'),  # departure time
            arrive=dic.get('arrive'),  # arrival time
        )
        stops = [
            models.Station2Train(
                train=train_obj,
                station_id=val[0],
                station_next=key,  # stop sequence number
                arrive_time=val[1],
                depart_time=val[2],
                distance=val[3],  # distance value as scraped
                is_state=val[4],  # first / last / intermediate marker
            )
            for key, val in dic.get('data').items()
        ]
        models.Station2Train.objects.bulk_create(stops)
    print('{}次列车信息存入成功!'.format(train_size))



# 获取车表
def train(train_date='2019-07-16'):
    """Crawl and store every train listed in ``train_list.js`` for one date.

    :param train_date: key of the timetable to use inside ``train_list.js``.
    :raises ValueError: when the file has no entry for ``train_date``.

    Trains already in the database are skipped. Trains whose page cannot be
    scraped or saved are collected and printed at the end.
    """
    info = []  # train numbers that were already stored
    errors = []  # train numbers that could not be scraped or saved
    # Map station name -> station id.
    station_dic = dict(models.Station.objects.values_list('station_name', 'id'))

    # Load the train list and flatten it to {train number: train code}.
    with open('train_list.js', 'rb') as f:
        timetable = json.loads(f.read()).get(train_date)
    if timetable is None:
        raise ValueError('train_list.js has no entry for {}'.format(train_date))
    data_dic = {}
    for group in timetable.values():
        for entry in group:
            data_dic[entry.get('station_train_code').split('(')[0]] = entry.get('train_no')

    # One query up front instead of one existence query per train.
    existing = set(models.Train.objects.values_list('train_size', flat=True))

    for i, (train_size, train_coding) in enumerate(data_dic.items(), 1):
        print('正在爬第{}趟{}列车..'.format(i, train_size))
        if train_size in existing:
            print('正在爬第{}趟{}列车数据已存在!'.format(i, train_size))
            info.append(train_size)
            continue

        res_dic = pa(station_dic, train_size)
        if not res_dic:
            print('正在爬第{}趟{}列车数据报错!'.format(i, train_size))
            errors.append(train_size)
            continue

        res_dic['train_size'] = train_size
        res_dic['train_coding'] = train_coding
        try:
            train_save(res_dic)
        except Exception as exc:
            # Do not let one bad train abort the crawl; drop whatever part
            # of it was written so a later run retries it.
            print('第{}趟{}列车数据保存失败: {!r}'.format(i, train_size, exc))
            models.Train.objects.filter(train_size=train_size).delete()
            errors.append(train_size)

    print('已存在列车数据:', info)
    print('不存在列车数据:', errors)

# 添加座位表
def seat():
    """Generate the seat rows for every train: 7 coaches x 100 seats each.

    Seat ids are assigned sequentially across all trains, starting at 1. The
    coach number doubles as the seat type. ``is_sell`` is a string of '1'
    characters, one per stop of the train. Any insert error for a train is
    reported as "already exists".
    """
    # Map train number -> number of stops.
    stop_counts = dict(
        models.Train.objects
        .annotate(a=Count('station2train__train'))
        .values_list('train_size', 'a')
    )
    seat_id = 0  # running primary key (named to avoid shadowing builtin ``id``)
    for index, train in enumerate(models.Train.objects.all(), 1):
        train_size = train.train_size
        stops = stop_counts.get(train_size)
        print(index, train_size, stops)
        sell = '1' * stops
        print(sell)
        query_list = []
        for coach_size in range(1, 8):  # 7 coaches
            for seat_size in range(1, 101):  # 100 seats per coach
                seat_id += 1
                query_list.append(models.Seat(
                    id=seat_id,
                    train=train,
                    coach_size=coach_size,
                    seat_type=coach_size,
                    seat_size=seat_size,
                    is_sell=sell,
                ))
        try:
            models.Seat.objects.bulk_create(query_list)
            print('座位列表添加成功{}条!'.format(seat_id))
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still stops the run.
            print('座位列表{}已经存在!'.format(seat_id))

def get_ip():
    """Return every ``ip`` value stored in the ``ip`` table of the proxy database.

    Connection settings can be overridden with the environment variables
    ``IP_DB_HOST``, ``IP_DB_USER``, ``IP_DB_PASSWORD``, ``IP_DB_NAME`` and
    ``IP_DB_PORT``; the previous hard-coded values remain the defaults.

    :return: list of the ``ip`` column values.
    """
    import pymysql

    # SECURITY NOTE(review): the defaults below embed real-looking root
    # credentials in source code. They are kept only for backward
    # compatibility -- rotate them and supply the values via the environment.
    connect = pymysql.connect(
        # Keyword arguments: newer PyMySQL releases reject positional ones.
        host=os.environ.get('IP_DB_HOST', '106.75.31.89'),
        user=os.environ.get('IP_DB_USER', 'root'),
        password=os.environ.get('IP_DB_PASSWORD', 'Aa428912'),
        database=os.environ.get('IP_DB_NAME', 'Ip_conn'),
        port=int(os.environ.get('IP_DB_PORT', 3306)),
        charset='utf8',
    )
    try:
        cursor = connect.cursor(pymysql.cursors.DictCursor)  # rows come back as dicts
        cursor.execute('select ip from ip')
        data_list = [row['ip'] for row in cursor.fetchall()]
    finally:
        # Always release the connection, even when the query fails.
        connect.close()
    print(data_list)
    print(len(data_list))

    return data_list

# 爬取距离
def pa_distance(station_name, station_name_1, ip):
    """Look up the distance between two stations on juli.liecheshike.com.

    :param station_name: name of the first station.
    :param station_name_1: name of the second station.
    :param ip: proxy address as ``host:port``, used for the request.
    :return: the text of the page's first ``<h3>`` up to '公里', as a string.
    :raises IndexError: when the page contains no ``<h3>`` text.
    """
    print(ip)
    proxies = {
        "http": "http://{}".format(ip),
        "https": "http://{}".format(ip),
    }

    url = 'http://juli.liecheshike.com/从{}到{}有多远'.format(station_name, station_name_1)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'safedog-flow-item=9C74D71A66F8B17A458732499BAEC7FF; ASPSESSIONIDCADCQTDB=JAMEOIKBLEPLECFDNHNCADNH; __51cke__=; ASPSESSIONIDCABBRQAC=KAGHCJNAIICFAABHNEOKPLIA; __tins__1516098=%7B%22sid%22%3A%201562545736853%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201562547696493%7D; __51laig__=14',
        'Host': 'juli.liecheshike.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # The proxy mapping belongs on the request; it used to be handed to
    # str.format(), where it was silently ignored.
    r = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
    selector = etree.HTML(r.text)
    res = selector.xpath("//h3/text()")
    return res[0].split('公里')[0]


# 距离
def get_distance(data, start_train_id=45):
    """Crawl and store the cumulative distance for every stop of every train.

    For each train, its stops are walked in insertion (pk) order: the first
    stop gets distance '0', and every later stop gets the running total of
    the distances between consecutive stations as returned by ``pa_distance``.

    :param data: list of proxy addresses; one is picked at random per request.
    :param start_train_id: only trains whose id is at least this value are
        processed. NOTE(review): 45 is the value that used to be hard-coded,
        presumably a resume point -- pass 1 to process every train.
    :return: None
    """
    def get(train_id):
        # Explicit pk ordering: the running total depends on visiting the
        # stops in the order they were stored.
        stops = (models.Station2Train.objects
                 .filter(train_id=train_id)
                 .order_by('pk')
                 .values_list('station__station_name', 'pk'))
        previous_name = ""
        distances = 0
        print('——————第{}站——————'.format(train_id))
        for index, (name, row_pk) in enumerate(stops, 1):
            print('第', train_id, '的', index, '个')
            # Rows are addressed by their own pk rather than by matching the
            # loop index against the station_next text column.
            row = models.Station2Train.objects.filter(pk=row_pk)
            if index > 1:
                distance = pa_distance(previous_name, name, ip=random.choice(data))
                distances += int(distance)
                print(distances)
                row.update(distance=distances)
            else:
                row.update(distance='0')
            previous_name = name

    started = time.time()
    sumber = models.Train.objects.all().count()
    # Use the ids that actually exist instead of assuming they run 1..count.
    train_ids = list(
        models.Train.objects
        .filter(pk__gte=start_train_id)
        .order_by('pk')
        .values_list('pk', flat=True)
    )
    # The with-block waits for all submitted work before continuing, so the
    # elapsed time printed below covers the actual crawl.
    with ThreadPoolExecutor(max_workers=1) as pool:
        jobs = [(train_id, pool.submit(get, train_id)) for train_id in train_ids]
        for train_id, job in jobs:
            # An exception inside a worker is stored on its future; report it
            # here, otherwise it would vanish without a trace.
            exc = job.exception()
            if exc is not None:
                print('第{}趟列车距离爬取失败: {!r}'.format(train_id, exc))

    print(sumber)
    print(time.time() - started)
if __name__ == '__main__':
    # Get a working proxy before any crawling starts.
    Get_IP.zhimaruanjian()

    # Load steps, in dependency order:
    # cities -> stations -> trains (crawled) -> seats.
    for step in (city, station, train, seat):
        step()

    # Finally crawl the distances, using the proxy list from the proxy database.
    data = get_ip()
    get_distance(data)


额外文件 (博客园无法上传大文件,给个外链接)

文件列表:
	火车站信息表.xlsx
  train_list.js
下载地址:
		小强云盘分享链接:http://www.liqianglog.top:8002/home/share_link/K6X8028O08 提取密码为:353C 点击分享快去分享给好友啵~~

(如果失效,请联系博主,1206709430@qq.com)

原文地址:https://www.cnblogs.com/liqianglog/p/11134687.html