# Reading .csv and .xlsx files

def gen_file_data(fodir, fname, sheet_index=0, ):
    """Load a .xlsx or .csv file into a list of rows.

    Args:
        fodir: Directory that contains the file.
        fname: File name. The reader is picked by whether the name contains
            '.xlsx' or '.csv' (checked in that order).
        sheet_index: Zero-based worksheet index; only used for .xlsx files.

    Returns:
        A list of rows, each row being a list of strings. For .xlsx files
        every cell value is converted with str(); for .csv files the fields
        are returned as parsed by csv.reader.

    Raises:
        ValueError: If the name contains neither '.xlsx' nor '.csv'.
    """
    # os.path.join instead of a hard-coded backslash keeps the path valid on
    # every operating system.
    fname_open = os.path.join(fodir, fname)
    if '.xlsx' in fname:
        book = xlrd.open_workbook(fname_open, on_demand=True)
        try:
            sheet = book.sheet_by_index(sheet_index)
            return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
        finally:
            # Release the workbook even when reading a sheet fails.
            book.release_resources()
    if '.csv' in fname:
        # newline='' lets the csv module handle line endings itself, which
        # matters for quoted fields that contain line breaks.
        with open(fname_open, 'r', encoding='utf-8', newline='') as csvfile:
            return list(csv.reader(csvfile, delimiter=','))
    # Previously this fell through to `return data` with `data` unbound.
    raise ValueError('unsupported file type: %s' % fname)

  

import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import math
import csv

# Directory that contains this script; the SQLite file below is looked up here.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent of that directory, appended to the import search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Upper bound on a key's `today_used` counter for it to still be handed out.
MAX_USED_TIMES = 1900
overrun_str = '天配额超限,限制访问'
# Sentinel returned by db_get_one_effective() when no usable key is left.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'
# HHMMSS wall-clock value at which db_recovery_bdkeynum() resets the counters.
next_day_tag = '000003'

# os.path.join instead of a hard-coded backslash so the path is valid on every OS.
db = os.path.join(curPath, 'py_bdspider_status.db')


def db_chk_one_exist(key):
    """Report whether *key* is already stored in ``baidu_map_key_used``.

    Args:
        key: Value compared against the table's ``key`` column.

    Returns:
        1 if a matching row exists, otherwise 0.
    """
    conn = sqlite3.connect(db)
    try:
        # Bound parameter: the value is never spliced into the SQL text, so
        # quotes in the key cannot break or alter the statement.
        row = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key = ?', (key,)).fetchone()
    finally:
        # The previous code wrote `conn.close` without parentheses, so the
        # connection was never actually closed.
        conn.close()
    return 0 if row is None else 1


# def db_init_key_table():
#     conn = sqlite3.connect(db)
#     c = conn.cursor()
#     k_file = '%s\%s' % (curPath, 'bdmap_key.txt')
#     with open(k_file, 'r', encoding='utf-8') as pf:
#         for i in pf:
#             if len(i) < 4:
#                 continue
#             author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
#             r = db_chk_one_exist(key)
#             if r == 0:
#                 localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
#                 sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
#                     author, key, localtime_, 0)
#                 c.execute(sql)
#                 conn.commit()
#     conn.close()
#     pf.close()
#
#
# db_init_key_table()



def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter at the daily reset time.

    The reset runs only when the local wall-clock time, formatted as HHMMSS,
    equals ``next_day_tag``; at any other moment the call does nothing.

    Returns:
        None.
    """
    # Read the clock once so the check and the stored timestamp agree.
    now = time.localtime()
    if time.strftime("%H%M%S", now) != next_day_tag:
        # NOTE(review): this is an exact-second match, so a call that lands a
        # second early or late skips the reset — confirm that is intended.
        return
    localtime_ = time.strftime("%y%m%d%H%M%S", now)
    conn = sqlite3.connect(db)
    try:
        # The timestamp is bound as a parameter instead of being interpolated
        # unquoted into the statement.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0, update_time = ?',
            (localtime_,))
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
    return


def db_get_one_effective():
    """Pick the least-used key that is still within the daily limit.

    Calls db_recovery_bdkeynum() first, then selects from
    ``baidu_map_key_used`` the rows whose ``today_used`` is at most
    ``MAX_USED_TIMES``, ordered by ascending usage, and takes the first one.

    Returns:
        The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
    """
    db_recovery_bdkeynum()
    connection = sqlite3.connect(db)
    query = 'SELECT key FROM baidu_map_key_used WHERE today_used<=%s ORDER BY today_used ASC' % (MAX_USED_TIMES)
    first_row = connection.cursor().execute(query).fetchone()
    connection.close()
    return DB_KEY_EXHAUST if first_row is None else first_row[0]


def db_update_one_today_used(key):
    """Increment the ``today_used`` counter of *key* and stamp the update time.

    Args:
        key: Value matched against the ``key`` column of
            ``baidu_map_key_used``.

    Returns:
        None.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # Both values are bound as parameters rather than interpolated into
        # the SQL text, so quotes in the key cannot alter the statement.
        conn.execute(
            'UPDATE baidu_map_key_used '
            'SET today_used = today_used + 1, update_time = ? WHERE key = ?',
            (localtime_, key))
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()


# Sub-folder names under curPath.
dir_ = 'baidu_map_uid'
dir_exception = 'baidu_map_uid_exception'
# Request names that already have a result file; filled by gen_requested_file_list().
requested_file_list = []
# Joining with a trailing '' yields the directory path ending in a separator,
# which the old '%s\%s\' format intended. That form ended the literal with an
# escaped quote and did not parse.
requested_file_dir_str = os.path.join(curPath, dir_, '')
requested_file_dir_exception_str = os.path.join(curPath, dir_exception, '')
# Listing happens at import time, so the result folder must already exist.
requested_file_dir = os.listdir(requested_file_dir_str)


def gen_requested_file_list(file_postfix='.html'):
    """Record the request names that already have a saved result file.

    Lists the ``dir_`` folder under ``curPath``. For every entry, the part of
    the file name before the first '&' and before *file_postfix* is taken as
    the request name and appended to the module-level
    ``requested_file_list`` unless it is already there.

    Args:
        file_postfix: Suffix cut off the file name (everything from its first
            occurrence onward is dropped).

    Returns:
        None; ``requested_file_list`` is extended in place.
    """
    # Work from the directory entry itself. The old code concatenated folder
    # and entry and then split on the folder name, which gives the wrong
    # result whenever that name also occurs elsewhere in the path.
    seen = set(requested_file_list)
    for entry in os.listdir(os.path.join(curPath, dir_)):
        requested_file = entry.split('&')[0].split(file_postfix)[0]
        if requested_file not in seen:
            seen.add(requested_file)
            requested_file_list.append(requested_file)


def gen_file_data(fodir, fname, sheet_index=0, ):
    """Load a .xlsx or .csv file into a list of rows.

    Args:
        fodir: Directory that contains the file.
        fname: File name. The reader is picked by whether the name contains
            '.xlsx' or '.csv' (checked in that order).
        sheet_index: Zero-based worksheet index; only used for .xlsx files.

    Returns:
        A list of rows, each row being a list of strings. For .xlsx files
        every cell value is converted with str(); for .csv files the fields
        are returned as parsed by csv.reader.

    Raises:
        ValueError: If the name contains neither '.xlsx' nor '.csv'.
    """
    # os.path.join instead of a hard-coded backslash keeps the path valid on
    # every operating system.
    fname_open = os.path.join(fodir, fname)
    if '.xlsx' in fname:
        book = xlrd.open_workbook(fname_open, on_demand=True)
        try:
            sheet = book.sheet_by_index(sheet_index)
            return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
        finally:
            # Release the workbook even when reading a sheet fails.
            book.release_resources()
    if '.csv' in fname:
        # newline='' lets the csv module handle line endings itself, which
        # matters for quoted fields that contain line breaks.
        with open(fname_open, 'r', encoding='utf-8', newline='') as csvfile:
            return list(csv.reader(csvfile, delimiter=','))
    # Previously this fell through to `return data` with `data` unbound.
    raise ValueError('unsupported file type: %s' % fname)


# 3 9  (NOTE(review): presumably the lengths of the two lists below — confirm)
# Nested mapping filled by the row loop further down: city -> district -> lists.
request_dic = {}
target_type_list = ['北京市', '上海市', '广州市']
target_type_except_list = [
    '火车站', '高铁站', '汽车站', '飞机场', '小学',
    '幼儿园', '中学',
    '综合医院', '商场',
]
# Alternative value lists, currently disabled:
# ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
# ['住宅小区','写字楼']

# Disabled: fill requested_file_list from the result files already on disk.
# file_postfix_l = ['.html', '.txt']
# for i in file_postfix_l:
#     gen_requested_file_list(i)

# Source table, read from the script's own directory.
fname_source = 'jfinder_public_jmtool_old_data.csv'
data_file = gen_file_data(curPath, fname_source)


def replace_illeagl_tag(str_):
    """Return *str_* with every space, newline and tab character removed.

    Args:
        str_: Text to clean.

    Returns:
        The cleaned string.
    """
    # Newline and tab are written as escape sequences. The raw control
    # characters that were inside the literals split the list across two
    # lines and made the function a syntax error.
    for ch in (' ', '\n', '\t'):
        str_ = str_.replace(ch, '')
    return str_


# Example names marked "ok" in the original notes:
#   碧海富通城三期(3栋)
#   =碧海富通城-三期(3栋)
# Characters removed from a name before it is used as a query. Tab and newline
# are written as escapes (the raw control characters broke the literal).
# NOTE(review): the source listed '?' twice and replaced '(' with '(', which
# only makes sense if full-width forms were flattened to ASCII somewhere
# upstream; the full-width question mark and parentheses are reconstructed
# here — confirm against the original script.
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '？', '·', '.']


def gen_bd_query_origin_name(name_):
    """Normalise a place name for use as a suggestion-API query.

    Every character listed in ``replace_to_empty_l`` is removed, then
    full-width parentheses are converted to their ASCII counterparts.

    Args:
        name_: Raw name taken from the source data.

    Returns:
        The cleaned name.
    """
    for i in replace_to_empty_l:
        name_ = name_.replace(i, '')
    return name_.replace('（', '(').replace('）', ')')


# Group the source rows by city and district. For each district we collect the
# distinct cleaned names to query, the distinct non-empty uids, and the raw rows.
_requested_names = set(requested_file_list)  # O(1) lookups inside the loop
for row in data_file:
    if len(row) != 12:
        # csv.reader yields [] for a blank line; unpacking such a row used to
        # raise ValueError and abort the whole script.
        print('skip malformed row', row)
        continue
    dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = row

    # The header row carries the column names themselves.
    if db_from == 'db_from':
        continue
    request_name = gen_bd_query_origin_name(name_)
    input_ = '%s%s%s' % (city, district, request_name)
    if input_ in _requested_names:
        print('requested', input_)
        continue
    bucket = request_dic.setdefault(city, {}).setdefault(district, {
        'request_name_list': [],
        'request_uid_list': [],
        'file_row_list': [],
    })
    if request_name not in bucket['request_name_list']:
        bucket['request_name_list'].append(request_name)
    uid = uid.replace(' ', '')
    if len(uid) > 0 and uid not in bucket['request_uid_list']:
        bucket['request_uid_list'].append(uid)
    bucket['file_row_list'].append(row)
# The raw table is no longer needed once it has been grouped.
del data_file

# URL template; fun_() substitutes the R-QUERY, R-CITY and R-AK placeholders.
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
# Substrings that mark a response as a failure; write_res_file() does not save
# a response containing any of them.
# NOTE(review): the commas in the Chinese messages are ASCII here; if the
# service answers with full-width commas these markers will never match —
# confirm against a real response.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']

# Result folder path ending in a separator. The old '%s\%s\' format ended the
# literal with an escaped quote and did not parse.
write_res_file_dir = os.path.join(curPath, dir_, '')


def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
    """Save a response body to disk unless it contains a known error marker.

    Args:
        str_: Response text.
        input_: Request identifier; used as the file name stem.
        ak: API key the request was made with (only printed on failure).
        dir_: Prefix put in front of the file name.
        file_postfix: Suffix appended to the file name.

    Returns:
        None. When any entry of ``ex_l`` occurs in *str_*, the first such
        entry is reported on stdout and nothing is written.
    """
    matched = next((marker for marker in ex_l if marker in str_), None)
    if matched is not None:
        print('EXCEPTION-', matched, 'AK-', ak, 'STR-', str_)

        return
    target_path = '%s%s%s' % (dir_, input_, file_postfix)
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` when it runs.

    ``args`` is handed to ``func`` as one positional argument; it is not
    unpacked.
    """

    def __init__(self, func, args, name):
        super().__init__()
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        """Invoke the stored callable with the stored argument."""
        callback, payload = self.func, self.args
        callback(payload)


def fun_(city):
    """Query the suggestion API for every pending name of one city.

    For each district of *city* in ``request_dic`` and each name in its
    ``request_name_list``, a key is fetched with db_get_one_effective(), the
    URL is built from ``base_url``, the response is fetched, the key's usage
    counter is incremented and the response is passed to write_res_file().

    Args:
        city: Key into ``request_dic``.

    Returns:
        None. Stops as soon as no usable key is left.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]['request_name_list']:
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # No key is usable any more. The old `break` only left the
                # inner loop, so every remaining district hit the database
                # again just to find the same result.
                return
            url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
            print(url_)
            input_ = '%s%s%s' % (city, district, request_name)

            try:
                # Without a timeout a stalled connection blocks this worker
                # forever. NOTE(review): 30 s is an arbitrary choice.
                bd_res_json_str = requests.get(url_, timeout=30).text
            except requests.RequestException as exc:
                # One failed request must not kill the whole worker thread;
                # report it and move on to the next name.
                print('EXCEPTION-', exc, 'AK-', ak, 'INPUT-', input_)
                continue
            db_update_one_today_used(ak)
            write_res_file(bd_res_json_str, input_, ak)


# Optional command-line arguments: first and last position (1-based,
# inclusive) in the sorted city list that main() should process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    # Missing or non-numeric arguments: fall back to the default range.
    start_loop, stop_loop = -1, 200


def main():
    """Start one worker thread per selected city and wait for all of them.

    Cities are the keys of ``request_dic`` in sorted order, numbered from 1;
    only those whose number lies within ``start_loop``..``stop_loop``
    (inclusive) get a thread.
    """
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        # MyThread passes its `args` to the callable as one argument, so the
        # city string is handed over as-is.
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # The old `t.setDaemon = False` replaced the method with a bool
        # instead of setting the flag.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


# Start the worker threads only when this file is executed as a script.
if __name__ == '__main__':
    main()

  

# Original article: https://www.cnblogs.com/rsapaper/p/7498129.html