Python3 | 爬取百度地图信息的代码:我更改了城市、关键词、页码等,完成了获取有关“筛网”店铺的信息。

参考一篇爬取百度地图信息的代码,我更改了城市、关键词、页码等,完成了获取有关“筛网”店铺的信息。

代码如下:

import requests
import re
import csv
import time


def BusinessFromBaiduDitu(citycode='287', key_word='筛网', pageno=0):
    """Fetch one Baidu Maps result page and save the matching phone contacts.

    Sends a search request for ``key_word`` in the city ``citycode``, cuts the
    response into one chunk per search hit, and for every hit whose phone
    field contains numbers starting with ``1`` (presumably mobile numbers)
    prints the row and writes ``(name, address, number)`` through the
    module-level ``writer`` (a ``csv.writer`` that must exist before calling).

    Args:
        citycode: Baidu Maps city code, as a string.
        key_word: Search keyword.
        pageno: Zero-based result page number.
    """
    parameter = {
        "newmap": "1",
        "reqflag": "pcmap",
        "biz": "1",
        "from": "webmap",
        "da_par": "direct",
        "pcevaname": "pc4.1",
        "qt": "con",
        "c": citycode,        # city code
        "wd": key_word,       # search keyword
        "wd2": "",
        "pn": pageno,         # page number
        "nn": pageno * 10,
        "db": "0",
        "sug": "0",
        "addr": "0",
        "da_src": "pcmappg.poi.page",
        "on_gel": "1",
        "src": "7",
        "gr": "3",
        "l": "12",
        "tn": "B_NORMAL_MAP",
        # "u_loc": "12621219.536556,2630747.285024",
        "ie": "utf-8",
        # "b": "(11845157.18,3047692.2;11922085.18,3073932.2)",  # looks like map-bounds coordinates; can be omitted
        "t": "1468896652886"}

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36(KHTML, like Gecko) Chrome/56.0.2924.87Safari/537.36'}

    url = 'http://map.baidu.com/'
    response = requests.get(url, params=parameter, headers=headers)
    # Turn the \uXXXX escape sequences in the response into real characters.
    text = response.text.encode('latin-1').decode('unicode_escape')

    # Split into one chunk per hit: everything between `address_norm":"[`
    # and the following `"ty":`.  The `[` must be escaped, otherwise the
    # pattern is an unterminated character set and re raises an error.
    records = re.findall(r'(?<=address_norm":"\[).+?(?="ty":)', text)

    for record in records:
        names = re.findall(r'(?<=,"name":").+?(?=")', record)
        # The chunk starts with the normalised address, up to the first quote.
        address_parts = re.findall(r'.+?(?=")', record)
        phones = re.findall(r'(?<="phone":").+?(?=")', record)
        if not (names and address_parts and phones):
            continue  # incomplete record: nothing usable to save
        if phones[0] == '",':
            continue  # empty phone field

        name = names[0]
        # Strip the "(code)|TYPE|...][" annotations between address parts,
        # then the trailing "(code)|TYPE|...]" annotation.
        address = re.sub(r'\(.+?\[', ' ', address_parts[0])
        address = re.sub(r'\(.+?\]', ' ', address)

        for number in phones[0].split(','):
            if number.startswith('1'):
                print(citycode+name+','+address+','+number)
                writer.writerow((name, address, number))
    print(citycode + '  ' + key_word + '  ' + str(pageno))

现在开始编写搜索“丝网”“筛网”(key_word)的代码来获取想要的数据,同时也要修改城市代码(citycode)。

# citynumlist is the list of Baidu Maps city codes to crawl.
# NOTE: the original post elided the codes between '35' and '370' with a
# placeholder that was not valid Python; add the remaining codes here before
# running a full crawl.
citynumlist = ['33', '34', '35',
               '370', '371', '372']
keywordlist = ['丝网', '筛网']

start = time.time()
num = 1

# Open the CSV file that stores the results; the with-block guarantees the
# file is flushed and closed even if a request fails midway.
with open(r'/Users/apple888/PycharmProjects/百度地图/Data/%s.csv' % 'CityData',
          'a+', newline='', encoding='utf-8') as csvFile:
    # BusinessFromBaiduDitu writes its rows through this module-level writer.
    writer = csv.writer(csvFile)
    # The file is opened in append mode, so only write the header when the
    # file is still empty; otherwise every run would add another header row.
    csvFile.seek(0, 2)  # 2 == SEEK_END
    if csvFile.tell() == 0:
        writer.writerow(('name', 'address', 'number'))

    for citycode in citynumlist:
        for kw in keywordlist:
            for page in range(10):
                BusinessFromBaiduDitu(citycode=citycode, key_word=kw, pageno=page)

                # Throttle the requests so Baidu does not block the crawler:
                # a base pause per request plus longer pauses at intervals.
                time.sleep(1)
                if num % 20 == 0:
                    time.sleep(2)
                if num % 100 == 0:
                    time.sleep(3)
                if num % 200 == 0:
                    time.sleep(7)
                num = num + 1

end = time.time()
lasttime = int((end-start))
print('耗时'+str(lasttime)+'s')

程序运行了大约三个小时,抓取了1085条有用信息。

python爬取上市公司办公地址

IDLE编辑器,python3.8版本。
import requests
from bs4 import BeautifulSoup
import re
import xlwt

def getHTMLText(url, code="utf-8"):
    """Download *url* and return its text, or "" when the request fails.

    Args:
        url: Address of the page to fetch.
        code: Kept for backward compatibility; it is not used, because the
            encoding is taken from the response's ``apparent_encoding``.

    Returns:
        The decoded page text, or an empty string on any request error
        (connection failure, timeout, or HTTP error status).
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps one stalled server from hanging the whole crawl.
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()  # raise on HTTP error status codes
        r.encoding = r.apparent_encoding  # decode with the detected encoding
        return r.text
    except requests.RequestException:
        return ""

def getStockList(lst, stockURL):
    """Append the stock codes found in the links of the listing page to *lst*.

    Args:
        lst: List that receives codes such as ``sh600000``; mutated in place.
        stockURL: URL of the page whose ``<a>`` tags are scanned.
    """
    html = getHTMLText(stockURL, "gb2312")     # raw HTML text only
    soup = BeautifulSoup(html, 'html.parser')  # parse the page
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not href:
            continue  # anchor without a link target
        # A code is "sh" or "sz" followed by six digits whose first digit is
        # 0, 3 or 6.  The class is written [036]: the former [0,3,6] also
        # accepted a literal comma in that position.
        match = re.search(r"[s][hz][036][0-9]{5}", href)
        if match:
            lst.append(match.group(0))

def getStockInfo(lst, stockURL, start=3500, end=3814):
    """Fetch the company page of each stock in a batch and write it to the sheet.

    Processes ``lst[start:end]`` and writes four cells per stock into the
    module-level ``sheet01`` worksheet (which must exist before calling).

    Args:
        lst: Stock codes such as ``sh600000``.
        stockURL: Base URL of the company-profile pages.
        start: Index of the first stock of the batch (default keeps the
            original hard-coded 3500).
        end: Index one past the last stock of the batch (default 3814).
    """
    batch = lst[start:end]
    count = 0
    for stock in batch:
        # Page name is "s" + the six-digit code + ".shtml".
        url = stockURL + 's' + stock[2:8] + ".shtml"
        html = getHTMLText(url)  # one stock per request
        soup = BeautifulSoup(html, 'html.parser')
        tds = soup.find_all('td', attrs={'class': ''})
        count = count + 1
        if len(tds) < 16:
            # Failed download or unexpected layout: skip this stock instead
            # of aborting the whole batch with an IndexError.
            print("skipped {}: expected at least 16 cells, got {}".format(stock, len(tds)))
            continue
        # NOTE(review): the first stock lands on sheet row index 2, so row
        # index 1 stays empty; kept as in the original.
        row = count + 1
        sheet01.write(row, 0, tds[0].string)
        sheet01.write(row, 1, tds[1].string)
        sheet01.write(row, 2, tds[15].string)
        sheet01.write(row, 3, tds[8].string)
        # NOTE(review): the original indentation was lost; both prints are
        # assumed to belong to the loop body.
        print(" 当前进度: {:.2f}%".format(count*100/len(batch)),end="")
        print(stock)
# Driver script: collect the stock codes, then write one batch of company
# data into an Excel workbook.
stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
stock_info_url = 'http://stockdata.stock.hexun.com/gszl/'
slist = []
inf = []
getStockList(slist, stock_list_url)
print(len(slist))

f = xlwt.Workbook(encoding='utf-8')
# getStockInfo writes its rows into this module-level worksheet.
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
# Header row.
sheet01.write(0, 0, "股票简称")
sheet01.write(0, 1, "股票代码")
sheet01.write(0, 2, "注册地址")
sheet01.write(0, 3, "所属地域")

getStockInfo(slist, stock_info_url)
# NOTE(review): "E:" with no path separator is a drive-relative path on
# Windows; a backslash may have been lost from the original post — confirm.
f.save(u'E:股票基本资料(3500-3814).xls')

因数据量较大,可分多次爬取。

二、在百度地图上标注。

首先在百度地图开放平台申请密钥AK。
vscode编辑器代码:

<!DOCTYPE html>
<html>

<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf8" />
    <meta name="viewport" content="initial-scale=1.0, user-scalable=no" />
    <!-- Local copies of SheetJS and jQuery used by the inline script -->
    <script src="e:\xlsx.full.min.js "></script>
    <script src="e:\jquery-3.5.0.min.js "></script>
    <title>批量地址</title>
    <style type="text/css">
        /* The "width" property names had been lost, leaving bare "100%;"
           declarations that the browser ignores. */
        body,
        html {
            width: 100%;
            height: 100%;
            margin: 0;
            font-family: "微软雅黑";
        }
        
        #l-map {
            height: 550px;
            width: 100%;
        }
        
        #r-result {
            width: 100%;
            font-size: 14px;
            line-height: 20px;
        }
    </style>
</head>

<body>
    <!-- Map container -->
    <div id="l-map"></div>
    <!-- Controls: start geocoding, pick the Excel file, result list -->
    <div id="r-result">
        <input type="button" value="批量地址解析" onclick="bdGEO()" />
        <input type="file" id="excel-file">
        <div id="result"></div>
    </div>
</body>

</html>
<script type="text/javascript" src="http://api.map.baidu.com/api?v=2.0&ak=申请到的密钥"></script>

<script type="text/javascript">
    // Shared state for the batch geocoding run.
    var index = 0,        // position in adds of the next address to geocode
        adds = [],        // address strings built from the uploaded workbook
        jianchengs = [];  // stock short names, parallel to adds
    // Baidu Maps API setup: map bound to the #l-map element plus a geocoder.
    var map = new BMap.Map("l-map");
    var myGeo = new BMap.Geocoder();
    map.centerAndZoom(new BMap.Point(116.402831, 39.914271), 13);
    map.enableScrollWheelZoom(true);
    // Bind a change handler to the file input: it fires as soon as an .xls
    // file is chosen, reads every sheet and queues the addresses and names.
    $('#excel-file').change(function(e) {
        var files = e.target.files;
        // Nothing was selected (e.g. the dialog was cancelled): without this
        // guard readAsBinaryString(undefined) would throw.
        if (!files || files.length === 0) {
            return;
        }
        var fileReader = new FileReader();
        fileReader.onload = function(ev) {
            var workbook;
            try {
                // Parse the whole workbook from the binary string.
                workbook = XLSX.read(ev.target.result, {
                    type: 'binary'
                });
            } catch (err) {
                console.log('文件类型不正确');
                return;
            }
            var persons = []; // rows collected from all sheets
            // Cell range of the sheet; can be used to check the header width.
            var fromTo = '';
            // Read every sheet of the workbook.
            for (var sheet in workbook.Sheets) {
                if (workbook.Sheets.hasOwnProperty(sheet)) {
                    fromTo = workbook.Sheets[sheet]['!ref'];
                    console.log(fromTo);
                    persons = persons.concat(XLSX.utils.sheet_to_json(workbook.Sheets[sheet]));

                    // break; // uncomment to read only the first sheet
                }
            }
            // Log the rows that were read.
            console.log(persons);
            // Index loop instead of for...in, which is meant for object keys;
            // push appends in place instead of copying the array per row.
            for (var i = 0; i < persons.length; i++) {
                adds.push(i + "," + persons[i].注册地址 + ",");
                jianchengs.push(persons[i].股票简称);
            }
            console.log(adds);
        };
        // Read the file as a binary string.
        fileReader.readAsBinaryString(files[0]);
    });
       // Geocode the address at the current index, then advance the index.
       function bdGEO() {
        // Stop once every queued address has been handled (or when nothing
        // has been uploaded yet) instead of geocoding `undefined`.
        if (index >= adds.length) {
            return;
        }
        var add = adds[index];
        console.log(add);
        geocodeSearch(add);
        index++;
    }

    // Geocode one address, mark it on the map, and schedule the next one.
    function geocodeSearch(add) {
        // Snapshot the position now: the getPoint callback runs later, after
        // the global index has already moved on.
        var current = index;
        var jiancheng = jianchengs[current];
        console.log(jiancheng);
        // The caller increments index after this function returns, so the
        // next entry is current + 1; only schedule it if it exists.
        if (current + 1 < adds.length) {
            setTimeout(window.bdGEO, 100);
        }
        myGeo.getPoint(add, function(point) {
            // point is falsy when the address could not be resolved.
            if (point) {
                document.getElementById("result").innerHTML += "longitude = " + point.lng + ", latitude =" + point.lat + "</br>";
                var address = new BMap.Point(point.lng, point.lat);
                addMarker(address, new BMap.Label(current + ":" + jiancheng, {
                    offset: new BMap.Size(20, -10)
                }));
            }
        }, "北京市");
    }
    // Custom helper: place a labelled marker on the map.
    function addMarker(point, label) {
        var marker = new BMap.Marker(point);
        map.addOverlay(marker);
        marker.setLabel(label);
        // Register the map click handler only once; the original re-added it
        // for every marker, which may fire showInfo repeatedly per click.
        if (!addMarker.clickBound) {
            map.addEventListener("click", showInfo);
            addMarker.clickBound = true;
        }
    }
    // Click handler: show the clicked point as "lng, lat" in an alert box.
    function showInfo(e) {
        var clicked = e.point;
        var message = clicked.lng + ", " + clicked.lat;
        alert(message);
    }
	
</script>

运行代码,open in default browser.如图:

原文地址:https://www.cnblogs.com/xinxihua/p/14390797.html