Python crawler: uploading data to Elasticsearch (including date fields)

Python crawler

1 Install the required libraries with pip (beautifulsoup4 and requests) and import them

from bs4 import BeautifulSoup
import requests

2 Build the request

Taking one site as an example. This particular site uses a POST request; the details depend on the target site.


      # request headers -- fill in the values for the target site
      headers = {'Host': 'xxxxx',
                 'Origin': 'xxxxx',
                 'Content-Type': 'xxxx',
                 'Referer': 'xxxxx',
                 'User-Agent': 'xxxxxxx',
                 'X-Requested-With': 'xxxxx'
                 }

      # POST form data
      content = {'m': 'xflist',
                 'city': 'wf',
                 'district': ''}

      url = 'xxxxx'

      res = requests.post(url, data=content, headers=headers)
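
Before moving on to parsing, it is worth confirming that the request actually succeeded. A small optional sketch (not part of the original post), continuing with the res object from above:

      # fail fast if the site rejected the request (raises on 4xx/5xx responses)
      res.raise_for_status()
      # guard against mojibake on Chinese pages by trusting the detected encoding
      res.encoding = res.apparent_encoding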

3 Parse the response


      html = res.text
      soup = BeautifulSoup(html, "html.parser")
      # grab the <ul> blocks that hold the listing records
      ul_list = soup.select('body > div.main > section.mBox.mb8.dtbk > div > ul')
      jsonArray = []
      for i in ul_list:
          lilist = i.find_all("li")
          for j in lilist:
              create_time = j.select("div.time")[0].getText()
              print(create_time)
              content = j.select("h4")[0].getText()
              print(content)
              record = {"createTime": create_time, "content": content}
              jsonArray.append(record)
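
Step 4 below imports a make_request1 helper from a fang module. The post never shows that module, but it presumably wraps the request and parsing code above and returns one assembled document. A rough, hypothetical sketch of such a helper (the structure is an assumption, not the original code):

      # fang.py -- hypothetical layout of the crawler module imported in step 4
      import requests
      from bs4 import BeautifulSoup

      URL = 'xxxxx'                                            # placeholder, as above
      HEADERS = {'User-Agent': 'xxxxxxx'}                      # trimmed placeholder headers
      PAYLOAD = {'m': 'xflist', 'city': 'wf', 'district': ''}


      def make_request1():
          """Fetch the listing page and return one document ready for es.index()."""
          res = requests.post(URL, data=PAYLOAD, headers=HEADERS)
          soup = BeautifulSoup(res.text, "html.parser")
          records = []
          for ul in soup.select('body > div.main > section.mBox.mb8.dtbk > div > ul'):
              for li in ul.find_all("li"):
                  records.append({
                      "createTime": li.select("div.time")[0].getText(),
                      "content": li.select("h4")[0].getText(),
                  })
          # the real helper presumably also fills in floor_area, building_area, etc.
          return {"record": records}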

A snippet of the data produced by the crawler, including the date fields, looks like this:


      json = {
               "floor_area": "57535 ㎡",
               "building_area": "250000 ㎡",
               "volume_rate": "3.78",
               "greening_rate": "30%",
               "parking_rate": "项目规划车位数量为1889个",
               "record": [
                        {
                            "createTime": "2018-11-20 11:03:33",
                            "content": "新瑞都"
                        },
                        {
                            "createTime": "2020-12-31",
                            "content": "3号楼"
                        },
                        {
                            "createTime": "2018-11-03",
                            "content": "3号楼"
                        }
               ]
      }

4 Import the elasticsearch library and make sure Elasticsearch is running

from elasticsearch import Elasticsearch
from fang import make_request1   # the crawler code above, wrapped in a module


es = Elasticsearch([{'host':'127.0.0.1','port':9200}])
json = make_request1()

es.index(index="house_test", doc_type="fang", body=json)

Running this raises an indexing error.

Although the crawler returns every value as a plain string, Elasticsearch's dynamic mapping recognizes the createTime field as a date when the document is uploaded, and values in a different date format then fail to parse.

So we define the field types explicitly when creating the Elasticsearch index.

5 Create the Elasticsearch index with an explicit mapping; the code is as follows


from elasticsearch import Elasticsearch
es = Elasticsearch('192.168.1.1:9200')

mappings = {
    "mappings": {
        "fang": {
            "properties": {
                "open_time": {
                    "type": "keyword",
                    "index": False
                },
                "volume_rate": {
                    "type": "keyword",
                    "index": False
                },
                "greening_rate": {
                    "type": "keyword",
                    "index": False
                },
                "parking_rate": {
                    "type": "keyword",
                    "index": False
                },
                "house_type": {
                    "type": "keyword",
                    "index": False
                },
                "property_company": {
                    "type": "keyword",
                    "index": False
                },
                # an object field can store nested JSON; access it as projectComment.content
                "projectComment": {
                    "type": "object",
                    "properties": {
                        "createTime": {"type": "keyword", "index": False},
                        "content": {"type": "keyword", "index": False},
                    }
                },
            }
        }
    }
}



res = es.indices.create(index='index_test', body=mappings)
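
To double-check that the mapping was applied, the index definition can be read back. A minimal sketch, assuming the same index_test index and client address as above:

      from elasticsearch import Elasticsearch

      es = Elasticsearch('192.168.1.1:9200')

      # read back the mapping that was just created and inspect the field types
      print(es.indices.get_mapping(index='index_test'))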

Insert the data in bulk

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('192.168.1.1:9200')

ACTIONS = []

json1 = {
    "floor_area": "57535 ㎡",
    "building_area": "250000 ㎡",
    "volume_rate": "3.78",
    "greening_rate": "30%",
    "parking_rate": "项目规划车位数量为1889个",
    "record": [
        {
            "createTime": "2018-11-20 11:03:33",
            "content": "新瑞都"
        },
        {
            "createTime": "2020-12-31",
            "content": "3号楼"
        },
        {
            "createTime": "2018-11-03",
            "content": "3号楼"
        }
    ]
}
json2 = {
    "floor_area": "354345 ㎡",
    "building_area": "234500 ㎡",
    "volume_rate": "453",
    "greening_rate": "43%",
    "parking_rate": "项目规划车位数量为1889个",
    "record": [
        {
            "createTime": "2018-11-20 11:03:33",
            "content": "新瑞都"
        },
        {
            "createTime": "2020-12-31",
            "content": "3号楼"
        },
        {
            "createTime": "2018-11-03",
            "content": "3号楼"
        }
    ]
}

ACTIONS.append(json1)
ACTIONS.append(json2)

res, _ = bulk(es, ACTIONS, index="index_test", doc_type="fang", raise_on_error=True)
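
As a quick sanity check (a small sketch, again assuming the index_test index), the document count can be read back after a refresh:

      from elasticsearch import Elasticsearch

      es = Elasticsearch('192.168.1.1:9200')

      # make the freshly indexed documents searchable, then count them
      es.indices.refresh(index="index_test")
      print(es.count(index="index_test"))   # expect a body like {'count': 2, ...}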


6 Query the index

Send a request through Postman (for example a GET to the index's _search endpoint) and the indexed documents come back in the hits of the response.
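
The same query can be run from Python. A minimal sketch using a match_all query against the index created above:

      from elasticsearch import Elasticsearch

      es = Elasticsearch('192.168.1.1:9200')

      # fetch every document in the index and print the stored source
      result = es.search(index="index_test", body={"query": {"match_all": {}}})
      for hit in result["hits"]["hits"]:
          print(hit["_source"])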

Original article: https://www.cnblogs.com/gloria-liu/p/9995642.html