# New-home listing details (新房详情)
import json
import logging
import re
from datetime import datetime
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Coordinate pair embedded in the map iframe's page source.
COORD_PATTERN = re.compile(r'"mapx":"(.*?)","mapy":"(.*?)"', re.DOTALL)


def _extract_house_coord(page_source):
    """Return "<mapy>,<mapx>" parsed from *page_source*, or None if absent."""
    match = COORD_PATTERN.search(page_source)
    if match is None:
        return None
    # Stored as mapy first, then mapx, matching the original output order.
    return match.group(2) + "," + match.group(1)


def _collect_listing_urls(driver):
    """Return the href of every listing anchor on the currently loaded page."""
    anchors = driver.find_elements(
        By.XPATH, '//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
    return [anchor.get_attribute('href') for anchor in anchors]


def _scrape_details(driver, data):
    """Print the detail-page attributes and store the map coordinate in *data*.

    Navigates away from the current page twice (detail page, then the map
    iframe's source page). Sets ``data['housecoord']`` only when the
    coordinate pair is found in the map page source.

    Raises:
        WebDriverException: if an element is missing or navigation fails.
    """
    detail_link = driver.find_element(
        By.XPATH,
        "//*[@class='main_1200 tf']//div[@class='cxfnav']"
        "//a[contains(text(),'楼盘详情')]")
    driver.get(detail_link.get_attribute('href'))

    main_items = driver.find_elements(
        By.XPATH,
        '//div[@class="main_1200 tf"]//div[@class="main_1200"]'
        '//div[@class="main-cont clearfix"]//div[@class="main-left"]'
        '//div[@class="main-item"]')
    for item in main_items:
        # The leading ".//" makes the XPath relative to the current element.
        for node in item.find_elements(By.XPATH, './/ul//li'):
            print(node.text)
        print('-' * 50)

    # Location and surroundings: the map lives in an iframe, so load its
    # source URL directly and read the coordinates out of the page source.
    map_src = driver.find_element(
        By.XPATH, '//div[@class="mapbox_dt"]/iframe').get_attribute("src")
    driver.get(map_src)
    coord = _extract_house_coord(driver.page_source)
    if coord is None:
        logging.warning("no mapx/mapy coordinates found at %s", map_src)
    else:
        data['housecoord'] = coord


option = webdriver.ChromeOptions()
# Suppress Chrome's automation banner and noisy driver logging.
option.add_experimental_option(
    "excludeSwitches", ['enable-automation', 'enable-logging'])
# NOTE(review): this path looks like it lost its backslashes
# (presumably D:\chrome_driver_win32\chromedriver.exe) - confirm before running.
b = webdriver.Chrome(
    executable_path="D:chrome_driver_win32chromedriver.exe", options=option)

try:
    num = 1
    base_urls = "https://nanjing.newhouse.fang.com/house/s/b9{}/".format(num)
    b.get(base_urls)
    house_lst = _collect_listing_urls(b)

    data_list = []
    for url in house_lst:
        b.get(url)
        data = {}
        # First-level district; the slice drops the trailing two characters.
        quyu = b.find_element(
            By.XPATH,
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text
        data['subarea'] = quyu[:-2]
        # Current city.
        data['area'] = b.find_element(
            By.XPATH, '//div[@class="s2"]/div/a').text

        try:
            _scrape_details(b, data)
        except WebDriverException as exc:
            # Best effort: keep the partial record, but report the failure
            # instead of hiding it.
            logging.warning("detail scrape failed for %s: %s", url, exc)

        data_list.append(data)
        # NOTE(review): only the first listing is processed; remove this
        # break to scrape every listing on the page.
        break

    print(data_list)
    with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
        for data in data_list:
            json.dump(data, f, ensure_ascii=False)
            # JSON Lines needs one record per newline-terminated line.
            f.write('\n')
finally:
    # Always shut the browser down, even when scraping fails midway.
    b.quit()