爬虫实战

爬取拉勾网职位信息

参考博客：https://www.cnblogs.com/wsmrzx/p/10993640.html

#https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false
import requests
#实际要爬取的url
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}
#原始的url
urls ='https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
#建立session
s = requests.Session()
# 获取搜索页的cookies
s.get(urls, headers=header, timeout=3)
# 为此次获取的cookies
cookie = s.cookies
# 获取此次文本
response = s.post(url, data=payload, headers=header, cookies=cookie, timeout=5).text
print(response)

爬取红楼梦小说

#http://www.shicimingju.com/book/hongloumeng.html

import requests

from bs4 import BeautifulSoup
ret=requests.get('https://www.shicimingju.com/book/hongloumeng.html')
# print(ret.text)

soup=BeautifulSoup(ret.text,'lxml')
li_list=soup.find(class_='book-mulu').find('ul').find_all('li')
with open('hlm.txt','w',encoding='utf-8') as f:
    for li in li_list:
        content=li.find('a').text
        url='https://www.shicimingju.com'+li.find('a').get('href')

        f.write(content)
        f.write('
')
        res_content=requests.get(url)
        soup2=BeautifulSoup(res_content.text,'lxml')
        content_detail=soup2.find(class_='chapter_content').text
        f.write(content_detail)
        f.write('
')
        print(content,'写入了')

爬取肯德基门店

# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
data = {
    'cname': '',
    'pid': 20,
    'keyword': '浦东',
    'pageIndex': 1,
    'pageSize': 10
}
ret = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword', data=data, headers=header)
print(ret.json())

爬取糗事百科段子

#https://www.qiushibaike.com/text/page/2/
import requests
from bs4 import BeautifulSoup
ret=requests.get('https://www.qiushibaike.com/text/page/2/')
# print(ret.text)

soup=BeautifulSoup(ret.text,'html.parser')

article_list=soup.find_all(class_='article')
# print(article_list)
for article in article_list:
    content=article.find(class_='content').text
    print(content)
    print('-------')

爬取京东商品信息

from selenium import webdriver
import time
# 模拟键盘输入
from selenium.webdriver.common.keys import Keys
bro=webdriver.Chrome(executable_path='./chromedriver.exe')
# 设置隐士等待
bro.implicitly_wait(10)

def get_goods_info(bro):
    # li_list=bro.find_element_by_class_name('gl-warp').find_elements_by_tag_name('li')
    # goods=bro.find_elements_by_class_name('gl-item')
    goods = bro.find_elements_by_css_selector('.gl-item')
    # print(len(goods))
    for good in goods:
        try:
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            commits = good.find_element_by_css_selector('.p-commit strong>a').text
            photo_url = good.find_element_by_css_selector('.p-img img').get_attribute('src')

            print('''
            商品名字：%s
            商品价格：%s
            商品地址：%s
            商品评论数：%s
            商品图片地址：%s
    
            ''' % (name, price, url, commits, photo_url))
        except Exception as e:
            continue

    next_button = bro.find_element_by_partial_link_text('下一页')
    time.sleep(1)
    next_button.click()

    get_goods_info(bro)

try:
    bro.get('https://www.jd.com/')

    input_k=bro.find_element_by_id('key')

    input_k.send_keys('奶牛')
    # 模拟键盘的回车键
    input_k.send_keys(Keys.ENTER)
    get_goods_info(bro)


except Exception as e:
    print(e)

finally:
    bro.close()

自动登录12306

from selenium import webdriver
import time
#pillow
from PIL import Image

# 引入超级鹰

from chaojiying import Chaojiying_Client


from selenium.webdriver import ActionChains
bro=webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    bro.maximize_window()  # 窗口最大化，全屏
    button_z=bro.find_element_by_css_selector('.login-hd-account a')
    button_z.click()
    time.sleep(2)
    # 截取整个屏幕
    bro.save_screenshot('./main.png')
    # 验证码的位置和大小
    img_t=bro.find_element_by_id('J-loginImg')
    print(img_t.size)
    print(img_t.location)

    size=img_t.size
    location=img_t.location

    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # # 抠出验证码
    # #打开
    img = Image.open('./main.png')
    # 抠图
    fram = img.crop(img_tu)
    # 截出来的小图
    fram.save('code.png')

    # 调用超级鹰破解
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')    #用户中心>>软件ID 生成一个替换 96001
    im = open('code.png', 'rb').read()                                                    #本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
    # print(chaojiying.PostPic(im, 9004))

    ## 返回结果如果有多个 260,133|123，233,处理这种格式[[260,133],[123,233]]
    res=chaojiying.PostPic(im, 9004)
    print(res)
    result=res['pic_str']

    all_list = []
    if '|' in result:
        list_1 = result.split('|')
        count_1 = len(list_1)
        for i in range(count_1):
            xy_list = []
            x = int(list_1[i].split(',')[0])
            y = int(list_1[i].split(',')[1])
            xy_list.append(x)
            xy_list.append(y)
            all_list.append(xy_list)
    else:
        x = int(result.split(',')[0])
        y = int(result.split(',')[1])
        xy_list = []
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
    print(all_list)
    # 用动作链，点击图片
    # [[260,133],[123,233]]
    for a in all_list:
        x = a[0]
        y = a[1]
        ActionChains(bro).move_to_element_with_offset(img_t, x, y).click().perform()
        time.sleep(1)

    username=bro.find_element_by_id('J-userName')
    username.send_keys('306334678')
    password=bro.find_element_by_id('J-password')
    password.send_keys('lqz12345')
    time.sleep(3)
    submit_login=bro.find_element_by_id('J-login')
    submit_login.click()
    time.sleep(3)

    print(bro.get_cookies())
    time.sleep(10)
    bro.get('https://www.12306.cn/index/')
    time.sleep(5)

except Exception as e:
    print(e)
finally:
    bro.close()