抓取简书文章标题及链接

抓取简书文章标题及链接

文章链接:https://www.jianshu.com/p/85f4624485b9

01 详细版本

# datetime:2020/10/6 13:53
# 抓取简书文章标题链接
import pandas as pd
from requests_html import HTMLSession
# Open a session used to talk to the server
session = HTMLSession()
# URL of the article to scrape, stored in the variable url
url = 'https://www.jianshu.com/p/85f4624485b9'
# Fetch the page; the timeout keeps the script from hanging forever on a
# stalled connection
r = session.get(url, timeout=10)
# Fail early with a clear HTTP error instead of parsing an error page
r.raise_for_status()
# Inspect the page text (optional)
# print(r.html.text)

# Inspect the links attribute (optional)
# print(r.html.links)

# Inspect the absolute links (optional)
# print(r.html.absolute_links)

# CSS selector path to the link <a> element, stored in the variable sel
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article > p:nth-child(4) > a'

# Store the matched elements in results
results = r.html.find(sel)

# Inspect results (optional)
# print(results)

# Show the text of the first match (optional)
# print(results[0].text)

# Extract the link (optional)
# print(results[0].absolute_links)
# {'https://www.jianshu.com/nb/130182'} -- the result is a set

# Keep only the link string. The guard avoids an IndexError when the selector
# matches nothing or the matched element carries no link.
first_link = None
if results and results[0].absolute_links:
    first_link = list(results[0].absolute_links)[0]
# print(first_link)

# Helper that collects the text and the link of every matched element
def get_text_link_from_sel(sel, response=None):
    """Return ``(text, link)`` pairs for the elements matching *sel*.

    Args:
        sel: CSS selector handed to ``response.html.find``.
        response: Object exposing ``.html.find(selector)``. When omitted,
            the module-level response ``r`` is used, as before.

    Returns:
        A list of ``(text, absolute_link)`` tuples, one for each matched
        element that has at least one absolute link (an empty list when
        nothing qualifies), or ``None`` when the lookup itself fails.
    """
    try:
        page = r if response is None else response
        pairs = []
        for element in page.html.find(sel):
            # absolute_links is a set; take one member, or None when empty
            link = next(iter(element.absolute_links), None)
            if link is None:
                # Skip link-less elements instead of discarding every pair
                # already collected
                continue
            pairs.append((element.text, link))
        return pairs
    except Exception:
        # Best effort, as in the original: any lookup failure yields None.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

# Broader selector: every <a> that is a direct child of a <p> in the article
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'

# Inspect the result (optional)
# print(get_text_link_from_sel(sel))
# Convert the list into a DataFrame. The header is passed to the constructor
# so that an empty or None result still gives a frame with both columns;
# assigning df.columns afterwards raises a length-mismatch ValueError then.
df = pd.DataFrame(get_text_link_from_sel(sel) or [], columns=['text', 'link'])
# Show the result
print(df)
# Save to a CSV file
# NOTE(review): GBK cannot encode every Unicode character, so to_csv raises
# UnicodeEncodeError on such text -- consider 'utf-8-sig' if that happens.
df.to_csv('output.csv', encoding='GBK', index=False)

02 简单版本

# datetime:2020/10/6 13:53
# 抓取简书文章标题链接
import pandas as pd
from requests_html import HTMLSession
# Open a session used to talk to the server
session = HTMLSession()
# URL of the article to scrape, stored in the variable url
url = 'https://www.jianshu.com/p/85f4624485b9'
# Fetch the page; the timeout keeps the script from hanging forever on a
# stalled connection
r = session.get(url, timeout=10)
# Fail early with a clear HTTP error instead of parsing an error page
r.raise_for_status()
# CSS selector path to the link <a> elements, stored in the variable sel
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'
# Store the matched elements in results
results = r.html.find(sel)
# Helper that collects the text and the link of every matched element
def get_text_link_from_sel(sel, response=None):
    """Return ``(text, link)`` pairs for the elements matching *sel*.

    Args:
        sel: CSS selector handed to ``response.html.find``.
        response: Object exposing ``.html.find(selector)``. When omitted,
            the module-level response ``r`` is used, as before.

    Returns:
        A list of ``(text, absolute_link)`` tuples, one for each matched
        element that has at least one absolute link (an empty list when
        nothing qualifies), or ``None`` when the lookup itself fails.
    """
    try:
        page = r if response is None else response
        pairs = []
        for element in page.html.find(sel):
            # absolute_links is a set; take one member, or None when empty
            link = next(iter(element.absolute_links), None)
            if link is None:
                # Skip link-less elements instead of discarding every pair
                # already collected
                continue
            pairs.append((element.text, link))
        return pairs
    except Exception:
        # Best effort, as in the original: any lookup failure yields None.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
# Convert the list into a DataFrame. The header is passed to the constructor
# so that an empty or None result still gives a frame with both columns;
# assigning df.columns afterwards raises a length-mismatch ValueError then.
df = pd.DataFrame(get_text_link_from_sel(sel) or [], columns=['text', 'link'])
# Show the result
print(df)
# Save to a CSV file
# NOTE(review): GBK cannot encode every Unicode character, so to_csv raises
# UnicodeEncodeError on such text -- consider 'utf-8-sig' if that happens.
df.to_csv('output.csv', encoding='GBK', index=False)

原文地址:https://www.cnblogs.com/James-221/p/13773994.html