抓取简书文章标题链接
文章链接:https://www.jianshu.com/p/85f4624485b9
01 详细版本
# datetime:2020/10/6 13:53
# 抓取简书文章标题链接
import pandas as pd
from requests_html import HTMLSession
# Open a session that is used to talk to the server.
session = HTMLSession()
# Address of the article page to scrape.
url = 'https://www.jianshu.com/p/85f4624485b9'
# Fetch the page. An explicit timeout is passed because requests otherwise
# waits indefinitely for an unresponsive server.
r = session.get(url, timeout=30)
# Optional inspection steps:
# print(r.html.text)            # text content of the page
# print(r.html.links)           # links as they appear in the page
# print(r.html.absolute_links)  # links in absolute form
# CSS selector path to one specific <a> element.
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article > p:nth-child(4) > a'
# Elements matched by the selector.
results = r.html.find(sel)
# Optional inspection steps:
# print(results)
# print(results[0].text)            # text of the first match
# print(results[0].absolute_links)  # a set, e.g. {'https://www.jianshu.com/nb/130182'}
# Keep only the link string of the first match. The lookup is guarded so that
# a selector matching nothing (or an anchor without a link) yields None
# instead of raising IndexError.
first_link = next(iter(results[0].absolute_links), None) if results else None
# print(first_link)
# 编写函数获取 文本和链接
def get_text_link_from_sel(sel):
    """Collect (text, link) pairs for every element matched by a CSS selector.

    The lookup runs against the module-level response ``r``.

    Args:
        sel: CSS selector string passed to ``r.html.find``.

    Returns:
        A list of ``(text, link)`` tuples, one per matched element that
        carries at least one absolute link. Matched elements without any
        link are skipped. ``None`` is returned if the lookup itself raises.
    """
    pairs = []
    try:
        for element in r.html.find(sel):
            links = element.absolute_links
            if not links:
                # An element without a link used to raise IndexError and
                # throw away every pair collected so far; skip it instead.
                continue
            # absolute_links is a set; take one member as the link.
            pairs.append((element.text, next(iter(links))))
    except Exception:
        # Best-effort contract kept from the original: failure yields None.
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        return None
    return pairs
# Broader selector: every <a> that is a direct child of a <p> in the article.
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'
# Optional: inspect the raw list of pairs.
# print(get_text_link_from_sel(sel))
# Build the data frame. The column names are given to the constructor rather
# than assigned afterwards: assigning two names to the column-less frame
# produced by an empty or None result raises ValueError (length mismatch).
df = pd.DataFrame(get_text_link_from_sel(sel), columns=['text', 'link'])
# Show the result.
print(df)
# Save to a CSV file.
# NOTE(review): GBK cannot represent every Unicode character, so to_csv can
# raise UnicodeEncodeError on some scraped text; confirm the encoding that
# the consumers of this file require before changing it.
df.to_csv('output.csv', encoding='GBK', index=False)
02 简单版本
# datetime:2020/10/6 13:53
# 抓取简书文章标题链接
import pandas as pd
from requests_html import HTMLSession
# Open a session that is used to talk to the server.
session = HTMLSession()
# Address of the article page to scrape.
url = 'https://www.jianshu.com/p/85f4624485b9'
# Fetch the page. An explicit timeout is passed because requests otherwise
# waits indefinitely for an unresponsive server.
r = session.get(url, timeout=30)
# CSS selector path: every <a> that is a direct child of a <p> in the article.
sel = '#__next > div._21bLU4._3kbg6I > div > div._gp-ck > section:nth-child(1) > article >p> a'
# Elements matched by the selector.
results = r.html.find(sel)
# 编写函数获取 文本和链接
def get_text_link_from_sel(sel):
    """Collect (text, link) pairs for every element matched by a CSS selector.

    The lookup runs against the module-level response ``r``.

    Args:
        sel: CSS selector string passed to ``r.html.find``.

    Returns:
        A list of ``(text, link)`` tuples, one per matched element that
        carries at least one absolute link. Matched elements without any
        link are skipped. ``None`` is returned if the lookup itself raises.
    """
    pairs = []
    try:
        for element in r.html.find(sel):
            links = element.absolute_links
            if not links:
                # An element without a link used to raise IndexError and
                # throw away every pair collected so far; skip it instead.
                continue
            # absolute_links is a set; take one member as the link.
            pairs.append((element.text, next(iter(links))))
    except Exception:
        # Best-effort contract kept from the original: failure yields None.
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        return None
    return pairs
# Build the data frame. The column names are given to the constructor rather
# than assigned afterwards: assigning two names to the column-less frame
# produced by an empty or None result raises ValueError (length mismatch).
df = pd.DataFrame(get_text_link_from_sel(sel), columns=['text', 'link'])
# Show the result.
print(df)
# Save to a CSV file.
# NOTE(review): GBK cannot represent every Unicode character, so to_csv can
# raise UnicodeEncodeError on some scraped text; confirm the encoding that
# the consumers of this file require before changing it.
df.to_csv('output.csv', encoding='GBK', index=False)