import re
titleR = '<a rel="noreferrer" href=".*?" title=".*?" target="_blank" class="j_th_tit ">(.*?)</a>'
authorR = '<span class=".*?" title="主题作者:(.*?)" data-field'
reduR = '<span class=".*?" title="回复">(.*?)</span>'

with open('test.html', 'r', encoding='utf-8') as f:
    data = f.read()

# Each findall returns the captured group of every match, in document order.
title = re.findall(titleR, data)
author = re.findall(authorR, data)
redu = re.findall(reduR, data)

# Print one line per thread: reply count, author, title.
for i in range(len(author)):
    print(redu[i] + ' ' + author[i] + ' ' + title[i])
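The snippet above assumes test.html has already been saved locally. For completeness, a minimal sketch of how such a Tieba list page could be fetched with requests; the forum keyword and the meaning of the pn parameter here are illustrative assumptions, not part of the original:

import requests

# Hypothetical example: download the first page of a Tieba list and save it
# so the regex extraction above can run against a local copy.
url = 'https://tieba.baidu.com/f'
params = {'kw': 'python', 'pn': 0}  # kw = forum keyword, pn = item offset (assumed)
headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get(url, params=params, headers=headers)
resp.encoding = 'utf-8'
with open('test.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)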
2. Extracting the novel text
from lxml import etree
# encoding='utf-8' added: the page is Chinese text, so relying on the
# platform default encoding would be fragile.
with open('work2.html', 'r', encoding='utf-8') as f:
    text = f.read()

html = etree.HTML(text)
# text() returns a list of text nodes under the #content element.
result = html.xpath('//*[@id="content"]/text()')

with open('斗罗大陆.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(result))
print(result)
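Note that xpath('.../text()') yields a list of strings, one per text node, which is why the join is needed before writing. A self-contained illustration:

from lxml import etree

snippet = '<div id="content">第一段<br/>第二段<br/>第三段</div>'
doc = etree.HTML(snippet)
parts = doc.xpath('//*[@id="content"]/text()')
print(parts)           # ['第一段', '第二段', '第三段'] -- one string per text node
print(''.join(parts))  # 第一段第二段第三段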
3. Douban movies
from lxml import etree
with open('work3.html', 'r', encoding='utf-8') as f:
    text = f.read()

html = etree.HTML(text)
allInfo = ''
# A Douban Top 250 list page holds 25 entries; the original range(1, 25)
# stopped one short, so the upper bound is extended to cover all of them.
for i in range(1, 26):
    title = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[1]/a/span[1]/text()' % i)
    score = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/div/span[2]/text()' % i)
    comment = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/p[2]/span/text()' % i)
    time = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/p[1]/text()[2]' % i)
    info = ''.join(title) + ' ' + ''.join(score) + ' ' + ''.join(comment) + ' ' + ''.join(time) + '\n'
    allInfo = allInfo + info

with open('豆瓣电影.txt', 'w', encoding='utf-8') as f:
    f.write(allInfo)
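The absolute, position-indexed XPaths above break as soon as the page layout shifts. Iterating over the <li> elements with relative paths is more robust; a minimal sketch, assuming the conventional Top 250 markup (the class names grid_view, title, rating_num, and inq are assumptions about the saved page):

from lxml import etree

with open('work3.html', 'r', encoding='utf-8') as f:
    html = etree.HTML(f.read())

lines = []
for li in html.xpath('//ol[@class="grid_view"]/li'):
    # './/' restricts each query to the current <li>.
    title = ''.join(li.xpath('.//span[@class="title"]/text()')[:1])
    score = ''.join(li.xpath('.//span[@class="rating_num"]/text()'))
    quote = ''.join(li.xpath('.//span[@class="inq"]/text()'))
    lines.append(title + ' ' + score + ' ' + quote)

with open('豆瓣电影.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines))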
4. Crawling Weibo via Ajax
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import requests
base_url ='https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2360812967',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',  # marks the request as Ajax
}
def get_page(page):
    # 'page' is passed through to the query string so the pagination loop in
    # __main__ actually advances; the other params mirror the browser's Ajax request.
    params = {
        'uid': '2360812967',
        't': '0',
        'luicode': '10000011',
        'lfid': '100103type=1&q=李现',
        'type': 'uid',
        'value': '2360812967',
        'containerid': '1076032360812967',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)
def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        # The first card is the profile header, not a post, so skip it.
        for item in items[1:]:
            item = item.get('mblog')
            weibo = {}
            weibo['id'] = item.get('id')
            # The text field contains HTML; pyquery strips the tags.
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
if __name__ == '__main__':
    # result = get_page(1)
    # print(result)
    for page in range(1, 2):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
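To collect more than one page, the range can be widened and a short delay inserted between requests. One simple way to persist the results is to write each post as a JSON line; the file name, page count, and delay below are illustrative choices, not from the original (the standard json module is imported under another name because __main__ already uses json as a variable):

import json as jsonlib
import time

def crawl(pages=5, outfile='weibo.jsonl'):
    with open(outfile, 'w', encoding='utf-8') as f:
        for page in range(1, pages + 1):
            data = get_page(page)
            for weibo in parse_page(data):
                f.write(jsonlib.dumps(weibo, ensure_ascii=False) + '\n')
            time.sleep(1)  # be polite to the server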