Implementing a web crawler in Python
Code example
#!/usr/bin/python
# coding: UTF-8
import os
import time

import requests
from bs4 import BeautifulSoup
headers = {
    # Some sites (e.g. Pixiv) check the Referer header; add one here if needed.
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
}
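# A hypothetical example of such a Referer (example.com is a placeholder,
# not a real target site):
#   headers['Referer'] = 'https://www.example.com/'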
i = 0
file_path = os.path.abspath('.')
f = open(os.path.join(file_path, 'doubancomment.csv'), 'w+', encoding='utf-8')
# Read all of the URL links; zhhifouUrl.txt stores them beforehand, one per line
d = os.path.dirname(__file__)
with open(os.path.join(d, 'zhhifouUrl.txt')) as url_file:
    zhhifouUrl_list = url_file.read().splitlines()
for urlEle in zhhifouUrl_list:
    print(urlEle)
    print('-----Now scraping chapter ' + str(i + 1) + '-----')
    i += 1
    response = requests.get(url=urlEle, headers=headers)
    print('encoding:', response.encoding)
    print('apparent_encoding:', response.apparent_encoding)
    if response.status_code == 200:
        # Decode the page with the encoding it actually uses, i.e. the one
        # detected from the downloaded content itself
        response.encoding = response.apparent_encoding
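        # Note: requests takes response.encoding from the Content-Type header
        # (typically falling back to ISO-8859-1 for text responses without an
        # explicit charset), while apparent_encoding is guessed from the body
        # bytes by charset detection, which suits Chinese pages like these better.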
        # Instantiate the soup object (html.parser is the built-in parser)
        soup = BeautifulSoup(response.text, features='html.parser')
        target = soup.find(id='nr1')  # id of the element where the article body starts
        li_list = target.find_all('p')
        for content in li_list:
            print(content.get_text())  # the text of each <p> tag
            f.write(' ' + content.get_text() + '\n')
    time.sleep(0.5)
f.close()
print('\n')
print('-----Scraping finished!-----')
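
Although the output file is named doubancomment.csv, the script writes plain text lines rather than CSV rows. If genuine comma-separated output is wanted, a minimal sketch using the standard csv module could look like this; the one-row-per-chapter column layout is an assumption, not something the script above defines:

import csv

# Hypothetical column layout: chapter number, source URL, chapter text
with open('doubancomment.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['chapter', 'url', 'text'])
    # Inside the scraping loop, collect the <p> texts and then call, e.g.:
    # writer.writerow([i + 1, urlEle, chapter_text])
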
zhhifouUrl.txt:
http://www.luoxia.com/minglan/55739.htm
http://www.luoxia.com/minglan/55738.htm
http://www.luoxia.com/minglan/55743.htm
http://www.luoxia.com/minglan/55746.htm
http://www.luoxia.com/minglan/55750.htm
http://www.luoxia.com/minglan/55751.htm
http://www.luoxia.com/minglan/55752.htm
http://www.luoxia.com/minglan/55754.htm
http://www.luoxia.com/minglan/55753.htm
http://www.luoxia.com/minglan/55759.htm
http://www.luoxia.com/minglan/55762.htm
http://www.luoxia.com/minglan/55765.htm
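
The URLs above were collected by hand. The same requests/BeautifulSoup tools could gather them automatically if the site exposes a table-of-contents page. A rough sketch, assuming http://www.luoxia.com/minglan/ is such an index page and that chapter links follow the /minglan/NNNNN.htm pattern seen above (both assumptions should be verified against the real page):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # minimal header; reuse the fuller dict above if needed
index_url = 'http://www.luoxia.com/minglan/'  # assumed table-of-contents page

resp = requests.get(index_url, headers=headers)
resp.encoding = resp.apparent_encoding
soup = BeautifulSoup(resp.text, features='html.parser')

with open('zhhifouUrl.txt', 'w', encoding='utf-8') as f:
    for a in soup.find_all('a', href=True):
        # Keep only links that look like chapter pages, e.g. /minglan/55739.htm
        if '/minglan/' in a['href'] and a['href'].endswith('.htm'):
            f.write(a['href'] + '\n')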