import sys
import json
import urllib
import urllib2
reload(sys)
sys.setdefaultencoding("utf-8")
class shiji():
def __init__(self):
self.page = 1
self.filename = open("sj.json","w")
def parse_page(self):
url = "http://search.jiayuan.com/v2/search_v2.php?"
headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
formdata = {
"sex":"f",
"key":"",
"stc":"1:11,2:20.25,3:160.170,23:1",
"sn":"default",
"sv":"1",
"p":self.page,
"f":"select",
"listStyle":"bigPhoto",
"pri_uid":"170633820",
"jsversion":"v5"
}
data = urllib.urlencode(formdata)
request = urllib2.Request( url ,data = data , headers = headers)
response = urllib2.urlopen(request)
js = response.read()
js = js.replace("##jiayser##","").replace("//","")
content = json.loads(js)
self.parse_person(content['userInfo'])
def parse_person(self,userinfo):
for i in range(len(userinfo)):
form = {"昵称":userinfo[i]['nickname'],"年龄":userinfo[i]['age'],"内心独白":userinfo[i]['shortnote']}
form = json.dumps(form).decode("unicode_escape")
self.filename.write(form+"\n")
if self.page <10:
self.page+=1
self.parse_page()
if __name__ == "__main__":
sj = shiji()
sj.parse_page()
写这个爬虫浪费了一天的时间。。。我是真的菜。
错误总结:
关于unicode编码转汉字的方法以及UnicodeEncodeError: 'ascii' codec can't encode characters in position问题:
https://www.cnblogs.com/technologylife/p/6071787.html
http://blog.sina.com.cn/s/blog_64a3795a01018vyp.html
把dict写入文件的时候碰见的报的typeError的解决办法: http://blog.csdn.net/guoweish/article/details/47106263
另外加一篇ubuntu vim撤销操作的博客 http://blog.sina.com.cn/s/blog_7e9efc570101ays3.html
收获:这次的收获还可以,解决了很多没见过的bug,第一次爬取js渲染的网页的数据,值得记得的是:(1).js渲染过得网页怎么找数据来源(f12 network XHR 找是post请求还是get请求),(2)字符串的强大替换函数replace,(3)字典写入文件怎么处理 (先转换成json的字符串再写入),(4)unicode转汉字怎么处理(import sys + decode("unicode_escape"))
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步