Python数据抓取,抓点星星网的内容
代码:
# coding=utf-8
"""Scrape notes from www.xxwolo.com and store them in local text files.

Run as a script, this module downloads a fixed start page, follows every
entry on it whose name contains "谢谢", walks that entry's category links and,
for each article listed under a category, appends the article's notes to
``<entry name>/<article title>.txt`` and to ``<entry name>_汇总.txt``.

The crawl only starts when the file is executed directly; importing the
module merely defines the helpers and parses the cookie string.
"""
import os
import re
import sys

# Raw cookie header sent with every request, and its parsed form (see below).
cookie = 'gr_user_id=e=1675905351'
baseurl = "https://www.xxwolo.com/"

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30

# Groups: entry id, markup between the link opening and <span>, entry name.
_ENTRY_LINK_RE = re.compile(
    r'<a href="/info/(.*?)" class="dyn_item">([\s\S]*?)<span>(.*?)</span>'
)
# Groups: category URL, category title.
_CATEGORY_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>')
# Groups: href without its leading slash, title attribute, link text,
# markup up to the excerpt, excerpt HTML.
_ARTICLE_RE = re.compile(
    r'<a class="tit" href="/(.*?)" title="(.*?)">(.*?)</a>'
    r'([\s\S]*?)<p class="ellip_clamp">([\s\S]*?)</p>'
)
_NOTE_RE = re.compile(r'<p class="note">(.*?)</p>')
# The published listing breaks this marker across two lines between the
# closing quote and ">", so any run of whitespace is accepted at that spot.
_CONTENT_OPEN_RE = re.compile(r'<div class="ce_content"\s*>')


def str2dict(str):
    """Parse a ``name=value; name=value`` cookie string into a dict.

    Args:
        str: Cookie header text. The name shadows the builtin, but it is kept
            so that existing keyword callers continue to work.

    Returns:
        A dict mapping each stripped name to its stripped value. Only the
        first ``=`` of a pair separates the name from the value, so a value
        that itself contains ``=`` is kept whole. Empty fragments and
        fragments without ``=`` are ignored.
    """
    parsed = {}
    for fragment in str.split(";"):
        name, separator, value = fragment.partition("=")
        if not separator:
            continue
        parsed[name.strip()] = value.strip()
    return parsed


cookies = str2dict(cookie)


def _between(text, start, end):
    """Return the slice of *text* after *start*, cut at the first *end*.

    *start* is either a literal string or a compiled pattern without groups.
    The slice also stops at a second occurrence of *start*, matching
    ``text.split(start)[1].split(end)[0]``. If *end* does not occur, the
    slice runs up to that second *start* or the end of *text*.

    Raises:
        IndexError: *start* does not occur in *text*.
    """
    if isinstance(start, str):
        pieces = text.split(start, 2)
    else:
        pieces = start.split(text, maxsplit=2)
    if len(pieces) < 2:
        marker = getattr(start, "pattern", start)
        raise IndexError("start marker %r not found in page" % (marker,))
    return pieces[1].split(end, 1)[0]


def _read_existing(path):
    """Return the UTF-8 text stored at *path*, or "" if there is no file."""
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        return ""


def _append_or_exit(path, text, note):
    """Append *text* to *path*; on failure report *note* and stop the script.

    The process exits with status 1 so that callers can detect the failure.
    """
    try:
        with open(path, 'a', encoding='utf-8') as handle:
            handle.write(text)
    except (OSError, UnicodeError) as e:
        print("写入文件["+path+"]失败")
        print(note+"\n\n")
        print(e)
        sys.exit(1)


def _save_notes(entry_name, title, notes):
    """Append every not-yet-stored note to the article and summary files.

    A note counts as stored when its text already occurs anywhere in the
    article file. New notes are written to the article file and, prefixed
    with the article title, to the per-entry summary file.
    """
    # NOTE(review): entry names and titles are used verbatim as path parts;
    # confirm whether characters that are invalid in file names can occur.
    filename = entry_name+"/"+title+".txt"
    filename2 = entry_name+"_汇总.txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Read the file once and track appended text in memory instead of
    # re-reading it from disk for every note.
    existing = _read_existing(filename)
    for note in notes:
        if note in existing:
            continue
        _append_or_exit(filename, note+"\n\n", note)
        existing += note+"\n\n"
        _append_or_exit(filename2, title+"\n"+note+"\n\n", note)


def _collect_notes(fetch, strip_tags, article_url, excerpt_html):
    """Return the list of note texts for one article.

    When the listing link is just ``/t/`` no article page is fetched and the
    tag-stripped listing excerpt is used as the only note. Otherwise the
    notes are read from the article page.
    """
    if article_url == baseurl+'t/':
        return [strip_tags(excerpt_html)]
    content = _between(fetch(article_url), _CONTENT_OPEN_RE, "<!--添加解读-->")
    return _NOTE_RE.findall(content)


def _scrape_category(fetch, strip_tags, entry_name, category_url, category_title):
    """Fetch one category listing and save the notes of every article in it."""
    print("开始获取:"+category_title+"...")
    print("地址:"+category_url+"...\n")
    listing = _between(
        fetch(category_url),
        '<div class="rp_list">',
        '<div style="text-align:center;margin:2em;">',
    )
    for article in _ARTICLE_RE.findall(listing):
        article_url = baseurl + article[0]
        title = article[2]
        print("开始获取:"+entry_name+"->"+category_title+"->"+title+"...")
        print("地址:"+article_url+"...\n")
        notes = _collect_notes(fetch, strip_tags, article_url, article[4])
        _save_notes(entry_name, title, notes)


def _scrape_entry(fetch, strip_tags, entry_id, entry_name):
    """Walk the entry's own page and each category linked from it."""
    entry_url = baseurl+"posts/topic/"+entry_id+"?v=Sun"
    links_html = _between(fetch(entry_url), '<div class="rp_words">', "</div>")
    # The entry page itself is processed first, under the title '太阳'.
    categories = [(entry_url, '太阳')] + _CATEGORY_LINK_RE.findall(links_html)
    for category_url, category_title in categories:
        _scrape_category(fetch, strip_tags, entry_name, category_url, category_title)


def main():
    """Run the crawl: start page -> matching entries -> categories -> articles."""
    # The third-party packages are imported here rather than at module level
    # so that the pure helpers above can be imported without them installed.
    import requests
    from webob.exc import strip_tags
    # The original script built pinyin transliterations of the names with
    # xpinyin but never used them. Those dead computations are dropped, and
    # the import is kept as it was part of the original dependency list.
    from xpinyin import Pinyin  # noqa: F401

    def fetch(url):
        """Return the body of *url* as text, sending the configured cookies."""
        response = requests.get(url=url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        return response.text

    start_page = fetch(baseurl+"posts/topic/tsptf23456789011?v=Sun")
    entries_html = _between(start_page, '<div class="dyn_list">', "</div>")
    for entry_id, _markup, entry_name in _ENTRY_LINK_RE.findall(entries_html):
        # Only entries whose name contains "谢谢" are crawled.
        if "谢谢" not in entry_name:
            continue
        _scrape_entry(fetch, strip_tags, entry_id, entry_name)


if __name__ == "__main__":
    main()
效果:
本文来自博客园,作者:河北大学-徐小波,转载请注明原文链接:https://www.cnblogs.com/xuxiaobo/p/17105194.html