Python数据抓取,抓点星星网的内容

代码:
#coding=utf-8

import os,sys,re
import requests
from webob.exc import strip_tags
from xpinyin import Pinyin

def str2dict(str):
    """Parse a ``;``-separated ``name=value`` string (Cookie header style).

    Args:
        str: The raw string, e.g. ``"a=1; b=2"``.  The parameter name shadows
            the builtin but is kept so existing callers keep working.

    Returns:
        A dict mapping each stripped name to its stripped value.  Only the
        first ``=`` of a segment separates name from value, so values that
        themselves contain ``=`` are preserved in full.  Segments without any
        ``=`` (including blank ones) are skipped.  When a name occurs more
        than once the last value wins.
    """
    pairs = {}
    for segment in str.split(";"):
        # partition() splits on the first "=" only; sep is "" when none exists.
        name, sep, value = segment.partition("=")
        if not sep:
            continue
        pairs[name.strip()] = value.strip()
    return pairs

# Cookie string that is parsed into a dict and sent with every request.
cookie = 'gr_user_id=e=1675905351'

# Site root; every URL requested by this script is built on top of it.
baseurl = "https://www.xxwolo.com/"
cookies = str2dict(cookie)
p = Pinyin()

# Entry page of the crawl.
url0 = baseurl+"posts/topic/tsptf23456789011?v=Sun"
# A timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url=url0, cookies=cookies, timeout=30)
response.raise_for_status()
strcommon = response.text
# Keep only the markup between the opening "dyn_list" <div> and the first
# closing </div> that follows it.  Raises IndexError if the marker is absent.
jiashus = strcommon.split("<div class=\"dyn_list\">")[1].split("</div>")[0]

# Patterns are compiled once because they are reused for every page fetched
# below.  They are raw strings so that regex escapes such as \s are not
# interpreted as (invalid) Python string escapes.
DYN_ITEM_RE = re.compile(
    r'<a href="/info/(.*?)" class="dyn_item">([\s\S]*?)<span>(.*?)</span>')
SECTION_LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>')
LISTING_ENTRY_RE = re.compile(
    r'<a class="tit" href="/(.*?)" title="(.*?)">(.*?)</a>([\s\S]*?)'
    r'<p class="ellip_clamp">([\s\S]*?)</p>')
NOTE_RE = re.compile(r'<p class="note">(.*?)</p>')


def _fetch_html(url):
    """Return the body of a GET request to *url*, sent with the parsed cookies.

    Raises:
        requests.RequestException: on a connection problem, a timeout or an
            HTTP error status.
    """
    # Without a timeout a single stalled connection would hang the crawl.
    reply = requests.get(url=url, cookies=cookies, timeout=30)
    reply.raise_for_status()
    return reply.text


def _append_or_exit(path, text, note):
    """Append *text* to the UTF-8 file *path*.

    If the write fails, the file name, *note* and the error are printed and
    the script exits with status 1 so the failure is visible to the caller.
    """
    try:
        with open(path, 'a', encoding='utf-8') as out:
            out.write(text)
    except (OSError, UnicodeError) as e:
        print("写入文件["+path+"]失败")
        print(note+"\n\n")
        print(e)
        sys.exit(1)


# Each match is (id taken from the /info/ link, markup in between, <span> text).
for dyn_item in DYN_ITEM_RE.findall(jiashus):
    jiashumd5 = dyn_item[0]
    jiashuname = dyn_item[2]
    # Only items whose name contains this marker text are crawled.
    if "谢谢" not in jiashuname:
        continue

    url = baseurl+"posts/topic/"+jiashumd5+"?v=Sun"
    fenleis = _fetch_html(url).split("<div class=\"rp_words\">")[1].split("</div>")[0]
    # The topic page itself is processed first (labelled '太阳'), followed by
    # every link found in its "rp_words" block.
    sections = [(url, '太阳')] + SECTION_LINK_RE.findall(fenleis)

    for url1, title1 in sections:
        print("开始获取:"+title1+"...")
        print("地址:"+url1+"...\n")

        listing = _fetch_html(url1)
        listing = listing.split("<div class=\"rp_list\">")[1].split("<div style=\"text-align:center;margin:2em;\">")[0]

        for entry in LISTING_ENTRY_RE.findall(listing):
            url2 = baseurl + entry[0]
            title2 = entry[2]
            print("开始获取:"+jiashuname+"->"+title1+"->"+title2+"...")
            print("地址:"+url2+"...\n")

            if url2 == baseurl+'t/':
                # The link carries nothing after "t/", so there is no detail
                # page to fetch; use the listing's own summary text instead.
                notes = [strip_tags(entry[4])]
            else:
                detail = _fetch_html(url2)
                detail = detail.split("<div class=\"ce_content\"  >")[1].split("<!--添加解读-->")[0]
                notes = NOTE_RE.findall(detail)

            filename = jiashuname+"/"+title2+".txt"
            filename2 = jiashuname+"_汇总.txt"
            # makedirs also copes with a title that itself contains "/".
            os.makedirs(os.path.dirname(filename), exist_ok=True)

            # Read what an earlier run already stored once, then keep the
            # in-memory copy in sync instead of re-reading the file per note.
            existing = ""
            if os.path.exists(filename):
                with open(filename, 'r', encoding='utf-8') as stored:
                    existing = stored.read()

            for note in notes:
                # Skip text already present in the per-title file so that
                # re-running the script does not duplicate it.
                if note in existing:
                    continue
                _append_or_exit(filename, note+"\n\n", note)
                existing += note+"\n\n"
                _append_or_exit(filename2, title2+"\n"+note+"\n\n", note)
    

 

效果:

 

 

 

posted @ 2023-02-09 14:32  河北大学-徐小波  阅读(34)  评论(0)  编辑  收藏  举报