# coding=UTF-8
import requests
import json
import bs4
import re
import os

# Check whether the output file already exists (returns 0 if it does, 1 if it does not)
def is_file_path(path):
    if os.path.isfile(path):
        return 0
    else:
        return 1
# Make sure the date-named folder under D:\news exists and return its path
def is_folder_path(ymd):
    path = os.path.join(r'D:\news', '%s' % (ymd))
    # print(path)
    if not os.path.exists(path):
        os.makedirs(path)
    return path

# Parse an article page: extract the title, source, publish time and body paragraphs,
# then write them to a text file named after the publish time
def handle_html(html):
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    title_ = bsobj.find('h1', attrs={'class': 'art_tit_h1'})
    title = re.findall(r'\>(.*?)\<', str(title_))[0]
    news_source_ = bsobj.find('cite', attrs={'class': 'art_cite'})
    news_source = re.findall(r'\>(.*?)\<', str(news_source_))[0]
    release_time_ = bsobj.find('time', attrs={'class': 'art_time'})
    release_time = re.findall(r'\>(.*?)\<', str(release_time_))[0]
    content_list = bsobj.find('article', attrs={'class': 'art_box'}).find_all('p')
    ymd = release_time.split(' ')[0]                    # date part of the publish time
    dhs = release_time.split(' ')[1].replace(':', '-')  # time part, colons replaced for the file name
    path = is_folder_path(ymd)
    # Build the output file path
    file_path = os.path.join(path, '%s.txt' % (dhs))
    limit_number = is_file_path(file_path)
    if limit_number == 1:
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(str(title) + '\n')
            file.write(str(release_time) + ' ' + str(news_source) + '\n')
            # Write each paragraph of the article body
            for item in content_list:
                content = re.findall(r'\>(.*?)\<', str(item))[0]
                file.write(str(content) + '\n')
    else:
        print('%s.txt' % (dhs), "already exists")
    print()
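# As a rough illustration (assuming a publish time of "2020-09-15 16:50:00" and the
# hardcoded D:\news root above), handle_html() would append to
#   D:\news\2020-09-15\16-50-00.txt
# with the title on the first line, "publish time  source" on the second,
# and one body paragraph per following line.
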
# Visit a single news URL and extract its information
def news_url_info(url):
    """
    Purpose: fetch the page of one news article
    Parameter: the URL of the target page
    Returns: nothing; the page HTML is passed to handle_html()
    """
    url = url.replace('\\', '')  # remove the escaped backslashes left over from the JSON payload
    r = requests.get(url)
    r.raise_for_status()
    # print(r.text)
    handle_html(r.text)
    # t = threading.Thread(target=handle_html, args=(r.text,))
    # t.setDaemon(True)  # make the worker a daemon thread; must be set before start()
    # t.start()
    # t.join()  # make the main thread wait for the worker to finish
    # print(url)

# First-stage extraction: strip the JSONP callback wrapper from the raw response
def handle_one_information(data):
    """
    Purpose: pull the JSON body out of the raw response text
    Parameter: the raw text returned by the listing interface
    Returns: the JSON string found between the callback's parentheses
    """
    data = data.split('(')[1].split(')')[0]
    return data
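# A minimal sketch of the unwrapping above, assuming the interface answers with a JSONP
# payload of the form callbackname({...}) (the exact callback name depends on the
# jsoncallback/callback parameters sent in start_url()):
#   'jsonp1({"result": ...})'  ->  '{"result": ...}'
# The returned string is what json.loads() parses in start_url(); note this simple split
# only works while the JSON body itself contains no parentheses.
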
# Second-stage extraction: walk the list of news links
def handle_two_information(information):
    """
    Purpose: extract the list of news URLs
    Parameter: the parsed data from the first-stage extraction
    Returns: nothing; each URL is handed to news_url_info()
    """
    news_list = information['result'].get('data')
    for item in news_list['list']:
        url = item['URL']
        news_url_info(url)
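
# Based on the access pattern above, the decoded payload is assumed to look roughly like
#   {"result": {"data": {"list": [{"URL": "http://sz.sina.cn/news/..."}, ...], ...}}}
# i.e. handle_two_information() walks result -> data -> list and hands each item's
# URL field to news_url_info().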

def start_url(page, count):
    """
    Purpose: request one page of the news listing interface
    Parameters: the page number and the counter appended to the jsonp callback name
    Returns: nothing; the parsed listing is handed to handle_two_information()
    """
    url = 'http://interface.sina.cn/dfz/outside/wap/news/list.d.html?col=56325&level=undefined&show_num=15&page={}&act=more&jsoncallback=callbackFunction&callback=jsonp{}'.format(page,count)
    header = {
        'Accept':'*/*',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Connection':'keep-alive',
        'Cookie':'ustat=__10.79.112.91_1600136573_0.19598300; genTime=1600136573; vt=4; Apache=7531690419683.026.1600136575343; SINAGLOBAL=7531690419683.026.1600136575343; ULV=1600136575344:1:1:1:7531690419683.026.1600136575343:; historyRecord={"href":"http://sz.sina.cn/news/list-p1.d.html","refer":"http://sz.sina.com.cn/"}',
        'Host':'interface.sina.cn',
        'Referer':'http://sz.sina.cn/news/list-p1.d.html',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    res = requests.get(url, headers=header)
    data = res.text.encode('utf-8').decode('unicode_escape')
    information = json.loads(handle_one_information(data))
    handle_two_information(information)
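
# Note on the decoding above: the listing interface presumably returns the JSONP body with
# \uXXXX-escaped Chinese text and backslash-escaped slashes in the URLs, which is why the
# response is run through decode('unicode_escape') here and why news_url_info() strips the
# remaining backslashes before requesting each article.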

if __name__ == '__main__':
    """
    Loop bound: depends on how much data is needed
    """
    for i in range(1, 100000000):
        start_url(i, i)
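
# For a quick test you might bound the crawl to a few pages instead of the large upper
# limit above, e.g.:
#     for i in range(1, 4):
#         start_url(i, i)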

 
