社交媒体关键字查询

"""
author：张鑫
date:2022/02/15 13:14
发布账号昵称、发布账号ID、发布时间、内容、发帖链接、账号主页链接、点赞数、评论数、分享数
运行环境 Chrome浏览器 和 selenium驱动版本一致即可
可自行接入公司数据库
keyword_list = ['tsaiingwen'] 放入待采集的政府、人物、政党账号；tsaiingwen 为蔡英文账号
"""
import locale
# -*- coding=utf-8 -*-
# 导入模块
import os
import random
import re
import sys
import time
from time import sleep
from selenium.webdriver import ActionChains
import pymongo
import xlsxwriter
from lxml import etree


def comments(comments_data):
    """Pull the comment count and the share count out of engagement text.

    ``comments_data`` is an iterable of text fragments; they are concatenated
    before parsing. Traditional-Chinese wording is normalised to the
    simplified wording the patterns search for.

    Returns a ``(comment_count, share_count)`` tuple:

    * empty input gives the integers ``(0, 0)``;
    * otherwise each element is the matched text as a string (``'0'`` when
      nothing matched), or an ``int`` when the value carried the ``万``
      (ten-thousand) unit and was expanded.
    """
    joined = ''.join(comments_data)
    if not joined:
        return 0, 0

    # Apply the wording substitutions one after another, in a fixed order.
    normalised = joined
    for traditional, simplified in (
        ('回應', '评论'),
        ('個', '条'),
        ('轉發', '分享'),
        ('萬', '万'),
    ):
        normalised = normalised.replace(traditional, simplified)

    def _expand_wan(value):
        # "1.5万" -> 15000; values without the unit are returned untouched.
        if '万' not in value:
            return value
        return int(float(value.replace('万', '')) * 10000)

    comment_count = ''.join(re.findall('(.*?)条评论', normalised)) or '0'
    # Shares normally follow the comment count; fall back to a bare
    # "N次分享" when the text has no comment part.
    share_count = (
        ''.join(re.findall('条评论(.*?)次分享', normalised))
        or ''.join(re.findall('(.*?)次分享', normalised))
        or '0'
    )

    return _expand_wan(comment_count), _expand_wan(share_count)

# Chinese locale support.
# NOTE(review): 'chinese' looks like a Windows-style locale name; presumably
# this raises locale.Error on platforms that do not recognise it — confirm.
locale.setlocale(locale.LC_CTYPE, 'chinese')

# Connect to the database: a MongoDB client for a fixed LAN address, then
# handles for the 'facebook' database and its 'kzxy' collection.
database = pymongo.MongoClient('192.168.1.103', port=27017)
db = database['facebook']
kzxy_list = db['kzxy']

# Put this script's directory on the import path, presumably so the
# project-local `tools` / `settings` imports that follow can be resolved.
sys.path.append(os.path.dirname(__file__))

from tools.logger_server import logger
from tools.selenium_server import SeleniumServer
from tools.extract import Extract
from settings import REDIS, EPR_TIME


import datetime
import re
import time


def today_start():
    """Return the Unix timestamp (whole seconds) of today's local midnight."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))


def time_turn(timenum):
    """Format a Unix-timestamp string as local ``YYYY-MM-DD HH:MM:SS``.

    ``timenum`` is a string. The sentinel ``'0'`` is returned unchanged.
    A string of 1 to 10 digits is interpreted as seconds since the epoch and
    formatted in local time. Anything else prints a hint and yields ``None``.
    """
    if timenum == '0':
        return '0'

    if not (0 < len(timenum) < 11 and timenum.isdigit()):
        print('请输入11位以内的数字')
        return None

    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(timenum)))


def time_turns(time1, year=2022):
    """Convert a Chinese post-time label into a Unix timestamp.

    Forms are checked in this order:

    1. ``昨天 HH:MM`` (optionally followed by ``發佈``) - yesterday at that
       clock time, based on ``today_start()``.
    2. ``昨天`` - 24 hours ago; ``刚刚`` - now; ``N天前`` - N days ago.
    3. ``'0'`` - returned unchanged (as the string ``'0'``).
    4. ``N小时`` / ``N小時``, ``N分钟`` / ``N分鐘``, ``N天`` - that long ago.
       A bare integer string is treated as a number of hours.
    5. ``M月D日`` with an optional ``HH:MM`` - a date in ``year``. One- and
       two-digit months are both accepted.
    6. ``<year>年M月D日`` with an optional ``HH:MM`` - a full date in ``year``.
    7. Anything else is treated as belonging to another year: a message is
       printed and ``0`` is returned.

    Args:
        time1: The non-empty label text.
        year: Year assumed for labels without a year, and the only year
            accepted for full dates. Defaults to 2022.

    Returns:
        An ``int`` Unix timestamp, ``0`` for labels outside ``year``, or the
        string ``'0'`` when ``time1`` is ``'0'``.

    Raises:
        IndexError: ``time1`` is empty, or a ``昨天…`` label has no ``:``.
        ValueError: a number or date inside the label cannot be parsed.
    """
    day = 24 * 3600

    if time1[0] == '昨' and len(time1) > 2:
        # "昨天 14:30發佈" -> [" 14", "30"]; int() tolerates the padding.
        clock = time1.split('發佈')[0].split('天')[-1].split(':')
        return (today_start() - day) + int(clock[0]) * 3600 + int(clock[1]) * 60
    if time1 == '昨天':
        return int(time.time()) - day
    if time1 == '刚刚':
        return int(time.time())
    if '天前' in time1:
        return int(time.time()) - int(time1.split('天')[0]) * day
    if time1 == '0':
        return time1

    # Relative ages: try each unit spelling; the text before the unit must be
    # a plain integer, otherwise move on to the next spelling / unit.
    relative_units = (
        (('小时', '小時'), 3600),
        (('分钟', '分鐘'), 60),
        (('天',), day),
    )
    for suffixes, seconds in relative_units:
        for suffix in suffixes:
            try:
                count = int(time1.split(suffix)[0])
                return int(time.time() - count * seconds)
            except (ValueError, OverflowError):
                continue

    # Absolute dates. Build "Y-M-D H:M:S" text and let strptime validate it.
    if re.match(r'\d{1,2}月', time1):
        stamp = time1.replace('月', '-').replace('日', ' ')
        # A clock time is only honoured when the label names a single month.
        has_clock = time1.count('月') == 1 and ':' in time1
        stamp = f'{year}-{stamp}' + (':00' if has_clock else '00:00:00')
    elif f'{year}年' in time1:
        stamp = time1.replace('年', '-').replace('月', '-').replace('日', ' ')
        stamp += ':00' if ':' in stamp else '00:00:00'
    else:
        print('不是今年的数据，不采集')
        return 0

    stamp = stamp.replace(' :', ':')
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    return int(time.mktime(parsed.timetuple()))

class FB:
    """Selenium-driven scraper for Facebook post-search results.

    The collaborators (``SeleniumServer``, ``Extract``, ``logger``, ``REDIS``
    and ``EPR_TIME``) come from project modules imported at file level.
    """

    def __init__(self):
        """Create the browser session and attach the shared project helpers."""
        self.selenium = SeleniumServer()
        self.driver = self.selenium.driver
        self.redis = REDIS
        self.logger = logger
        self.extract = Extract()
        self.epr_time = EPR_TIME

    def start(self, keyword, zong_sums, wsheet1):
        """Search posts for ``keyword`` and write matching ones to a worksheet.

        The search page is opened, then scrolled 100 times. After every
        scroll the page source is re-parsed and the post containers are
        walked. A post is written as ``[keyword, content, detail link, time]``
        only when it is at most 15 days old and its content contains both
        '新疆' and '洪秀柱'; the first post that fails this check ends the
        current pass and the next scroll starts.

        :param keyword: search term inserted into the search URL
        :param zong_sums: worksheet row number where the next record goes
        :param wsheet1: worksheet object; only ``write_row`` is used
        :return: the row number following the last row written, so the
            caller can keep appending to the same sheet
        """
        sums = zong_sums
        # Log the search being started.
        self.logger.debug('start: {}'.format(keyword))
        # Search-results URL with the "recent posts" filter applied.
        home_url = 'https://www.facebook.com/search/posts?q={}&sde=AbpC92W4Tk5CSRWpvFucDAgwoQ2gfVVMoruVxGcnOdkNfijX1G4VeLHZZ-EnQ0efWg6zxEoc9fii4P4dV05zM55j8iXbnj0pKLaEKctnRPAS4US5p7M6k4Fp-eLfhveWhJuF0cv4QOeSInYjSHv4dVDec8Y9-fp6-3LG3rHVWJ2CEBIpBbHttlepqOq0ONMyH0A9S8P4Z0pbFDRloMORLtxp-_Ol2JykorAkLsdNd4NJcPvaIII4N8TmBAhHZ414FArZ48w6A3mWL5lGM_nNlKzVHaSYBU2DQPVBcRHq1Ni2JI7I-Fq6RtzHS4gRuCLM6z-P5weqvUi4RtQP-pGHiwt1huE0xDOGeDTTjWesxsNYmJsVRYQJ77vE3Xq2-3N0D2w&filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0%3D'.format(keyword)
        try:
            # Open Facebook and give the page time to load.
            self.driver.get(home_url)
            sleep(random.randint(6, 8))
        except Exception as exc:
            # Best effort: scraping continues with whatever page is loaded,
            # but the failure is recorded instead of vanishing. Ctrl-C is no
            # longer swallowed here.
            self.logger.debug('failed to open {}: {}'.format(home_url, exc))

        # Number of records written so far; also the start offset into the
        # list of post containers on each pass.
        num = 0
        # Cursor for the positional XPath (`//div[zz_num]/...`) that selects
        # the post currently being processed.
        zz_num = 1

        # Publishing account nickname.
        zhnc = ''
        # The feed loads asynchronously, so the page is scrolled repeatedly
        # (100 passes, with a random 3-5 s pause after each scroll).
        # Collection requirements noted by the author:
        # 1. Unverified accounts count as "person"; verified ones get their
        #    category (person, government, media).
        # 2. Data from within one day.
        # 3. Content contains the keyword.
        # 4. Collect nickname, category, time and content.
        # NOTE(review): the XPath expressions below start with `//`, which
        # XPath evaluates from the document root, not relative to `all_li`.

        for i in range(1, 101):
            print(f'*****************************第{i}页***************************')

            self.driver.execute_script('window.scrollBy(0,2200)')
            sleep(random.randint(3, 5))

            # Parse the current page source.
            html = etree.HTML(self.driver.page_source)

            all_list = html.xpath(
                '//div[@class="du4w35lb l9j0dhe7"]')
            print(len(all_list))
            # Ignore the last three containers when there are more than three.
            if len(all_list) > 3:
                da_list1 = all_list[:-3]
            else:
                da_list1 = all_list
            # Resume after the records already written.
            for all_li in da_list1[num:len(da_list1) + 1]:
                # Row buffer for this post.
                list_shuju = []
                list_shuju.append(keyword)

                zhnc = all_li.xpath(
                    f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[1]/span/h3/span/strong[1]/span/a/span/span//text()'
                    )

                if zhnc == []:
                    # Alternative header layout.
                    zhnc = all_li.xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[1]/span/h3/span/a/strong/span//text()'
                        )
                    if zhnc == []:
                        # No nickname at this position: advance the cursor
                        # and wait for the next scroll.
                        zz_num += 1
                        break
                zhnc = zhnc[0]
                print(f'发布帐号昵称:{zhnc}')

                biaoqian = all_li.xpath(
                    './/div[@class="j83agx80 cbu4d94t ew0dbk1b irj2b8pg"]/div/span/h3/span[2]/span/div/@aria-label'
                    )
                print('标签', biaoqian)

                # Post body text.
                content_data = all_li.xpath(
                    f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[3]/div[1]/div/div/div/span//text()'
                    )
                if content_data == []:
                    content_data = ['暂无文字内容']

                # Drop filler fragments, strip '#', one fragment per line.
                texts = []
                for text in content_data:
                    if text == '.' or text == '=' or text == 'ownloa':
                        continue
                    else:
                        text = text.replace('#', '')
                        text += '\n'
                        texts.append(text)

                texts = ''.join(texts)

                print('内容数据', texts)
                print('zz_num', zz_num)

                if '查看更多' in texts:
                    # Truncated post: expand it before reading the content.
                    print('查看更多')

                    # Hover over the timestamp link first.
                    list_towurl = self.driver.find_element_by_xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a')

                    ActionChains(self.driver).move_to_element(list_towurl).perform()
                    time.sleep(5)
                    print('二级页面')
                    # Hover over the "see more" button.
                    a = self.driver.find_element_by_xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[3]/div[1]/div/div/div/span//div[@role="button"]'
                    )
                    ActionChains(self.driver).move_to_element(a).perform()
                    print('查看更多位置')
                    time.sleep(5)

                    print("ff1")
                    # Click "see more".
                    a.click()
                    print("ff2")
                    time.sleep(3)
                    # Switch to the most recently opened window handle.
                    windows = self.driver.window_handles
                    self.driver.switch_to.window(windows[-1])
                    print("ff3")
                    html = etree.HTML(self.driver.page_source)
                    print("f1")
                    all_list1 = html.xpath(
                        '//div[@class="du4w35lb l9j0dhe7"]')

                    all_li = all_list1[num]
                    print("f2")
                    texts = all_li.xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[3]/div[1]/div/div/div/span//text()'
                    )
                    neirong = []

                    for text in texts:
                        if text == '.':
                            pass
                        else:
                            text = text.replace('#', '')
                            text += '\n'
                            neirong.append(text)
                    neirong = ''.join(neirong)
                    print('内容数据', zz_num, '：', neirong)
                    list_shuju.append(neirong)

                    # Detail-page link: try the second span, then the third.
                    two_url = html.xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a//@href')
                    if two_url == []:
                        two_url = html.xpath(
                            f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[3]/span/a//@href')
                    two_url = two_url[0]

                    print('二级页面标签：', two_url)

                    list_shuju.append(two_url)
                else:
                    print('no')
                    # Hover over the timestamp, then re-parse the page to
                    # read the detail-page link.
                    list_towurl = self.driver.find_element_by_xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a')

                    ActionChains(self.driver).move_to_element(list_towurl).perform()
                    time.sleep(5)

                    html = etree.HTML(self.driver.page_source)
                    two_url = html.xpath(
                        f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a//@href')
                    if two_url == []:
                        two_url = html.xpath(
                            f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[3]/span/a//@href')
                    print('详情页链接')
                    two_url = two_url[0]

                    neirong = []
                    texts = texts.split('\n')
                    for text in texts:
                        if text == '.' or text == '=' or text == 'ownloa':
                            pass
                        else:

                            text = text.replace('#', '')
                            text += '\n'
                            neirong.append(text)
                    neirong = ''.join(neirong)
                    print('内容数据', zz_num, '：', neirong)
                    list_shuju.append(neirong)

                    print('二级页面标签：', two_url)
                    list_shuju.append(two_url)

                # Publish time: try the second span, then the third.
                Time_list1 = f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span//text()'
                Time_list2 = f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[3]/span//text()'
                Time_list = all_li.xpath(Time_list1)

                if Time_list == [] or Time_list == ['\xa0', ' · ']:
                    Time_list = all_li.xpath(Time_list2)
                print(Time_list)
                time1 = []
                for time2 in Time_list:
                    if time2 == '=':
                        pass
                    else:
                        time1.append(time2)
                time1 = time1[0].replace('2022年', '')
                time1 = int(time_turns(time1))
                print("********发布时间*****", time_turn(str(time1)))
                time2 = int(time.time())
                print(time2 - time1 <= 15 * 24 * 3600)
                print('新疆' in neirong and '洪秀柱' in neirong)
                # Keep only posts from the last 15 days that mention both
                # terms (hard-coded here, independent of `keyword`).
                if time2 - time1 <= 15 * 24 * 3600 and '新疆' in neirong and '洪秀柱' in neirong:
                    time1 = time_turn(str(time1))
                    list_shuju.append(time1)

                else:

                    print('不符合抓取要求')
                    zz_num += 1
                    break
                print('+++list_shuju+++', list_shuju)
                row = 'A' + str(sums)
                wsheet1.write_row(row, list_shuju)
                print('--------------第', num + 1, '条数据！--------------------')
                print('--------------写入到第', sums, '行数据！--------------------')
                num += 1
                sums += 1
                zz_num += 1

        self.driver.close()
        # Hand the advanced row cursor back: the caller assigns this result
        # to its running row counter.
        return sums

if __name__ == '__main__':
    # Script entry point: run every keyword through a fresh FB session and
    # collect the results in one Excel workbook.
    import traceback

    keyword_list = ['洪秀柱新疆']

    # Create the workbook before the try block so the cleanup below never
    # refers to an unbound name.
    wbook = xlsxwriter.Workbook('洪秀柱新疆.xlsx')
    try:
        # Create the worksheet and write the header row starting at A1.
        wsheet1 = wbook.add_worksheet('Sheet1')
        title = ['发布账号', '内容', '详情页连接', '时间']
        wsheet1.write_row('A1', title)

        # Next worksheet row to write; row 1 holds the header.
        zong_sums = 2
        for index, keyword in enumerate(keyword_list):
            print(
                f"---------------------------------------------索引：{index}------------------------------------------------")

            fb_server = FB()
            next_row = fb_server.start(keyword, zong_sums, wsheet1)
            # Keep the previous cursor if start() gives back nothing, so the
            # next keyword still receives a valid row number.
            if next_row is not None:
                zong_sums = next_row
            sleep(random.randint(15, 30))
            print('写入完成！', zong_sums)
    except KeyboardInterrupt:
        print("人工终止！")
    except Exception:
        # Show the real failure instead of reporting it as a manual stop;
        # the rows collected so far are still saved below.
        traceback.print_exc()
    finally:
        wbook.close()
posted @ 2022-03-02 17:50 布都御魂阅读(127) 评论(0) 编辑收藏举报
刷新页面返回顶部
布都御魂

社交媒体关键字查询

公告