# 孔子学院 (Confucius Institute)

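"""
author: 张鑫
date: 2022/02/15 13:14
Collected fields: posting account nickname, posting account ID, post time, content, post link, account homepage link, like count, comment count, share count.
Runtime environment: the Chrome browser and the selenium driver only need to be the same version.
Can be hooked up to the company database as needed.
keyword_list = ['tsaiingwen'] holds the accounts (government, people, political parties) to collect; tsaiingwen is Tsai Ing-wen's account.
"""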
import locale
# -*- coding=utf-8 -*-
# 导入模块
import os
import random
import re
import sys
import time
from time import sleep
from selenium.webdriver import ActionChains
import pymongo
import xlsxwriter
from lxml import etree


def _wan_to_int(count_text):
    """Expand a count written in units of 万 (10,000) into a whole number."""
    # round() instead of int(): float('1.13') * 10000 evaluates to
    # 11299.999999999998, which int() would truncate to 11299.
    return round(float(count_text.replace('万', '')) * 10000)


def comments(comments_data):
    """Extract the comment count and share count from a post's summary text.

    :param comments_data: a string, or an iterable of string fragments that
        are concatenated, e.g. ``['5 個回應', '3次分享']``. Traditional
        Chinese wording is normalised to the simplified form before matching.
    :return: a ``(comment_count, share_count)`` tuple. The element types are
        kept as the callers already expect them:

        * ``(0, 0)`` (ints) when the input is empty;
        * otherwise each element is the matched text as a ``str`` (it may
          still contain spaces or thousands separators), ``'0'`` when that
          count is absent, or an ``int`` when the text used the 万 unit.
    :raises ValueError: if a 万 value is not a valid number.
    """
    summary = ''.join(comments_data)
    if summary == '':
        return 0, 0

    # Normalise traditional wording so one set of patterns covers both scripts.
    summary = (summary.replace('回應', '评论').replace('個', '条')
               .replace('轉發', '分享').replace('萬', '万'))

    comment_count = ''.join(re.findall('(.*?)条评论', summary)) or '0'
    # Shares normally follow the comment count; when there is no comment
    # count, take whatever precedes the share marker instead.
    share_count = (''.join(re.findall('条评论(.*?)次分享', summary))
                   or ''.join(re.findall('(.*?)次分享', summary))
                   or '0')

    if '万' in comment_count:
        comment_count = _wan_to_int(comment_count)
    if '万' in share_count:
        share_count = _wan_to_int(share_count)

    return comment_count, share_count

# Chinese language support: switch the character-type locale to 'chinese'.
# NOTE(review): 'chinese' looks like a Windows locale name; setlocale may
# raise locale.Error on other platforms — confirm the target OS.
locale.setlocale(locale.LC_CTYPE, 'chinese')

# Connect to the database (MongoDB client pointed at a hard-coded address).
database = pymongo.MongoClient('192.168.1.103', port=27017)
db = database['facebook']
kzxy_list = db['kzxy']  # the 'kzxy' collection of the 'facebook' database

# Add this script's directory to the import path so the project-local
# imports that follow can be resolved.
sys.path.append(os.path.dirname(__file__))

from tools.logger_server import logger
from tools.selenium_server import SeleniumServer
from tools.extract import Extract
from settings import REDIS, EPR_TIME


import datetime
import re
import time


def today_start():
    """Return the Unix timestamp (whole seconds) of today's local midnight."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))


def time_turn(timenum):
    """Format a Unix-timestamp string as local ``YYYY-MM-DD HH:MM:SS``.

    :param timenum: the timestamp as a string of digits.
    :return: ``'0'`` when the input is exactly ``'0'``; the formatted local
        time for a string of 1 to 10 digits; ``None`` (after printing a
        notice) for anything else.
    """
    if timenum == '0':
        return '0'

    # Guard clause: reject anything that is not a 1-10 digit string.
    if not (len(timenum) in range(1, 11) and timenum.isdigit()):
        print('请输入11位以内的数字')
        return None

    local_struct = time.localtime(int(timenum))
    return time.strftime("%Y-%m-%d %H:%M:%S", local_struct)


def time_turns(time1, year=2022):
    """Convert a Chinese Facebook time label into a Unix timestamp.

    Recognised labels, checked in this order:

    * ``'昨天HH:MM發佈'`` style ("yesterday at HH:MM");
    * ``'昨天'`` (yesterday) and ``'刚刚'`` (just now);
    * ``'N天前'`` (N days ago);
    * ``'0'``, which is passed through unchanged as the string ``'0'``;
    * ``'N小时'`` / ``'N小時'`` / ``'N分钟'`` / ``'N分鐘'`` / ``'N天'``;
      a bare number is read as hours, because the hour rule is tried first;
    * ``'M月D日'`` with an optional ``HH:MM``, taken to be in ``year``;
    * the same date prefixed with ``'<year>年'``.

    Compared with the earlier version, two-digit months (``'10月5日'``) are
    now parsed instead of being rejected as "not this year", and a
    year-qualified label may carry a time of day.

    :param time1: the label text (non-empty string).
    :param year: the year assumed for dates that carry no year, and the only
        year accepted for year-qualified dates. Defaults to 2022.
    :return: an ``int`` timestamp in seconds; the string ``'0'`` for the
        input ``'0'``; or ``0`` when the label is not a date in ``year``.
    :raises IndexError: if ``time1`` is empty, or a "yesterday" label has no
        ``HH:MM`` part.
    :raises ValueError: if a label looks like a date but cannot be parsed, or
        the count in an ``'N天前'`` label is not a number.
    """
    # "Yesterday at HH:MM": offset the clock time from yesterday's midnight.
    if time1[0] == '昨' and len(time1) > 2:
        clock = time1.split('發佈')[0].split('天')[-1]
        clock_parts = clock.split(':')
        return ((today_start() - 24 * 3600)
                + int(clock_parts[0]) * 3600 + int(clock_parts[1]) * 60)
    if time1 == '昨天':
        return int(time.time()) - 24 * 3600
    if time1 == '刚刚':
        return int(time.time())
    if '天前' in time1:
        return int(time.time()) - int(time1.split('天')[0]) * 3600 * 24
    if time1 == '0':
        return time1

    # Relative labels: the number in front of the unit is how long ago.
    relative_units = (
        ('小时', 3600), ('小時', 3600),
        ('分钟', 60), ('分鐘', 60),
        ('天', 24 * 3600),
    )
    for unit, unit_seconds in relative_units:
        try:
            amount = int(time1.split(unit)[0])
        except ValueError:
            continue
        return int(time.time() - amount * unit_seconds)

    # Absolute dates, with or without the year prefix.
    if '{}年'.format(year) in time1:
        stamp = time1.replace('年', '-')
    elif re.match(r'\d{1,2}月', time1):
        stamp = '{}-{}'.format(year, time1)
    else:
        print('不是今年的数据，不采集')
        return 0

    stamp = stamp.replace('月', '-').replace('日', ' ')
    # Pad to a full HH:MM:SS so a single strptime format covers both shapes.
    stamp += ':00' if ':' in stamp else '00:00:00'
    stamp = stamp.replace(' :', ':')
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    return int(time.mktime(parsed.timetuple()))

class FB:

    def __init__(self):
        """Create the scraper's collaborators and store them on the instance."""
        # Build the project's SeleniumServer and keep a direct handle on the
        # driver it exposes, so methods can use self.driver directly.
        browser = SeleniumServer()
        self.selenium = browser
        self.driver = browser.driver

        # Shared objects imported from settings and the tools package.
        self.redis = REDIS
        self.logger = logger

        self.extract = Extract()
        self.epr_time = EPR_TIME

    def start(self, keyword, zong_sums, wsheet1):
        """程序的入口
        :param keyword 搜索的关键字

        """
        # 退出设置
        tc_sum = 1
        sums = zong_sums
        # 输出搜索日志
        self.logger.debug('start: {}'.format(keyword))
        # 打开首页
        home_url = 'https://www.facebook.com/{}'.format(keyword)
        try:
            # 打开facebook
            self.driver.get(home_url)
            # sleep(random.randint(5, 7))
            sleep(random.randint(15, 20))
            print('网页长度',len(self.driver.get(home_url)))
        except:
            pass


        num = 0
        # 进入二级页面的游标，控制指针停在时间
        zz_num = 1
        sleep(random.randint(6, 8))
        # 一共下滑一百次次，下滑一次停顿0.5s
        # 由于是异步加载，所以需要拉到最下方
        # 发布帐号昵称
        zhnc = ''
        # 点击主页
        try:
            self.driver.find_element_by_xpath('/html/body/div[1]/div/div[1]/div/div[3]/div/div/div[1]/div[1]/div[3]/div/div/div/div[1]/div/div/div[1]/div/div/div/div/div/a[1]/div/span').click()
        except:
            self.driver.find_element_by_xpath('/html/body/div[1]/div/div[1]/div/div[3]/div/div/div[1]/div[1]/div/div/div[3]/div/div/div/div[1]/div/div/div[1]/div/div/div/div/div/div/a[1]/div/span').click()
        
        for i in range(1, 20000):
            print(f'*****************************第{i}页***************************')
            # sleep(random.randint(2, 3))
            self.driver.execute_script('window.scrollBy(0,2200)')
            sleep(random.randint(10, 15))

            # 获取网页源码
            html = etree.HTML(self.driver.page_source)
            try:
                all_list = html.xpath(
                    '//div[@data-pagelet="ProfileTimeline"]/div/div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]')
                if len(all_list) == 0:
                    all_list1 = html.xpath('//div[@class="k4urcfbm"]/div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]')
                    if len(all_list1) == 0:
                        all_list1 = html.xpath(
                            '//div[@data-pagelet="ProfileTimeline"]/div/div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]')
                        print("****进入第二方案***", len(all_list1))
                        if len(all_list1) == 1:
                            self.driver.execute_script('window.scrollBy(0,-1800)')
                            sleep(random.randint(5, 8))
                            print('上划一下', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                      "行,下一用户>>>>>>>>>>>>>>>>>")
                                self.driver.close()
                                return sums
                        elif len(all_list1) == 0:
                            print('有置顶帖')
                            all_list1=html.xpath('//div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]')
                            # print(len(all_list1),html.xpath('//div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]//text()'))
                            if len(all_list1) == 0:
                                print('无发帖', tc_sum)
                                tc_sum += 1
                                if tc_sum == 3:
                                    print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                          "行,下一用户>>>>>>>>>>>>>>>>>")
                                    self.driver.close()
                                    return sums                                
                            
                        elif len(all_list1) == 2:
                            print('无发帖', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                      "行,下一用户>>>>>>>>>>>>>>>>>")
                                self.driver.close()
                                return sums
                        elif len(all_list1) == 3:
                            print('发帖3', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                      "行,下一用户>>>>>>>>>>>>>>>>>")
                                self.driver.close()
                                return sums
                        da_list1 = all_list1[:-3]
                        for all_li in da_list1[num:len(da_list1) + 1]:
                            # 临时储存数据的列表
                            list_shuju = []
                            # 账号昵称 //div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()
                            try:
                                zhnc = all_li.xpath(
                                    './/div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()')[0]
                                print(f'发布帐号昵称:{zhnc}')
                                list_shuju.append(zhnc)
                            except:
                                zhnc = all_li.xpath(
                                    '//span[@class="nc684nl6"]//text()')[0]
                                print(f'发布帐号昵称1:{zhnc}')
                                list_shuju.append(zhnc)

                            # 发布账号id
                            print('发布账号id', keyword)
                            list_shuju.append(keyword)

                            # 发布时间//div[@class="buofh1pr"]/div/div[2]/span/span/span[2]//text()
                            if zz_num==1:
                                Time_list=all_li.xpath(
                                    f'//div[1]/div[{zz_num}]/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a//text()'
                                    )
                                if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                    Time_list = all_li.xpath(
                                        './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[2]//text()')
                                    if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                        Time_list = all_li.xpath(
                                            './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[3]//text()')

                            else:
                                Time_list = all_li.xpath(
                                    f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a'
                                        # f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span//a//text()'
                                    )
                                    
                                if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                    Time_list = all_li.xpath(
                                        './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[2]//text()')
                                    if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                        Time_list = all_li.xpath(
                                            './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[3]//text()')
                              
          
                            print(Time_list[-1])
                            time1 = []
                            for time2 in Time_list:
                                if time2 == '=':
                                    pass
                                else:
                                    time1.append(time2)
                            time1 = time1[0].replace('2022年','')
                            time1=int(time_turns(time1))
                            print("********发布时间*****", time_turn(str(time1)))
                            if time1>=1643644800:
                                time1=time_turn(str(time1))
                                list_shuju.append(time1)
                            elif time1<1643644800 and da_list1.index(all_li) == 0:
                                pass
                            elif time1<1643644800 and da_list1.index(all_li) != 0:
                                self.driver.close()
                                return sums
                            else:
                                
                                print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                      "行,下一用户>>>>>>>>>>>>>>>>>")
                                self.driver.close() 
                            
                            # 内容.//div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div[2]/div/div[3]//text()
                            content_data = all_li.xpath(
                                './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div[2]/div/div[3]//text()')
                            text = ''
                            for data in content_data:
                                if data == '=':
                                    continue
                                else:
                                    text += data
                            # print('内容数据：', text)
                            list_shuju.append(text)

                            # 发布账号主页链接
                            list_shuju.append(home_url)

                            try:
                                ir_nresrved1 = \
                                    all_li.xpath(
                                        './/span[@class="gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim"]/span/span/text()')[
                                        0]
                            except:
                                ir_nresrved1 = []
                            try:
                                comments_data = all_li.xpath(
                                    './/div[@class="bp9cbjyn j83agx80 pfnyh3mw p1ueia1e"]/div//span//text()')
                            except:
                                comments_data = []
                            print(f'点赞：{ir_nresrved1},评论转发： {comments_data}')
                            if len(comments_data) == 0:
                                comments(comments_data)

                            elif len(comments_data) == 1:
                                comments(comments_data)
                            elif comments_data[1][-3:] == comments_data[0][-3:]:
                                comments_data = comments_data[0]
                                comments(comments_data)
                            else:
                                comments_data = ''.join(comments_data[0] + comments_data[1])
                                comments(comments_data)
                            if len(comments_data)==0:
                                ir_nresrved2=0
                                ir_nresrved3=0
                            else:
                                ir_nresrved2 = comments(comments_data)[0].replace(',', '')
                                ir_nresrved3 = comments(comments_data)[1].replace(',', '')

                            if ir_nresrved1 == []:
                                ir_nresrved1 = 0
                            else:
                                ir_nresrved1 = (str(ir_nresrved1)).replace('xa0', '').replace(',', '').replace(' ', '')
                                if '万' in ir_nresrved1:
                                    ir_nresrved1 = ir_nresrved1.replace('万', '')
                                    ir_nresrved1 = int(float(ir_nresrved1) * 10000)
                            print(ir_nresrved1, ir_nresrved2, ir_nresrved3)
                            ir_nresrved1 = int(str(ir_nresrved1))
                            ir_nresrved2 = int(str(ir_nresrved2))
                            ir_nresrved3 = int(str(ir_nresrved3))
                            print(type(ir_nresrved1), type(ir_nresrved2), type(ir_nresrved3))
                            list_shuju.append(ir_nresrved1)
                            list_shuju.append(ir_nresrved2)
                            list_shuju.append(ir_nresrved3)

                            print('+++list_shuju+++', list_shuju)
                            row = 'A' + str(sums)
                            wsheet1.write_row(row, list_shuju)
                            print('--------------第', num + 1, '条数据！--------------------')
                            print('--------------写入到第', sums, '行数据！--------------------')
                            num += 1
                            sums += 1
                    else:
                        print('all_list1', len(all_list1))
                        print("****进入第三方案***")
                        if len(all_list1) == 0:
                            tc_sum += 1
                            if tc_sum == 3:
                                break
                        da_list1 = all_list1[:-3]
                        for all_li in da_list1[num:len(da_list1) + 1]:
                            # 临时储存数据的列表
                            list_shuju = []
                            # 账号昵称 //div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()
                            try:
                                zhnc = all_li.xpath(
                                    './/div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()')[0]
                                print(f'发布帐号昵称:{zhnc}')
                                list_shuju.append(zhnc)
                            except:
                                zhnc = all_li.xpath(
                                    '//span[@class="nc684nl6"]//text()')[0]
                                print(f'发布帐号昵称1:{zhnc}')
                                list_shuju.append(zhnc)
                            # 发布账号id
                            print('发布账号id', keyword)
                            list_shuju.append(keyword)
                            Time_list = all_li.xpath(
                                    f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span//a//text()'
                                )
                            if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                Time_list = all_li.xpath(
                                    './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[2]//text()')
                                if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                    Time_list = all_li.xpath(
                                        './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[3]//text()')
                            print(Time_list)
                            time1 = []
                            for time2 in Time_list:
                                if time2 == '=':
                                    pass
                                else:
                                    time1.append(time2)
                            time1 = time1[0].replace('2022年','')
                            time1=int(time_turns(time1))
                            print("********发布时间*****", time_turn(str(time1)))
                            if time1>=1643644800:
                                time1=time_turn(str(time1))
                                list_shuju.append(time1)
                            elif time1<1643644800 and da_list1.index(all_li) == 0:
                                pass
                            elif time1<1643644800 and da_list1.index(all_li) != 0:
                                self.driver.close()
                                return sums
                            else:
                                
                                print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                      "行,下一用户>>>>>>>>>>>>>>>>>")
                                self.driver.close()                           


                            # 修改
                            # 先判断内容中是否有查看更多
                            # 有鼠标单击 //div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]//div[@dir='auto']//div[@class='qzhwtbm6 knvmm38d']/span/div/div/div
                            content_data = all_li.xpath(
                                './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div[2]/div/div[3]//text()')

                            text = ''
                            for data in content_data:
                                if data == '=':
                                    continue
                                elif 'ownloa' in data:
                                    continue
                                else:
                                    text += data
                            list_shuju.append(text)
                            # 发布账号主页链接
                            list_shuju.append(home_url)
                            # 点赞
                            try:
                                ir_nresrved1 = \
                                    all_li.xpath(
                                        './/span[@class="gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim"]/span/span/text()')[0]
                            except:
                                ir_nresrved1 = []
                            try:
                                comments_data = all_li.xpath(
                                    './/div[@class="bp9cbjyn j83agx80 pfnyh3mw p1ueia1e"]/div//span//text()')
                                if len(comments_data) == 0:
                                    comments_data = ['0 個回應', '0次分享']
                            except:
                                comments_data = ['0 個回應', '0次分享']
                            print(f'点赞：{ir_nresrved1},评论转发： {comments_data}')
                            if len(comments_data) == 0:
                                comments(comments_data)

                            elif len(comments_data) == 1:
                                comments(comments_data)
                            elif comments_data[1][-3:] == comments_data[0][-3:]:
                                comments_data = comments_data[0]
                                comments(comments_data)
                            else:
                                comments_data = ''.join(comments_data[0] + comments_data[1])
                                comments(comments_data)
                            if len(comments_data)==0:
                                ir_nresrved2=0
                                ir_nresrved3=0
                            else:
                                ir_nresrved2 = comments(comments_data)[0].replace(',', '')
                                ir_nresrved3 = comments(comments_data)[1].replace(',', '')

                            if ir_nresrved1 == []:
                                ir_nresrved1 = 0
                            else:
                                ir_nresrved1 = (str(ir_nresrved1)).replace('xa0', '').replace(',', '').replace(' ', '')
                                if '万' in ir_nresrved1:
                                    ir_nresrved1 = ir_nresrved1.replace('万', '')
                                    ir_nresrved1 = int(float(ir_nresrved1) * 10000)
                            print(ir_nresrved1, ir_nresrved2, ir_nresrved3)
                            list_shuju.append(ir_nresrved1)
                            list_shuju.append(ir_nresrved2)
                            list_shuju.append(ir_nresrved3)
                            print('+++list_shuju+++', list_shuju)
                            row = 'A' + str(sums)
                            wsheet1.write_row(row, list_shuju)
                            print('--------------第', num + 1, '条数据！--------------------')
                            print('--------------写入到第', sums, '行数据！--------------------')
                            num += 1
                            sums += 1
                            zz_num += 1

                else:
                    # //div[@data-pagelet="ProfileTimeline"]/div/div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]
                    print("进入第一方案！！")
                    print('all_list', len(all_list))
                    if len(all_list)==1:
                        all_list=html.xpath('//div[@data-pagelet="ProfileTimeline"]//div[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]')
                    if len(all_list)<4:
                        self.driver.close()
                        return sums

                    da_list1 = all_list[:-3]

                    # da_list1 = all_list
                    for all_li in da_list1[num:len(da_list1) + 1]:
                        # 临时储存数据的列表
                        list_shuju = []
                        # 账号昵称 //div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()
                        try:
                            zhnc = all_li.xpath(
                                './/div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()')[0]
                            print(f'发布帐号昵称:{zhnc}')
                            list_shuju.append(zhnc)
                        except:
                            zhnc = all_li.xpath(
                                '//span[@class="nc684nl6"]//text()')[0]
                            print(f'发布帐号昵称1:{zhnc}')
                            list_shuju.append(zhnc)
                        # 发布账号id
                        print('发布账号id', keyword)
                        list_shuju.append(keyword)

                        Time_list = all_li.xpath(
                                f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span//a//text()'
                            )
                        if Time_list == [] or Time_list == ['\xa0', ' · ']:
                            Time_list = all_li.xpath(
                                './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[2]//text()')
                            if Time_list == [] or Time_list == ['\xa0', ' · ']:
                                Time_list = all_li.xpath(
                                    './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span/span[3]//text()')
                            if Time_list==[]:
                                print('不是本人发布，不采集')
                                continue
                                
                        print(Time_list)
                        time1 = []
                        for time2 in Time_list:
                            if time2 == '=':
                                pass
                            else:
                                time1.append(time2)
                        time1 = time1[0].replace('2022年','')
                        
                        time1=int(time_turns(time1))
                        print("********发布时间*****", time_turn(str(time1)))
                        if time1>=1643644800:
                            time1=time_turn(str(time1))
                            list_shuju.append(time1)
                        elif time1<1643644800 and da_list1.index(all_li) == 0:
                            pass
                        elif time1<1643644800 and da_list1.index(all_li) != 0:
                            self.driver.close()
                            return sums
                        else:
                            
                            print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
                                  "行,下一用户>>>>>>>>>>>>>>>>>")
                            self.driver.close()                        
                  

                        content_data = all_li.xpath(
                            './/div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd m5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]/div/div[2]/div/div[3]//text()')

                        text = ''
                        for data in content_data:
                            if data == '=':
                                continue
                            elif 'ownloa' in data:
                                continue
                            else:
                                text += data
                        list_shuju.append(text)
                     
                        # 发布账号主页链接
                        list_shuju.append(home_url)
                        # 点赞
                        try:
                            ir_nresrved1 = \
                                all_li.xpath(
                                    './/span[@class="gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim"]/span/span/text()')[0]
                        except:
                            ir_nresrved1 = []
                        try:
                            comments_data = all_li.xpath(
                                './/div[@class="bp9cbjyn j83agx80 pfnyh3mw p1ueia1e"]/div//span//text()')
                            if len(comments_data) == 0:
                                comments_data = ['0 個回應', '0次分享']
                        except:
                            comments_data = ['0 個回應', '0次分享']
                        print(f'点赞：{ir_nresrved1},评论转发： {comments_data}')
                        if len(comments_data) == 0:
                            comments(comments_data)

                        elif len(comments_data) == 1:
                            comments(comments_data)
                        elif comments_data[1][-3:] == comments_data[0][-3:]:
                            comments_data = comments_data[0]
                            comments(comments_data)
                        else:
                            comments_data = ''.join(comments_data[0] + comments_data[1])
                            comments(comments_data)
                        if len(comments_data)==0:
                            ir_nresrved2=0
                            ir_nresrved3=0
                        else:
                            ir_nresrved2 = comments(comments_data)[0].replace(',', '')
                            ir_nresrved3 = comments(comments_data)[1].replace(',', '')

                        if ir_nresrved1 == []:
                            ir_nresrved1 = 0
                        else:
                            ir_nresrved1 = (str(ir_nresrved1)).replace('xa0', '').replace(',', '').replace(' ', '')
                            if '万' in ir_nresrved1:
                                ir_nresrved1 = ir_nresrved1.replace('万', '')
                                ir_nresrved1 = int(float(ir_nresrved1) * 10000)
                        print(ir_nresrved1, ir_nresrved2, ir_nresrved3)
                        list_shuju.append(ir_nresrved1)
                        list_shuju.append(ir_nresrved2)
                        list_shuju.append(ir_nresrved3)
                        print('+++list_shuju+++', list_shuju)
                        row = 'A' + str(sums)
                        wsheet1.write_row(row, list_shuju)
                        print('--------------第', num + 1, '条数据！--------------------')
                        print('--------------写入到第', sums, '行数据！--------------------')
                        num += 1
                        sums += 1
                        zz_num += 1

            except  Exception as e:
                print(f'提取有误！:{e}')
                break
        self.driver.close()

if __name__ == '__main__':
    # Script entry point: run FB().start() for every account in keyword_list
    # and collect the results in a single Excel workbook.
    # Only one keyword_list assignment is active at a time; the commented-out
    # lists below are alternative account groups kept for reuse.

    # Faculty Office -- European teacher volunteers
    # keyword_list = ['daniela.marieiragii.9', 'donna.rice.3511', 'echo.guo.334', 'graceweihair', 'hahahoho2299',
    #                 'jessica.chu.982292', 'linlin.jubujubu', 'mjeanas', 'ping.cui.5876', 'ppanan.liu',
    #                 'Rebecca.Niu.505', 'suku.bee', 'sunny.qiao.50', 'yang.mi.7758', 'yanli.ren.9',
    #                 'profile.php?id=100014285328747', 'celine.jiang.10', 'judyyye.lyu.3', 'jane.chinese.1',
    #                 'jing.pan.79', 'jinxiu.wang.754', 'liping.liu.370', 'min.fan.9', 'profile.php?id=100022188851437',
    #                 'profile.php?id=100029100170030', 'xiangyi.tanglan', 'li.guodong.50', 'han.qi.102',
    #                 'stefano.shi.509', 'ZhuGuizhi', 'yuge.fu',  'weizheng.soon.16',
    #                 'profile.php?id=100035897914640', 'lanyu.huang.12',
    #                 'feifei.guo.399',  'qian.yin.7921',
    #
    #                 'profile.php?id=100000264646084',
    #                   'jingzhou.wang', 'profile.php?id=100001713489629',  'profile.php?id=100002055568754',
    #                 'profile.php?id=100048763005166',  'zixun',
    #                 ]

    # # European media
    # keyword_list = ['uclm.es', 'bsuby', 'ikopole', 'instytutkonfucjusza', 'IstitutoConfucioDiMilano', 'icpp.fr',
    #                 'KonfuziusInstitutNuernbergErlangen', 'clasaconfucius.ovidius', 'InstitutoConfucioUMinho',
    #                 'ConfuciusInstituteMunich', 'KonfucijevInstitut.UNIZG', 'ConfuciusInstituteUBB',
    #                 'confuciusmaastricht', 'confuciusinstitute.galway', 'KonfuziusInstitutErfurt', 'kiunibl.org',
    #                 'IstitutoConfucioUnimc', 'konfuziusinstitutleipzig', 'civspu', 'ICdeLaReunion',
    #                 'BrookesConfuciusInstitute', 'konfucjuszUG', 'chinainmiskolc', 'InstitutConfuciusFinistere',
    #                 'InstitutoConfucioUC', 'www.ciut.edu.al', 'Instytut-Konfucjusza-UAM-w-Poznaniu-271081649596487',
    #                 'IstitutoConfuciodiRoma', 'konfuziusinstitut', 'CIatGlasgowUni', 'profile.php?id=100009360645905',
    #                 'pecsikonfuciuszintezet', 'ConfuciusMCR', 'Vilniaus.universiteto.Konfucijaus.institutas',
    #                 'IC.ULPGC', 'institutconfuciusmontpellier', 'confucioule', 'IstitutoConfucioUCSC', 'ConfucioUniTo',
    #                 'IstitutoConfuciodiPisa', 'bangorconfuciusinstitute', 'BCIUL', 'InstitutConfuciusdesPaysdelaLoire',
    #                 'KonfuciuvInstitut', 'konfucius.vsfs', 'KonfuziusInstitutFrankfurt',
    #                 'profile.php?id=100057530954564', 'Institut-Confucius-de-Bretagne-127321330622202',
    #                 'Школа-Конфуций-346940955894652', 'InstitutoConfucio.UP', 'confuciusinstituteaberdeen',
    #                 'Groningen-Confucius-Institute-108520622564327', 'institutulconfuciusbucuresti',
    #                 'NEOMACONFUCIUSINSTITUTEFORBUSINESS', 'institutkonfucij.mk',
    #                 'Institut-Confucius-de-Liège-154641651216924', 'ConfuciusCovUni', 'um.confucius',
    #                 'confucius.instituteucc.1', 'IstitutoConfucioUnipd', 'Confucioenna', 'istituto.confucio.napoli',
    #                 'Konfuciov-In%C5%A1tit%C3%BAt-v-Bratislave-1699960763610272',
    #                 'Konfuzius-Institut-an-der-Universit%C3%A4t-Freiburg-eV-641341015921819',
    #                 'szegedikonfuciuszintezet',
    #                  'profile.php?id=100054235160523','confucius.institute.si','istitutoconfuciofirenze',
    #                  'cilulv','konfuziusinstitutBonn','konfuziusinstituthamburg',
    #                   'Konfuzius-Institut-an-der-Universit%C3%A4t-Heidelberg-e-V-787435294604775',
    # ]
    # Asia-Africa Office -- official-website Facebook data
    # keyword_list = [
    # 'cu.edu.eg/?ref=search&__tn__=%2Cd%2CP-R&eid=ARDPb5hZJeNaG6s3irgLw6XRg9oSAgZ7Y95c2Eoss60dCHkqLdJ-No3tE4-HLDiQetoHUL86pksBAYyU',
    # 'KZIUM/', 'AUFconfucius/', 'oshci', 'pjkyhyst', 'Confucius-PSU-Hatyai-637584140021699/', 'confucius.ismailia',
    # 'Confucius-Institute-UMS-108726743828499/', 'kzxyqm', 'confuciu8/',
    # 'yguinternationalcenter/?__tn__=%2Cd%2CP-R&eid=ARBMc5h6kj6s5TUqQYTvi1H5sXCzbjkgi9EUlC5DwKpCN-eKqMIuv0X-_35EhI6vFwdrQTV5P_wKQkEY',
    # 'www.uae.ma/',
    # '%E5%B7%B4%E5%BA%93%E5%9B%BD%E7%AB%8B%E5%A4%A7%E5%AD%A6%E5%AD%94%E5%AD%90%E5%AD%A6%E9%99%A2-100606824762987/',
    # 'cisdus.suphanburi',
    # '%E0%BA%AA%E0%BA%B0%E0%BA%96%E0%BA%B2%E0%BA%9A%E0%BA%B1%E0%BA%99%E0%BA%82%E0%BA%BB%E0%BA%87%E0%BA%88%E0%BA%B7-%E0%BA%A1%E0%BA%8A-538306683255260/?__tn__=%2Cd%2CP-R&eid=ARDue1VjE-JNr-BI59WFNFZEGI98QZ6w2eROHK7wAEezRyQwa9_ZlUixj7VnE-i4-VKB9aThhnAmyuth',
    # 'CI.Assumption/?__tn__=%2Cd%2CP-R&eid=ARCAlblYZvxv0yhgz57IF-_bjBF0z21I0WynB44K5lFL1l2d7dIPwZTpfNhmvFhSOtHFdTN5Smfm3DZ7',
    # 'ADU-n%C9%99zdind%C9%99-Konfutsi-%C4%B0nstitutu-154717901724865/', 'Segiconfucius/',
    # 'Confucius-Institute-at-University-of-Cape-Coast-121849225138873', 'ConfuciusKU/', 'uob.institutconfucius',
    # 'confucius.ainshams/',
    # '%E5%B1%B1%E6%A2%A8%E5%AD%A6%E9%99%A2%E5%A4%A7%E5%AD%A6%E5%AD%94%E5%AD%90%E5%AD%A6%E9%99%A2-108255324183669',
    # 'ateneoconfucius/?ref=br_rs', 'Confucius-Institute-at-University-of-Liberia-113582253664745/',
    # 'ConfuciusInstituteUoM/?__tn__=%2Cd-%5C-R&eid=ARAGudpmXVzi8mQSkSJPCr1LwAVgGNljBR6bURogEvYaBac38TsI0DR19pLMhb-XhFCTsN2GqfyVtlxt',
    # 'CIRAC.edu/', 'kungzitau/', 'umpmlcc',
    # 'cinsubd/?__tn__=%2Cd%2CP-R&eid=ARBaELSZvuqq2d7Z2AJbsr1a4udB83RPhSJeJv7hC2q2NtVXGw6i96PkqFNAu7McSi9PhrH_xjU6N0wD',
    # 'ConfuciusInstituteNUM', 'Institut-Confucius-de-lAcad%C3%A9mie-Diplomatique-Congolaise-100658168167864/',
    # 'madakongyuan', 'TMCKZKT2006/', 'Institut-Confucius-de-lUniversit%C3%A9-dAntananarivo-2111644015571322',
    # 'Mandarim-na-CV-319869092135514/', 'cscc66/',
    # 'ConfuciusUniversityofNairobi/?__tn__=%2Cd%2CP-R&eid=ARAn05_8u1wFIgok3127lQg4l7vQPbbUHrKgE0i4LnJhr7xbfoAglKmXYpG5stBsKIvi3rsJdEXzCs1K',
    # 'CIUOC/', 'haishangsilu/', 'ciuaf/', 'TAGConfucius/', 'ciub.botswana.9', 'UJCI1/']
    # Volunteer Office Facebook data
    # keyword_list = ['kunthea.yan.10', 'lu.hong.94214', 'tan.jue.35', 'LowProfileLuxurious', '100029205681446', 'MR.yaoyang',
    #             'cheng.kim.311', 'lingling.he.14', 'sebastian.paguyan', 'louis.wang.921', 'claire.zhao.902',
    #             'chao.yang.3958', 'Fuyu-187988891256633', 'JooliaWang', 'htetnaing.htun.522', 'hai.chang.31',
    #             'ricki.lei.9', 'anna.deng.735', 'lucy.luan.319', '100040814273360', 'monica.ch.5811', 'liu.h.hui.73',
    #             'wiley.lee.7', 'mary.juanma.1', 'xia.han.9', 'ying.mnre', 'chunrong.zhou.3', 'cinsubd', 'tarquin.wang',
    #             '100023579753735', 'Fionawu3698', '100016457490430', 'wenxuan.zuo.73', 'huafang.shen',
    #             '100006981612725', 'zoe.wang1', 'li.xinyi.3', 'gloriatzen', 'cheli.sag', 'fei.sun.731135',
    #             'xia.chang.9237']
    # Americas & Oceania region -- Confucius Institute directors' personal-media Facebook data
    # keyword_list = ['ana.qiao', 'jun.du.334491', 'renyan.li.18', 'dong.hongle.9', 'sofia.mazheng', 'xiaofen.bi.7',
    #
    #                 'zheng.fu.1213', 'humanhairextesion', 'UBA.Confucio', 'confucio.ufrgs',
    #                 'profile.php?id=100015822002088',
    #                 'CIAUCKLAND?__tn__=%2Cd%2CP-R&eid=ARDCG43dUTqfqXZqmYHR89HXZcqa2znznIS_sx6ZM42Z230XY4p8uGDolSQRZrktnGrFvwTyhZJeOKFI',
    #                 'ConfuciusInstituteWellington', 'UNSWCI',
    #                 'uonconfucius?ref=search&__tn__=%2Cd%2CP-R&eid=ARBgizLv2VBU7mRs0cdC9DsqVXDxgVuk8dytHCUX7F_oFdO6fixTkmcAxN9RhrdCAro1E81BaJVnDh9t']
    # Asia-Africa Office -- Chinese-side directors' personal-media Facebook data
    # keyword_list = ['tsogzolmaa.erdenebayar', 'rangsri.yang', 'miyya.zhang.5', 'yongkang.wang.184']
    # Faculty Office -- Asia-Africa (government-dispatched channel) Facebook data
    # keyword_list = ['profile.php?id=100004607378501', 'profile.php?id=100007066143774',
    #                 'profile.php?id=100038107328752']
    # Faculty Office -- Asia-Africa (outreach channel) Facebook data
    # keyword_list = ['yang.jin.50309', 'rob.mar.71653', 'xu.ma.315', 'jiannisjiang', 'confuciusbuu',
    #                 'vanessa.chen.58760', 'tao.feng.948494', 'feifei.dai.1', 'shenghua.zhang.16', 'qingyi.chen.7',
    #                 'gongcuiyun.megan']
    # Faculty Office -- Americas & Oceania teachers' personal-media Facebook data
    # keyword_list = ['lina.zhang.737001', 'lindawang3112', 'ChinaDoll4ever', 'SpooKPryme', 'mei.hu.5', 'ping.wang.1800',
    #                 'helen.lee.796774', 'zhili.chen.50', 'cheermyself3', 'xiao.hu.37853', 'victoria.yl.1',
    #                 'maureen.magiera', 'fabianaxm','profile.php?id=100003391386920','profile.php?id=4946315']

    # Accounts to scrape in this run.
    keyword_list = ['xia.chang.9237']
    # Index of the first account to process. The original sliced
    # keyword_list[0:]; presumably the 0 was edited by hand to resume a run.
    start_index = 0

    # Write to Excel. The workbook is created outside the try block so that a
    # failure here surfaces as itself rather than as a NameError on `wbook`
    # during cleanup.
    wbook = xlsxwriter.Workbook('9327.xlsx')
    try:
        # Create the worksheet
        wsheet1 = wbook.add_worksheet('Sheet1')
        title = ['发布账号昵称', '发布账号ID', '发布时间', '内容', '主页链接', '点赞数', '评论数', '分享数']
        wsheet1.write_row('A1', title)  # write the header starting at cell A1
        # Running counter shared by all accounts (original note: total post
        # count of all users in the sheet). It is passed to start() and replaced
        # by start()'s return value; presumably it is the next worksheet row to
        # fill, starting at 2 because row 1 holds the header.
        # NOTE(review): start() appears to fall through without a return value
        # when its scraping loop breaks on an error, which would leave
        # zong_sums as None for the following accounts -- TODO confirm.
        zong_sums = 2
        last_index = len(keyword_list) - 1
        # enumerate() gives the position directly; list.index() was an O(n)
        # lookup per account and reported the wrong index for duplicates.
        for index, keyword in enumerate(keyword_list[start_index:], start=start_index):
            print(
                f"---------------------------------------------索引：{index}------------------------------------------------")

            fb_server = FB()
            zong_sums = fb_server.start(keyword, zong_sums, wsheet1)
            # Random pause between accounts; skipped after the last one because
            # there is no further request to space out.
            if index < last_index:
                sleep(random.randint(15, 30))
            print('写入完成！', zong_sums)
    except KeyboardInterrupt:
        # Only a real Ctrl+C is reported as a manual stop; any other exception
        # propagates with its traceback once the workbook has been closed.
        print("人工终止！")
    finally:
        # Always close the workbook so rows collected so far are saved.
        wbook.close()
posted @ 2021-11-30 18:00 布都御魂阅读(162) 评论(0) 编辑收藏举报
刷新页面返回顶部
布都御魂

孔子学院

公告