孔子学院
"""
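author: Zhang Xin
date: 2022/02/15 13:14
Fields collected: publishing account nickname, publishing account ID, publish time, content, post link, account home-page link, like count, comment count, share count
Runtime environment: the Chrome browser and the selenium driver only need to be matching versions
Can be wired into the company database as needed
keyword_list = ['tsaiingwen'] holds the accounts (government, people, political parties) to collect; tsaiingwen is Tsai Ing-wen's account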
"""
import locale
# -*- coding=utf-8 -*-
# 导入模块
import os
import random
import re
import sys
import time
from time import sleep
from selenium.webdriver import ActionChains
import pymongo
import xlsxwriter
from lxml import etree
def _extract_count(text, label):
    """Return the number written directly before *label* as a digit string.

    Recognises thousands separators ("1,024") and the "万" (x10,000)
    multiplier ("1.2万").  Returns '0' when *label* is not preceded by a
    number anywhere in *text*.
    """
    match = re.search(r'(\d[\d,]*(?:\.\d+)?)\s*(万?)\s*' + label, text)
    if match is None:
        return '0'
    number = match.group(1).replace(',', '')
    if match.group(2):
        # round() rather than int(): float noise must not truncate a count.
        return str(round(float(number) * 10000))
    return number


def comments(comments_data):
    """Extract the comment and share counts from a post's footer text.

    :param comments_data: a string, or an iterable of string fragments that
        are concatenated, e.g. ``['5 個回應', '3次分享']``.  Traditional
        Chinese wording is normalised to the simplified labels first.
    :return: ``(comment_count, share_count)``.  For empty input this is the
        integer pair ``(0, 0)`` (unchanged from the previous behaviour).
        Otherwise both items are digit-only strings, so callers can safely
        apply string methods and ``int()`` to them; previously a count that
        used "万" came back as ``int`` and broke callers that call
        ``.replace`` on the result.
    """
    text = ''.join(comments_data)
    if text == '':
        return 0, 0
    # Normalise traditional wording to the labels searched for below.
    text = (text.replace('回應', '评论')
                .replace('個', '条')
                .replace('轉發', '分享')
                .replace('萬', '万'))
    # Only the first occurrence of each label is used, so text that is
    # rendered twice no longer yields concatenated digits.
    return _extract_count(text, '条评论'), _extract_count(text, '次分享')
# Chinese locale support.
# NOTE(review): 'chinese' looks like a Windows-style locale name; presumably
# this raises locale.Error on platforms that lack it -- confirm the target OS.
locale.setlocale(locale.LC_CTYPE, 'chinese')
# Connect to the database.
# NOTE(review): the client is created at import time with a hard-coded host;
# `db` and `kzxy_list` are not referenced in the visible part of this file.
database = pymongo.MongoClient('192.168.1.103', port=27017)
db = database['facebook']
kzxy_list = db['kzxy']
# Add this file's directory to the module search path before the
# project-local imports below (presumably so `tools` and `settings` resolve
# regardless of the working directory -- confirm).
sys.path.append(os.path.dirname(__file__))
from tools.logger_server import logger
from tools.selenium_server import SeleniumServer
from tools.extract import Extract
from settings import REDIS, EPR_TIME
import datetime
# `re` and `time` are already imported near the top of the file; these
# repeated imports are harmless.
import re
import time
def today_start():
    """Return the Unix timestamp of today's local midnight as an int."""
    return int(time.mktime(datetime.date.today().timetuple()))
def time_turn(timenum):
    """Format a Unix-timestamp string as local ``YYYY-MM-DD HH:MM:SS``.

    :param timenum: the timestamp as a string of 1 to 10 digits, or ``'0'``.
    :return: ``'0'`` for ``'0'``; the formatted local time for a valid digit
        string; ``None`` (after printing a hint) for anything else.
    """
    if timenum == '0':
        return '0'
    if not (0 < len(timenum) < 11 and timenum.isdigit()):
        print('请输入11位以内的数字')
        return None
    moment = time.localtime(int(timenum))
    return time.strftime("%Y-%m-%d %H:%M:%S", moment)
def time_turns(time1, year=2022):
    """Convert a Facebook post-time label into a local Unix timestamp.

    Supported labels, checked in this order:

    * ``昨天 HH:MM`` (optionally followed by ``發佈...``): yesterday at that
      clock time
    * ``昨天``: 24 hours ago
    * ``刚刚`` / ``剛剛``: now
    * ``N天前``, ``N小时`` / ``N小時``, ``N分钟`` / ``N分鐘``, ``N天``:
      that long ago (a bare number is read as hours, as before)
    * ``M月D日`` with an optional ``HH:MM``: that date in *year*
    * ``<year>年M月D日`` with an optional ``HH:MM``: that date

    :param time1: the label text.
    :param year: the year assumed for labels without one, and the only year
        accepted for labels that carry one.  Defaults to 2022.
    :return: an ``int`` timestamp; the string ``'0'`` when *time1* is
        ``'0'`` (kept for existing callers); ``0`` when the label is not
        recognised or belongs to another year.
    :raises ValueError: when a date-like label cannot be parsed.
    """
    now = time.time()

    if time1.startswith('昨') and len(time1) > 2:
        clock = time1.split('發佈')[0].split('天')[-1].split(':')
        return (today_start() - 24 * 3600) + int(clock[0]) * 3600 + int(clock[1]) * 60
    if time1 == '昨天':
        return int(now) - 24 * 3600
    if time1 in ('刚刚', '剛剛'):
        return int(now)
    if '天前' in time1:
        return int(now) - int(time1.split('天')[0]) * 3600 * 24
    if time1 == '0':
        return time1

    # Relative labels: the text before the unit must be an integer.
    units = (('小时', 3600), ('小時', 3600), ('分钟', 60), ('分鐘', 60), ('天', 86400))
    for unit, seconds in units:
        try:
            amount = int(time1.split(unit)[0])
        except ValueError:
            continue
        return int(now - amount * seconds)

    # Absolute labels.  The month may have one or two digits; the previous
    # check only looked at the second character, so October to December
    # were misreported as "not this year".
    if re.match(r'\d{1,2}月', time1):
        stamp = f'{year}-' + time1.replace('月', '-').replace('日', ' ')
        clock_allowed = time1.count('月') == 1
    elif f'{year}年' in time1:
        stamp = time1.replace('年', '-').replace('月', '-').replace('日', ' ')
        clock_allowed = True
    else:
        print('不是今年的数据,不采集')
        return 0

    # Complete the time of day: add seconds to "HH:MM", else use midnight.
    stamp += ':00' if (clock_allowed and ':' in stamp) else '00:00:00'
    stamp = stamp.replace(' :', ':')
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    return int(time.mktime(parsed.timetuple()))
class FB:
    """Selenium-driven scraper for the timeline of one Facebook account.

    ``start`` opens the account page, scrolls it, parses the posts with
    lxml and writes one worksheet row per post.

    NOTE(review): the source this class was restored from had lost its
    indentation.  The nesting below is the most plausible reading; check it
    against version control before relying on it.
    """

    # Posts published before this Unix timestamp end the crawl
    # (1643644800 is 2022-02-01 00:00:00 at UTC+8).
    _CUTOFF = 1643644800

    # XPath fragments.  The obfuscated class names are copied verbatim.
    _POST = '[@class="du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"]'
    _TIMELINE_POSTS = '//div[@data-pagelet="ProfileTimeline"]/div/div' + _POST
    _TIMELINE_POSTS_DEEP = '//div[@data-pagelet="ProfileTimeline"]//div' + _POST
    _ALT_POSTS = '//div[@class="k4urcfbm"]/div' + _POST
    _ANY_POSTS = '//div' + _POST
    _ARTICLE = ('.//div[@class="rq0escxv l9j0dhe7 du4w35lb hybvsw6c io0zqebd '
                'm5lcvass fbipl8qg nwvqtn77 k4urcfbm ni8dbmo4 stjgntxs sbcfpzgs"]')
    _TIME_HEAD = _ARTICLE + '/div/div/div/div[2]//div[@class="buofh1pr"]/div/div[2]/span/span'
    _TIME_SPAN2 = _TIME_HEAD + '/span[2]//text()'
    _TIME_SPAN3 = _TIME_HEAD + '/span[3]//text()'
    _CONTENT = _ARTICLE + '/div/div[2]/div/div[3]//text()'
    _NICK = './/div[@class="qzhwtbm6 knvmm38d"]/span/h2/span/a/strong/span/text()'
    _NICK_FALLBACK = '//span[@class="nc684nl6"]//text()'
    _LIKES = './/span[@class="gpro0wi8 cwj9ozl2 bzsjyuwj ja2t1vim"]/span/span/text()'
    _COMMENT_SHARE = './/div[@class="bp9cbjyn j83agx80 pfnyh3mw p1ueia1e"]/div//span//text()'
    _HOME_TAB = ('/html/body/div[1]/div/div[1]/div/div[3]/div/div/div[1]/div[1]/div[3]'
                 '/div/div/div/div[1]/div/div/div[1]/div/div/div/div/div/a[1]/div/span')
    _HOME_TAB_ALT = ('/html/body/div[1]/div/div[1]/div/div[3]/div/div/div[1]/div[1]/div/div/div[3]'
                     '/div/div/div/div[1]/div/div/div[1]/div/div/div/div/div/div/a[1]/div/span')

    def __init__(self):
        self.selenium = SeleniumServer()
        self.driver = self.selenium.driver
        self.redis = REDIS
        self.logger = logger
        self.extract = Extract()
        self.epr_time = EPR_TIME

    # ------------------------------------------------------------ helpers
    def _finish(self, num, keyword, sums):
        """Print the end-of-account banner and close the browser window."""
        print("<<<<<<<<共", num, "条数据<<<<<<<<<<<", keyword, "本月数据读取完毕,写入第", sums,
              "行,下一用户>>>>>>>>>>>>>>>>>")
        self.driver.close()

    def _nickname(self, all_li):
        """Return the poster's nickname, trying the page-wide fallback second."""
        try:
            zhnc = all_li.xpath(self._NICK)[0]
            print(f'发布帐号昵称:{zhnc}')
        except Exception:
            zhnc = all_li.xpath(self._NICK_FALLBACK)[0]
            print(f'发布帐号昵称1:{zhnc}')
        return zhnc

    def _time_fallback(self, all_li, Time_list):
        """Retry the publish-time lookup while the result is empty or filler."""
        if Time_list == [] or Time_list == ['\xa0', ' · ']:
            Time_list = all_li.xpath(self._TIME_SPAN2)
        if Time_list == [] or Time_list == ['\xa0', ' · ']:
            Time_list = all_li.xpath(self._TIME_SPAN3)
        return Time_list

    def _published_at(self, Time_list):
        """Turn the first usable time label into an int Unix timestamp."""
        labels = [label for label in Time_list if label != '=']
        stamp = int(time_turns(labels[0].replace('2022年', '')))
        print("********发布时间*****", time_turn(str(stamp)))
        return stamp

    def _post_text(self, all_li, skip_download):
        """Join the post's text nodes, dropping '=' filler.

        With *skip_download* set, fragments containing 'ownloa' are dropped too.
        """
        pieces = all_li.xpath(self._CONTENT)
        return ''.join(
            piece for piece in pieces
            if piece != '=' and not (skip_download and 'ownloa' in piece)
        )

    def _reaction_counts(self, all_li, fill_missing):
        """Return ``(likes, comments, shares)`` for one post.

        With *fill_missing* set, an absent comment/share footer is replaced
        by explicit zero labels.  Likes are 0, a digit string, or an int
        when the text used "万".  Comments and shares are 0 or strings.
        """
        try:
            ir_nresrved1 = all_li.xpath(self._LIKES)[0]
        except Exception:
            ir_nresrved1 = []
        try:
            comments_data = all_li.xpath(self._COMMENT_SHARE)
            if fill_missing and len(comments_data) == 0:
                comments_data = ['0 個回應', '0次分享']
        except Exception:
            comments_data = ['0 個回應', '0次分享'] if fill_missing else []
        print(f'点赞:{ir_nresrved1},评论转发: {comments_data}')

        # Only the first two fragments are used; a repeated fragment is
        # collapsed instead of being counted twice.
        if len(comments_data) >= 2:
            if comments_data[1][-3:] == comments_data[0][-3:]:
                comments_data = comments_data[0]
            else:
                comments_data = comments_data[0] + comments_data[1]

        if len(comments_data) == 0:
            ir_nresrved2 = 0
            ir_nresrved3 = 0
        else:
            counts = comments(comments_data)
            # str() guards against comments() handing back an int, which
            # previously raised AttributeError on .replace.
            ir_nresrved2 = str(counts[0]).replace(',', '')
            ir_nresrved3 = str(counts[1]).replace(',', '')

        if ir_nresrved1 == []:
            ir_nresrved1 = 0
        else:
            # '\xa0' (non-breaking space) was previously written as the
            # literal text 'xa0', so it was never stripped.
            ir_nresrved1 = str(ir_nresrved1).replace('\xa0', '').replace(',', '').replace(' ', '')
            if '万' in ir_nresrved1:
                ir_nresrved1 = int(float(ir_nresrved1.replace('万', '')) * 10000)
        print(ir_nresrved1, ir_nresrved2, ir_nresrved3)
        return ir_nresrved1, ir_nresrved2, ir_nresrved3

    @staticmethod
    def _write_row(wsheet1, sums, num, list_shuju):
        """Write one post to worksheet row *sums* and log the progress."""
        print('+++list_shuju+++', list_shuju)
        wsheet1.write_row('A' + str(sums), list_shuju)
        print('--------------第', num + 1, '条数据!--------------------')
        print('--------------写入到第', sums, '行数据!--------------------')

    # -------------------------------------------------------- entry point
    def start(self, keyword, zong_sums, wsheet1):
        """Scrape one account and append its posts to the worksheet.

        :param keyword: the account path appended to https://www.facebook.com/
        :param zong_sums: worksheet row number for the first post written
        :param wsheet1: worksheet object providing ``write_row(cell, values)``
        :return: the next free worksheet row.  Previously ``None`` was
            returned when the scroll loop ended or was aborted by an error,
            which corrupted the caller's row counter.
        """
        # Counts "nothing new" pages; the crawl stops when it reaches 3.
        tc_sum = 1
        sums = zong_sums
        self.logger.debug('start: {}'.format(keyword))
        home_url = 'https://www.facebook.com/{}'.format(keyword)
        try:
            self.driver.get(home_url)
            sleep(random.randint(15, 20))
            # driver.get() returns None, so the old len(driver.get(...))
            # reloaded the page and then always raised TypeError.
            print('网页长度', len(self.driver.page_source))
        except Exception as exc:
            # Best effort, as before: carry on and let the parsing decide.
            self.logger.debug('open {} failed: {}'.format(home_url, exc))
        # Number of posts of this account already written.
        num = 0
        # Cursor used inside the absolute publish-time XPath.
        zz_num = 1
        sleep(random.randint(6, 8))
        # Click the profile's home tab; the layout differs between page types.
        # NOTE(review): find_element_by_xpath is deprecated in Selenium 4 and
        # removed from later releases -- confirm the pinned version.
        try:
            self.driver.find_element_by_xpath(self._HOME_TAB).click()
        except Exception:
            self.driver.find_element_by_xpath(self._HOME_TAB_ALT).click()

        # The timeline loads asynchronously, so keep scrolling and re-parsing.
        for i in range(1, 20000):
            print(f'*****************************第{i}页***************************')
            self.driver.execute_script('window.scrollBy(0,2200)')
            sleep(random.randint(10, 15))
            html = etree.HTML(self.driver.page_source)
            try:
                all_list = html.xpath(self._TIMELINE_POSTS)
                if len(all_list) == 0:
                    all_list1 = html.xpath(self._ALT_POSTS)
                    if len(all_list1) == 0:
                        # ---- scheme 2: neither container matched
                        all_list1 = html.xpath(self._TIMELINE_POSTS)
                        print("****进入第二方案***", len(all_list1))
                        if len(all_list1) == 1:
                            self.driver.execute_script('window.scrollBy(0,-1800)')
                            sleep(random.randint(5, 8))
                            print('上划一下', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                self._finish(num, keyword, sums)
                                return sums
                        elif len(all_list1) == 0:
                            print('有置顶帖')
                            all_list1 = html.xpath(self._ANY_POSTS)
                            if len(all_list1) == 0:
                                print('无发帖', tc_sum)
                                tc_sum += 1
                                if tc_sum == 3:
                                    self._finish(num, keyword, sums)
                                    return sums
                        elif len(all_list1) == 2:
                            print('无发帖', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                self._finish(num, keyword, sums)
                                return sums
                        elif len(all_list1) == 3:
                            print('发帖3', tc_sum)
                            tc_sum += 1
                            if tc_sum == 3:
                                self._finish(num, keyword, sums)
                                return sums
                        # The last three matches are left for a later pass.
                        da_list1 = all_list1[:-3]
                        for all_li in da_list1[num:]:
                            # Row buffer: nickname, account id, time, ...
                            list_shuju = [self._nickname(all_li)]
                            print('发布账号id', keyword)
                            list_shuju.append(keyword)
                            if zz_num == 1:
                                Time_list = all_li.xpath(
                                    f'//div[1]/div[{zz_num}]/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a//text()'
                                )
                            else:
                                # '//text()' added: without it the query
                                # returned elements, not strings, and the
                                # label handling below raised.
                                Time_list = all_li.xpath(
                                    f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a//text()'
                                )
                            Time_list = self._time_fallback(all_li, Time_list)
                            print(Time_list[-1])
                            stamp = self._published_at(Time_list)
                            if stamp >= self._CUTOFF:
                                list_shuju.append(time_turn(str(stamp)))
                            elif da_list1.index(all_li) != 0:
                                # An old post that is not first in the list:
                                # the month is finished.
                                self.driver.close()
                                return sums
                            # NOTE(review): an old first post is still
                            # written, without a time column -- confirm.
                            list_shuju.append(self._post_text(all_li, skip_download=False))
                            # Account home-page link.
                            list_shuju.append(home_url)
                            ir_nresrved1, ir_nresrved2, ir_nresrved3 = self._reaction_counts(
                                all_li, fill_missing=False)
                            ir_nresrved1 = int(str(ir_nresrved1))
                            ir_nresrved2 = int(str(ir_nresrved2))
                            ir_nresrved3 = int(str(ir_nresrved3))
                            print(type(ir_nresrved1), type(ir_nresrved2), type(ir_nresrved3))
                            list_shuju.append(ir_nresrved1)
                            list_shuju.append(ir_nresrved2)
                            list_shuju.append(ir_nresrved3)
                            self._write_row(wsheet1, sums, num, list_shuju)
                            num += 1
                            sums += 1
                    else:
                        # ---- scheme 3: the alternative container matched
                        print('all_list1', len(all_list1))
                        print("****进入第三方案***")
                        if len(all_list1) == 0:
                            tc_sum += 1
                            if tc_sum == 3:
                                break
                        da_list1 = all_list1[:-3]
                        for all_li in da_list1[num:]:
                            list_shuju = [self._nickname(all_li)]
                            print('发布账号id', keyword)
                            list_shuju.append(keyword)
                            Time_list = all_li.xpath(
                                f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span//a//text()'
                            )
                            Time_list = self._time_fallback(all_li, Time_list)
                            print(Time_list)
                            stamp = self._published_at(Time_list)
                            if stamp >= self._CUTOFF:
                                list_shuju.append(time_turn(str(stamp)))
                            elif da_list1.index(all_li) != 0:
                                self.driver.close()
                                return sums
                            list_shuju.append(self._post_text(all_li, skip_download=True))
                            list_shuju.append(home_url)
                            list_shuju.extend(self._reaction_counts(all_li, fill_missing=True))
                            self._write_row(wsheet1, sums, num, list_shuju)
                            num += 1
                            sums += 1
                            zz_num += 1
                else:
                    # ---- scheme 1: the profile-timeline container matched
                    print("进入第一方案!!")
                    print('all_list', len(all_list))
                    if len(all_list) == 1:
                        all_list = html.xpath(self._TIMELINE_POSTS_DEEP)
                    if len(all_list) < 4:
                        self.driver.close()
                        return sums
                    da_list1 = all_list[:-3]
                    for all_li in da_list1[num:]:
                        list_shuju = [self._nickname(all_li)]
                        print('发布账号id', keyword)
                        list_shuju.append(keyword)
                        Time_list = all_li.xpath(
                            f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span//a//text()'
                        )
                        Time_list = self._time_fallback(all_li, Time_list)
                        if Time_list == []:
                            print('不是本人发布,不采集')
                            continue
                        print(Time_list)
                        stamp = self._published_at(Time_list)
                        if stamp >= self._CUTOFF:
                            list_shuju.append(time_turn(str(stamp)))
                        elif da_list1.index(all_li) != 0:
                            self.driver.close()
                            return sums
                        list_shuju.append(self._post_text(all_li, skip_download=True))
                        list_shuju.append(home_url)
                        list_shuju.extend(self._reaction_counts(all_li, fill_missing=True))
                        self._write_row(wsheet1, sums, num, list_shuju)
                        num += 1
                        sums += 1
                        zz_num += 1
            except Exception as e:
                print(f'提取有误!:{e}')
                break
        self.driver.close()
        return sums
if __name__ == '__main__':
# 师资处--欧洲教师自愿者
# keyword_list = ['daniela.marieiragii.9', 'donna.rice.3511', 'echo.guo.334', 'graceweihair', 'hahahoho2299',
# 'jessica.chu.982292', 'linlin.jubujubu', 'mjeanas', 'ping.cui.5876', 'ppanan.liu',
# 'Rebecca.Niu.505', 'suku.bee', 'sunny.qiao.50', 'yang.mi.7758', 'yanli.ren.9',
# 'profile.php?id=100014285328747', 'celine.jiang.10', 'judyyye.lyu.3', 'jane.chinese.1',
# 'jing.pan.79', 'jinxiu.wang.754', 'liping.liu.370', 'min.fan.9', 'profile.php?id=100022188851437',
# 'profile.php?id=100029100170030', 'xiangyi.tanglan', 'li.guodong.50', 'han.qi.102',
# 'stefano.shi.509', 'ZhuGuizhi', 'yuge.fu', 'weizheng.soon.16',
# 'profile.php?id=100035897914640', 'lanyu.huang.12',
# 'feifei.guo.399', 'qian.yin.7921',
#
# 'profile.php?id=100000264646084',
# 'jingzhou.wang', 'profile.php?id=100001713489629', 'profile.php?id=100002055568754',
# 'profile.php?id=100048763005166', 'zixun',
# ]
# # 欧洲媒体
# keyword_list = ['uclm.es', 'bsuby', 'ikopole', 'instytutkonfucjusza', 'IstitutoConfucioDiMilano', 'icpp.fr',
# 'KonfuziusInstitutNuernbergErlangen', 'clasaconfucius.ovidius', 'InstitutoConfucioUMinho',
# 'ConfuciusInstituteMunich', 'KonfucijevInstitut.UNIZG', 'ConfuciusInstituteUBB',
# 'confuciusmaastricht', 'confuciusinstitute.galway', 'KonfuziusInstitutErfurt', 'kiunibl.org',
# 'IstitutoConfucioUnimc', 'konfuziusinstitutleipzig', 'civspu', 'ICdeLaReunion',
# 'BrookesConfuciusInstitute', 'konfucjuszUG', 'chinainmiskolc', 'InstitutConfuciusFinistere',
# 'InstitutoConfucioUC', 'www.ciut.edu.al', 'Instytut-Konfucjusza-UAM-w-Poznaniu-271081649596487',
# 'IstitutoConfuciodiRoma', 'konfuziusinstitut', 'CIatGlasgowUni', 'profile.php?id=100009360645905',
# 'pecsikonfuciuszintezet', 'ConfuciusMCR', 'Vilniaus.universiteto.Konfucijaus.institutas',
# 'IC.ULPGC', 'institutconfuciusmontpellier', 'confucioule', 'IstitutoConfucioUCSC', 'ConfucioUniTo',
# 'IstitutoConfuciodiPisa', 'bangorconfuciusinstitute', 'BCIUL', 'InstitutConfuciusdesPaysdelaLoire',
# 'KonfuciuvInstitut', 'konfucius.vsfs', 'KonfuziusInstitutFrankfurt',
# 'profile.php?id=100057530954564', 'Institut-Confucius-de-Bretagne-127321330622202',
# 'Школа-Конфуций-346940955894652', 'InstitutoConfucio.UP', 'confuciusinstituteaberdeen',
# 'Groningen-Confucius-Institute-108520622564327', 'institutulconfuciusbucuresti',
# 'NEOMACONFUCIUSINSTITUTEFORBUSINESS', 'institutkonfucij.mk',
# 'Institut-Confucius-de-Liège-154641651216924', 'ConfuciusCovUni', 'um.confucius',
# 'confucius.instituteucc.1', 'IstitutoConfucioUnipd', 'Confucioenna', 'istituto.confucio.napoli',
# 'Konfuciov-In%C5%A1tit%C3%BAt-v-Bratislave-1699960763610272',
# 'Konfuzius-Institut-an-der-Universit%C3%A4t-Freiburg-eV-641341015921819',
# 'szegedikonfuciuszintezet',
# 'profile.php?id=100054235160523','confucius.institute.si','istitutoconfuciofirenze',
# 'cilulv','konfuziusinstitutBonn','konfuziusinstituthamburg',
# 'Konfuzius-Institut-an-der-Universit%C3%A4t-Heidelberg-e-V-787435294604775',
# ]
# 亚非处--官方网站脸书数据
# keyword_list = [
# 'cu.edu.eg/?ref=search&__tn__=%2Cd%2CP-R&eid=ARDPb5hZJeNaG6s3irgLw6XRg9oSAgZ7Y95c2Eoss60dCHkqLdJ-No3tE4-HLDiQetoHUL86pksBAYyU',
# 'KZIUM/', 'AUFconfucius/', 'oshci', 'pjkyhyst', 'Confucius-PSU-Hatyai-637584140021699/', 'confucius.ismailia',
# 'Confucius-Institute-UMS-108726743828499/', 'kzxyqm', 'confuciu8/',
# 'yguinternationalcenter/?__tn__=%2Cd%2CP-R&eid=ARBMc5h6kj6s5TUqQYTvi1H5sXCzbjkgi9EUlC5DwKpCN-eKqMIuv0X-_35EhI6vFwdrQTV5P_wKQkEY',
# 'www.uae.ma/',
# '%E5%B7%B4%E5%BA%93%E5%9B%BD%E7%AB%8B%E5%A4%A7%E5%AD%A6%E5%AD%94%E5%AD%90%E5%AD%A6%E9%99%A2-100606824762987/',
# 'cisdus.suphanburi',
# '%E0%BA%AA%E0%BA%B0%E0%BA%96%E0%BA%B2%E0%BA%9A%E0%BA%B1%E0%BA%99%E0%BA%82%E0%BA%BB%E0%BA%87%E0%BA%88%E0%BA%B7-%E0%BA%A1%E0%BA%8A-538306683255260/?__tn__=%2Cd%2CP-R&eid=ARDue1VjE-JNr-BI59WFNFZEGI98QZ6w2eROHK7wAEezRyQwa9_ZlUixj7VnE-i4-VKB9aThhnAmyuth',
# 'CI.Assumption/?__tn__=%2Cd%2CP-R&eid=ARCAlblYZvxv0yhgz57IF-_bjBF0z21I0WynB44K5lFL1l2d7dIPwZTpfNhmvFhSOtHFdTN5Smfm3DZ7',
# 'ADU-n%C9%99zdind%C9%99-Konfutsi-%C4%B0nstitutu-154717901724865/', 'Segiconfucius/',
# 'Confucius-Institute-at-University-of-Cape-Coast-121849225138873', 'ConfuciusKU/', 'uob.institutconfucius',
# 'confucius.ainshams/',
# '%E5%B1%B1%E6%A2%A8%E5%AD%A6%E9%99%A2%E5%A4%A7%E5%AD%A6%E5%AD%94%E5%AD%90%E5%AD%A6%E9%99%A2-108255324183669',
# 'ateneoconfucius/?ref=br_rs', 'Confucius-Institute-at-University-of-Liberia-113582253664745/',
# 'ConfuciusInstituteUoM/?__tn__=%2Cd-%5C-R&eid=ARAGudpmXVzi8mQSkSJPCr1LwAVgGNljBR6bURogEvYaBac38TsI0DR19pLMhb-XhFCTsN2GqfyVtlxt',
# 'CIRAC.edu/', 'kungzitau/', 'umpmlcc',
# 'cinsubd/?__tn__=%2Cd%2CP-R&eid=ARBaELSZvuqq2d7Z2AJbsr1a4udB83RPhSJeJv7hC2q2NtVXGw6i96PkqFNAu7McSi9PhrH_xjU6N0wD',
# 'ConfuciusInstituteNUM', 'Institut-Confucius-de-lAcad%C3%A9mie-Diplomatique-Congolaise-100658168167864/',
# 'madakongyuan', 'TMCKZKT2006/', 'Institut-Confucius-de-lUniversit%C3%A9-dAntananarivo-2111644015571322',
# 'Mandarim-na-CV-319869092135514/', 'cscc66/',
# 'ConfuciusUniversityofNairobi/?__tn__=%2Cd%2CP-R&eid=ARAn05_8u1wFIgok3127lQg4l7vQPbbUHrKgE0i4LnJhr7xbfoAglKmXYpG5stBsKIvi3rsJdEXzCs1K',
# 'CIUOC/', 'haishangsilu/', 'ciuaf/', 'TAGConfucius/', 'ciub.botswana.9', 'UJCI1/']
# 志愿者处脸书数据
# keyword_list = ['kunthea.yan.10', 'lu.hong.94214', 'tan.jue.35', 'LowProfileLuxurious', '100029205681446', 'MR.yaoyang',
# 'cheng.kim.311', 'lingling.he.14', 'sebastian.paguyan', 'louis.wang.921', 'claire.zhao.902',
# 'chao.yang.3958', 'Fuyu-187988891256633', 'JooliaWang', 'htetnaing.htun.522', 'hai.chang.31',
# 'ricki.lei.9', 'anna.deng.735', 'lucy.luan.319', '100040814273360', 'monica.ch.5811', 'liu.h.hui.73',
# 'wiley.lee.7', 'mary.juanma.1', 'xia.han.9', 'ying.mnre', 'chunrong.zhou.3', 'cinsubd', 'tarquin.wang',
# '100023579753735', 'Fionawu3698', '100016457490430', 'wenxuan.zuo.73', 'huafang.shen',
# '100006981612725', 'zoe.wang1', 'li.xinyi.3', 'gloriatzen', 'cheli.sag', 'fei.sun.731135',
# 'xia.chang.9237']
# 美大地区孔院院长自媒体脸书数据
# keyword_list = ['ana.qiao', 'jun.du.334491', 'renyan.li.18', 'dong.hongle.9', 'sofia.mazheng', 'xiaofen.bi.7',
# 'zheng.fu.1213', 'humanhairextesion', 'UBA.Confucio', 'confucio.ufrgs',
# 'profile.php?id=100015822002088',
# 'CIAUCKLAND?__tn__=%2Cd%2CP-R&eid=ARDCG43dUTqfqXZqmYHR89HXZcqa2znznIS_sx6ZM42Z230XY4p8uGDolSQRZrktnGrFvwTyhZJeOKFI',
# 'ConfuciusInstituteWellington', 'UNSWCI',
# 'uonconfucius?ref=search&__tn__=%2Cd%2CP-R&eid=ARBgizLv2VBU7mRs0cdC9DsqVXDxgVuk8dytHCUX7F_oFdO6fixTkmcAxN9RhrdCAro1E81BaJVnDh9t']
# 亚非处--中方院长个人自媒体脸书数据
# keyword_list = ['tsogzolmaa.erdenebayar', 'rangsri.yang', 'miyya.zhang.5', 'yongkang.wang.184']
# 师资处--亚非(公派途径)脸书数据
# keyword_list = ['profile.php?id=100004607378501', 'profile.php?id=100007066143774',
# 'profile.php?id=100038107328752']
# 师资处--亚非(发声途径)脸书数据
# keyword_list = ['yang.jin.50309', 'rob.mar.71653', 'xu.ma.315', 'jiannisjiang', 'confuciusbuu',
# 'vanessa.chen.58760', 'tao.feng.948494', 'feifei.dai.1', 'shenghua.zhang.16', 'qingyi.chen.7',
# 'gongcuiyun.megan']
# 师资处--美大教师自媒体脸书数据
# keyword_list = ['lina.zhang.737001', 'lindawang3112', 'ChinaDoll4ever', 'SpooKPryme', 'mei.hu.5', 'ping.wang.1800',
# 'helen.lee.796774', 'zhili.chen.50', 'cheermyself3', 'xiao.hu.37853', 'victoria.yl.1',
# 'maureen.magiera', 'fabianaxm','profile.php?id=100003391386920','profile.php?id=4946315']
# Write the scraped posts to an Excel workbook.
keyword_list = ['xia.chang.9237']
# Create the workbook before the try block so the cleanup in `finally`
# never refers to an unbound name when construction itself fails.
wbook = xlsxwriter.Workbook('9327.xlsx')
try:
    # Create the worksheet and write the header row starting at cell A1.
    wsheet1 = wbook.add_worksheet('Sheet1')
    title = ['发布账号昵称', '发布账号ID', '发布时间', '内容', '主页链接', '点赞数', '评论数', '分享数']
    wsheet1.write_row('A1', title)
    # Next free worksheet row, shared by all accounts (row 1 is the header).
    zong_sums = 2
    for index, keyword in enumerate(keyword_list):
        print(
            f"---------------------------------------------索引:{index}------------------------------------------------")
        fb_server = FB()
        next_row = fb_server.start(keyword, zong_sums, wsheet1)
        # start() may hand back None; keep the last known row in that case
        # so later accounts are still written to valid cells.
        if next_row is not None:
            zong_sums = next_row
        sleep(random.randint(15, 30))
    print('写入完成!', zong_sums)
except KeyboardInterrupt:
    # Manual stop (Ctrl+C): rows collected so far are still saved below.
    print("人工终止!")
except Exception as exc:
    # Real failures used to be reported as a manual stop; name them instead.
    print(f'采集异常终止: {exc!r}')
finally:
    wbook.close()