# 社交媒体关键字查询
"""
author:张鑫
date:2022/02/15 13:14
发布账号昵称、发布账号ID、发布时间、内容、发帖链接、账号主页链接、点赞数、评论数、分享数
运行环境 Chrome浏览器 和 selenium驱动版本一致即可
可自行接入公司数据库
keyword_list = ['tsaiingwen'] 放入要采集的政府、人物、政党账号;tsaiingwen 为蔡英文账号
"""
import locale
# -*- coding=utf-8 -*-
# 导入模块
import os
import random
import re
import sys
import time
from time import sleep
from selenium.webdriver import ActionChains
import pymongo
import xlsxwriter
from lxml import etree
def _parse_count(text):
    """Convert one scraped count such as ``'12'``, ``'1,234'`` or ``'1.2万'`` to an int.

    Empty or non-numeric text yields 0 so that a single odd label cannot
    abort the caller.
    """
    text = text.strip().replace(',', '')
    if not text:
        return 0
    multiplier = 1
    if '万' in text:
        # 万 = 10,000
        multiplier = 10000
        text = text.replace('万', '')
    try:
        # round() rather than int(): int() truncates, so a product that lands
        # just below a whole number would lose one.
        return round(float(text) * multiplier)
    except (ValueError, OverflowError):
        return 0


def comments(comments_data):
    """Extract the comment count and the share count from engagement text.

    Args:
        comments_data: Iterable of text fragments; they are concatenated
            before parsing.  Traditional-Chinese wording (回應 / 個 / 轉發 /
            萬) is first rewritten to the Simplified forms that the patterns
            below search for.

    Returns:
        tuple[int, int]: ``(comment_count, share_count)``.  A count that is
        missing from the text is reported as 0.  Both values are always
        ints; previously the type varied between int and str depending on
        the input.
    """
    text = ''.join(comments_data)
    if not text:
        return 0, 0

    text = (
        text.replace('回應', '评论')
        .replace('個', '条')
        .replace('轉發', '分享')
        .replace('萬', '万')
    )

    comment_text = ''.join(re.findall('(.*?)条评论', text))

    # Prefer the share count that follows the comment count; fall back to a
    # bare "N次分享" when the text contains no comment count at all.
    share_text = ''.join(re.findall('条评论(.*?)次分享', text))
    if share_text == '':
        share_text = ''.join(re.findall('(.*?)次分享', text))

    return _parse_count(comment_text), _parse_count(share_text)
# Chinese locale support.
# NOTE(review): 'chinese' looks like a Windows-style locale name; setlocale may
# raise locale.Error on other platforms — confirm the target OS.
locale.setlocale(locale.LC_CTYPE, 'chinese')
# Database connection: a MongoDB client for a fixed LAN address, created as a
# side effect of importing/running this module.
database = pymongo.MongoClient('192.168.1.103', port=27017)
db = database['facebook']
# Handle to the 'kzxy' collection; not referenced in the visible part of this file.
kzxy_list = db['kzxy']
# Add this file's directory to the import path — presumably so that the
# project-local `tools` / `settings` imports that follow resolve when the
# script is launched from another working directory.
sys.path.append(os.path.dirname(__file__))
from tools.logger_server import logger
from tools.selenium_server import SeleniumServer
from tools.extract import Extract
from settings import REDIS, EPR_TIME
import datetime
import re
import time
def today_start():
    """Return the Unix timestamp (int seconds) of today's local midnight."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))
def time_turn(timenum):
    """Format a Unix timestamp as a local ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        timenum: The timestamp as a string of 1-10 digits.  An int is also
            accepted and converted to its decimal string first.  The
            sentinel ``'0'`` is passed through untouched.

    Returns:
        The formatted local time, ``'0'`` for the sentinel, or ``None``
        (after printing a hint) when the input is not a 1-10 digit number.
    """
    timenum = str(timenum)
    if timenum == '0':
        return '0'
    if 0 < len(timenum) < 11 and timenum.isdigit():
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(timenum)))
    print('请输入11位以内的数字')
    return None
# Relative-age units (Simplified and Traditional spellings) with their length
# in seconds, tried in this order.
_RELATIVE_UNITS = (
    ('小时', 3600),
    ('小時', 3600),
    ('分钟', 60),
    ('分鐘', 60),
    ('天', 24 * 3600),
)

# "[YYYY年]M月D日[ HH:MM]" at the start of the string; year and clock optional.
_ABSOLUTE_DATE = re.compile(
    r'\s*(?:(\d{4})年)?\s*(\d{1,2})月\s*(\d{1,2})日\s*(?:(\d{1,2}):(\d{1,2}))?'
)


def time_turns(time1, year=2022):
    """Convert a scraped Chinese time label to a Unix timestamp.

    Recognised forms, checked in this order:

    * ``昨天 HH:MM`` (optionally followed by ``發佈``) -- yesterday at that time
    * ``昨天`` -- 24 hours ago
    * ``刚刚`` -- now
    * ``N天前`` -- N days ago
    * ``'0'`` -- sentinel, returned unchanged as the string ``'0'``
    * ``N小时`` / ``N小時`` / ``N分钟`` / ``N分鐘`` / ``N天`` -- that long ago.
      As in the original code, a bare integer string is read as hours.
    * ``M月D日`` with optional ``HH:MM`` -- that date in ``year``
    * ``YYYY年M月D日`` with optional ``HH:MM`` -- that date, if ``YYYY == year``

    Args:
        time1: The time label text.
        year: Year assumed for labels without one, and the only explicit
            year that is accepted.  Defaults to 2022, the value that was
            previously hard-coded.

    Returns:
        int seconds since the epoch; the string ``'0'`` for the ``'0'``
        sentinel; or int ``0`` (after printing a notice) when the label is
        from another year or is not recognised.

    Raises:
        ValueError: a recognised form carries a non-numeric or impossible
            value (e.g. ``2月30日``).
        IndexError: a ``昨天…`` label has no ``HH:MM`` part.
    """
    if time1.startswith('昨') and len(time1) > 2:
        clock = time1.split('發佈')[0].split('天')[-1].split(':')
        return (today_start() - 24 * 3600) + int(clock[0]) * 3600 + int(clock[1]) * 60
    if time1 == '昨天':
        return int(time.time()) - 24 * 3600
    if time1 == '刚刚':
        return int(time.time())
    if '天前' in time1:
        return int(time.time()) - int(time1.split('天')[0]) * 3600 * 24
    if time1 == '0':
        return time1

    # Relative ages: take whatever precedes the unit and see if it is a number.
    for unit, seconds in _RELATIVE_UNITS:
        try:
            amount = int(time1.split(unit)[0])
        except ValueError:
            continue
        return int(time.time() - amount * seconds)

    # Absolute dates.
    match = _ABSOLUTE_DATE.match(time1)
    if match is None or (match.group(1) and int(match.group(1)) != year):
        print('不是今年的数据,不采集')
        return 0

    moment = datetime.datetime(
        year,
        int(match.group(2)),
        int(match.group(3)),
        int(match.group(4) or 0),
        int(match.group(5) or 0),
    )
    return int(time.mktime(moment.timetuple()))
class FB:
    """Selenium-driven scraper for Facebook post-search results."""

    def __init__(self):
        self.selenium = SeleniumServer()
        self.driver = self.selenium.driver
        self.redis = REDIS
        self.logger = logger
        self.extract = Extract()
        self.epr_time = EPR_TIME

    @staticmethod
    def _join_fragments(fragments, skipped):
        """Join text fragments into one newline-terminated string.

        Fragments equal to any entry of ``skipped`` are dropped; ``#`` is
        removed from the rest and ``'\\n'`` is appended to each.
        """
        return ''.join(
            fragment.replace('#', '') + '\n'
            for fragment in fragments
            if fragment not in skipped
        )

    def start(self, keyword, zong_sums, wsheet1):
        """Search Facebook posts for ``keyword`` and write matches to the sheet.

        The results page is scrolled up to 100 times.  After each scroll the
        page source is re-parsed and the not-yet-handled post cards are
        examined; a card is written as one row (keyword, content, post
        link, time) when it is at most 15 days old and its content contains
        both '新疆' and '洪秀柱'.  The first card that yields no account name
        or fails that filter ends the pass and triggers the next scroll.

        Args:
            keyword: Search keyword; also written as the first cell of a row.
            zong_sums: 1-based worksheet row at which to write the next record.
            wsheet1: Worksheet object exposing ``write_row(cell, values)``.

        Returns:
            The next free worksheet row (``zong_sums`` plus rows written).

        Raises:
            IndexError: a card has no post link or time text, or the post
                list shrank after expanding "查看更多".
            Exception: whatever the driver raises while locating elements.
        """
        sums = zong_sums
        self.logger.debug('start: {}'.format(keyword))
        home_url = 'https://www.facebook.com/search/posts?q={}&sde=AbpC92W4Tk5CSRWpvFucDAgwoQ2gfVVMoruVxGcnOdkNfijX1G4VeLHZZ-EnQ0efWg6zxEoc9fii4P4dV05zM55j8iXbnj0pKLaEKctnRPAS4US5p7M6k4Fp-eLfhveWhJuF0cv4QOeSInYjSHv4dVDec8Y9-fp6-3LG3rHVWJ2CEBIpBbHttlepqOq0ONMyH0A9S8P4Z0pbFDRloMORLtxp-_Ol2JykorAkLsdNd4NJcPvaIII4N8TmBAhHZ414FArZ48w6A3mWL5lGM_nNlKzVHaSYBU2DQPVBcRHq1Ni2JI7I-Fq6RtzHS4gRuCLM6z-P5weqvUi4RtQP-pGHiwt1huE0xDOGeDTTjWesxsNYmJsVRYQJ77vE3Xq2-3N0D2w&filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0%3D'.format(keyword)

        # Number of cards already written; also the index of the next card.
        num = 0
        # 1-based cursor used inside the XPaths to address the current card.
        zz_num = 1

        try:
            try:
                self.driver.get(home_url)
                sleep(random.randint(6, 8))
            except Exception as exc:
                # Best effort, as before: carry on with whatever page is
                # loaded, but leave a trace instead of failing silently.
                self.logger.debug('open search page failed: {!r}'.format(exc))

            # The feed loads asynchronously, so keep scrolling to pull in posts.
            for i in range(1, 101):
                print(f'*****************************第{i}页***************************')
                self.driver.execute_script('window.scrollBy(0,2200)')
                sleep(random.randint(3, 5))
                html = etree.HTML(self.driver.page_source)
                all_list = html.xpath('//div[@class="du4w35lb l9j0dhe7"]')
                print(len(all_list))
                # The last three matches are left out whenever more than three exist.
                da_list1 = all_list[:-3] if len(all_list) > 3 else all_list

                for all_li in da_list1[num:]:
                    list_shuju = [keyword]

                    # Every XPath below begins with '//', i.e. it is evaluated
                    # from the document root; zz_num, not all_li, selects the card.
                    card = f'//div[{zz_num}]/div/div/div/div/div/div/div/div/div/div/div[2]/div'
                    header = card + '/div[2]/div/div[2]/div'
                    stamp = header + '/div[2]/span/span'
                    body = card + '/div[3]/div[1]/div/div/div/span'

                    # Account display name (two known layouts).
                    zhnc = all_li.xpath(
                        header + '/div[1]/span/h3/span/strong[1]/span/a/span/span//text()')
                    if not zhnc:
                        zhnc = all_li.xpath(
                            header + '/div[1]/span/h3/span/a/strong/span//text()')
                    if not zhnc:
                        zz_num += 1
                        break
                    zhnc = zhnc[0]
                    print(f'发布帐号昵称:{zhnc}')

                    biaoqian = all_li.xpath(
                        './/div[@class="j83agx80 cbu4d94t ew0dbk1b irj2b8pg"]/div/span/h3/span[2]/span/div/@aria-label'
                    )
                    print('标签', biaoqian)

                    content_data = all_li.xpath(body + '//text()')
                    if not content_data:
                        content_data = ['暂无文字内容']
                    texts = self._join_fragments(content_data, ('.', '=', 'ownloa'))
                    print('内容数据', texts)
                    print('zz_num', zz_num)

                    # NOTE(review): find_element_by_xpath is no longer available
                    # in recent Selenium 4 releases; kept because the Selenium
                    # version in use is not visible here.
                    if '查看更多' in texts:
                        print('查看更多')
                        # Hover over the timestamp link first.
                        list_towurl = self.driver.find_element_by_xpath(stamp + '/span[2]/span/a')
                        ActionChains(self.driver).move_to_element(list_towurl).perform()
                        time.sleep(5)
                        print('二级页面')
                        # Hover over, then click, the "see more" button.
                        a = self.driver.find_element_by_xpath(body + '//div[@role="button"]')
                        ActionChains(self.driver).move_to_element(a).perform()
                        print('查看更多位置')
                        time.sleep(5)
                        print("ff1")
                        a.click()
                        print("ff2")
                        time.sleep(3)
                        windows = self.driver.window_handles
                        self.driver.switch_to.window(windows[-1])
                        print("ff3")
                        # Re-parse the page to read the expanded text.
                        html = etree.HTML(self.driver.page_source)
                        print("f1")
                        all_list1 = html.xpath('//div[@class="du4w35lb l9j0dhe7"]')
                        all_li = all_list1[num]
                        print("f2")
                        texts = all_li.xpath(body + '//text()')
                        neirong = self._join_fragments(texts, ('.',))
                        print('内容数据', zz_num, ':', neirong)
                        list_shuju.append(neirong)
                        two_url = html.xpath(stamp + '/span[2]/span/a//@href')
                        if not two_url:
                            two_url = html.xpath(stamp + '/span[3]/span/a//@href')
                        two_url = two_url[0]
                        print('二级页面标签:', two_url)
                        list_shuju.append(two_url)
                    else:
                        print('no')
                        # Hover over the timestamp link, then read its href
                        # from a fresh parse of the page.
                        list_towurl = self.driver.find_element_by_xpath(stamp + '/span[2]/span/a')
                        ActionChains(self.driver).move_to_element(list_towurl).perform()
                        time.sleep(5)
                        html = etree.HTML(self.driver.page_source)
                        two_url = html.xpath(stamp + '/span[2]/span/a//@href')
                        if not two_url:
                            two_url = html.xpath(stamp + '/span[3]/span/a//@href')
                        print('详情页链接')
                        two_url = two_url[0]
                        neirong = self._join_fragments(texts.split('\n'), ('.', '=', 'ownloa'))
                        print('内容数据', zz_num, ':', neirong)
                        list_shuju.append(neirong)
                        print('二级页面标签:', two_url)
                        list_shuju.append(two_url)

                    # Publication time label (two known layouts).
                    Time_list = all_li.xpath(stamp + '/span[2]/span//text()')
                    if Time_list == [] or Time_list == ['\xa0', ' · ']:
                        Time_list = all_li.xpath(stamp + '/span[3]/span//text()')
                    print(Time_list)
                    time1 = [piece for piece in Time_list if piece != '=']
                    time1 = time1[0].replace('2022年', '')
                    time1 = int(time_turns(time1))
                    print("********发布时间*****", time_turn(str(time1)))
                    time2 = int(time.time())
                    recent = time2 - time1 <= 15 * 24 * 3600
                    print(recent)
                    print('新疆' in neirong and '洪秀柱' in neirong)
                    if not (recent and '新疆' in neirong and '洪秀柱' in neirong):
                        print('不符合抓取要求')
                        zz_num += 1
                        break
                    list_shuju.append(time_turn(str(time1)))

                    print('+++list_shuju+++', list_shuju)
                    wsheet1.write_row('A' + str(sums), list_shuju)
                    print('--------------第', num + 1, '条数据!--------------------')
                    print('--------------写入到第', sums, '行数据!--------------------')
                    num += 1
                    sums += 1
                    zz_num += 1
        finally:
            # Close the browser window even when scraping fails part-way.
            self.driver.close()

        # The caller continues writing from this row for the next keyword.
        return sums
if __name__ == '__main__':
    # Scrape every keyword and write the results to one Excel workbook.
    keyword_list = ['洪秀柱新疆']
    # keyword_list=['洪秀柱新疆']

    # Created outside the try block so the cleanup below never touches an
    # unbound name when workbook creation itself fails.
    wbook = xlsxwriter.Workbook('洪秀柱新疆.xlsx')
    try:
        # Create the worksheet and write the header row at A1.
        wsheet1 = wbook.add_worksheet('Sheet1')
        title = ['发布账号', '内容', '详情页连接', '时间']
        wsheet1.write_row('A1', title)
        # Next worksheet row to write, shared across all keywords
        # (row 1 holds the header).
        zong_sums = 2
        for index, keyword in enumerate(keyword_list):
            print(
                f"---------------------------------------------索引:{index}------------------------------------------------")
            fb_server = FB()
            next_row = fb_server.start(keyword, zong_sums, wsheet1)
            # Keep the previous counter if start() reports nothing.
            if next_row is not None:
                zong_sums = next_row
            sleep(random.randint(15, 30))
        print('写入完成!', zong_sums)
    except KeyboardInterrupt:
        print("人工终止!")
    except Exception as exc:
        # Same non-crashing exit as before, but show what went wrong rather
        # than reporting every failure as a manual stop.
        print("人工终止!", repr(exc))
    finally:
        # Always flush the rows collected so far to disk.
        wbook.close()