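"""
Facebook keyword comment scraper (脸书关键字评论).

Date: 2021/6/30 13:14
Fields: posting account nickname, posting account ID, post time, content,
post link, account homepage link, like count, comment count, share count.
Runtime: Chrome; the Chrome browser and the Selenium driver versions must match.
You can connect this script to your own company database.
keyword_list = ['tsaiingwen']: put the government, person or political-party
accounts to collect here; 'tsaiingwen' is Tsai Ing-wen's account.
The configuration scripts are in the tools / settings folders.
"""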
import base64
import hashlib
import datetime
import locale
# -*- coding=utf-8 -*-
# 导入模块
import os
import random
import re
import pymysql
import sys
import time
from time import sleep
from selenium.webdriver import ActionChains
import pymongo
import xlsxwriter
from lxml import etree
import requests
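# Chinese locale support. NOTE(review): the 'chinese' locale name looks Windows-specific - confirm before running on another platform.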
locale.setlocale(locale.LC_CTYPE, 'chinese')
# 连接数据库
# database = pymongo.MongoClient('192.168.1.103', port=27017)
# db = database['facebook']
# kzxy_list = db['kzxy']
sys.path.append(os.path.dirname(__file__))
from tools.logger_server import logger
from tools.selenium_server import SeleniumServer
from tools.extract import Extract
from settings import REDIS, EPR_TIME
import paramiko
class FB:
    """Collect comments from Facebook posts returned by a keyword search.

    A Selenium-driven browser opens the post-search page for a keyword, walks
    the numbered result blocks, opens each post and writes one worksheet row
    per comment node: ``[commenter name, comment text, post link]``.

    All XPath expressions below are copied from the original script; they rely
    on generated CSS class names and are presumably tied to the page markup at
    the time the script was written (TODO confirm they still match).
    """

    # Anchor inside the ``index``-th result block. Its href is stored as the
    # row's link and the anchor is clicked to open the post.
    _POST_LINK_XPATH = '//div[{index}]/div/div/div/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/div/div[2]/span/span/span[2]/span/a'

    # Comment items that are present without expanding anything.
    _INLINE_COMMENTS_XPATH = "//div[@class='stjgntxs ni8dbmo4 l82x9zwi uo3d90p7 h905i5nu monazrh9']/div/div[@class='cwj9ozl2 tvmbv18p']/ul/li"
    # Comment lists that are read after the "comments" / "more" clicks.
    _EXPANDED_COMMENTS_XPATH = '//div[@class="cwj9ozl2 j83agx80 cbu4d94t buofh1pr du4w35lb ni8dbmo4 stjgntxs"]//ul'

    # Shared leading part of the class attribute of every clickable span.
    _SPAN_CLASS_PREFIX = (
        'd2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j fe6kdd0r '
        'mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 d3f4x2em iv3no6db '
        'jq4qci2q a3bd9o3v '
    )
    _MORE_SPAN = 'span[@class="' + _SPAN_CLASS_PREFIX + 'lrazzd5p m9osqain"]'
    _MORE_CONTAINER_DIV = 'div[@class="j83agx80 bkfpd7mw kvgmc6g5 wkznzc2l oygrvhab dhix69tm"]'

    # First attempt: "comments" span followed by the "more" span.
    _COMMENT_BUTTON_XPATH = '//span[@class="' + _SPAN_CLASS_PREFIX + 'b1v8xokw m9osqain"]'
    _MORE_BUTTON_XPATH = '//' + _MORE_SPAN
    # Fallback layout (the original script labels it "has video").
    _VIDEO_COMMENT_BUTTON_XPATH = '//span[@class="' + _SPAN_CLASS_PREFIX + 'b1v8xokw py34i1dx"]'
    _VIDEO_MORE_SCOPED_XPATH = (
        './/' + _MORE_CONTAINER_DIV
        + '//span[@class="j83agx80 fv0vnmcu hpfvmrgz"]//' + _MORE_SPAN
    )
    _VIDEO_MORE_XPATH = '//' + _MORE_CONTAINER_DIV + '//' + _MORE_SPAN

    # Both are evaluated relative to one comment node (note the leading dot).
    _AUTHOR_XPATH = './/span[@class="nc684nl6"]//span//text()'
    _TEXT_XPATH = ".//div[@class='ecm0bbzt e5nlhep0 a8c37x1j']/span/div//text()"

    _CLOSE_DIALOG_XPATH = '//div[@aria-label="關閉浮動視窗"]'

    def __init__(self):
        """Create the browser session and bind the shared project services."""
        self.selenium = SeleniumServer()
        self.driver = self.selenium.driver
        self.redis = REDIS
        self.logger = logger
        self.extract = Extract()
        self.epr_time = EPR_TIME

    def _find(self, xpath):
        """Return the first element matching *xpath*.

        Uses the generic ``find_element(by, value)`` call, which exists in
        both old and current Selenium releases, instead of the removed
        ``find_element_by_xpath`` shortcut.
        """
        return self.driver.find_element('xpath', xpath)

    def _click(self, xpath):
        """Click the first element matching *xpath*, then wait two seconds."""
        self._find(xpath).click()
        time.sleep(2)

    @staticmethod
    def _comment_row(node, post_url):
        """Build ``[author, text, post_url]`` for one comment node.

        The author lookup is relative to *node*; the original absolute path
        returned the first commenter of the whole document for every row.
        A node without an author span yields an empty name instead of raising.
        """
        names = node.xpath(FB._AUTHOR_XPATH)
        author = names[0] if names else ''
        text = ''.join(node.xpath(FB._TEXT_XPATH))
        return [author, text, post_url]

    def _load_comment_nodes(self):
        """Return the comment nodes of the post that is currently open.

        Inline comments are used when present. Otherwise the "comments" and
        "more" controls are clicked (trying the alternative layouts in turn)
        and the page is parsed again, so the nodes reflect the expanded DOM.

        Raises whatever the last click attempt raises when no layout matches;
        the caller treats that as "skip this post".
        """
        tree = etree.HTML(self.driver.page_source)
        try:
            print('没有视频')
            nodes = tree.xpath(self._INLINE_COMMENTS_XPATH)
            print('all_list1', len(nodes))
            if nodes:
                return nodes
            self._click(self._COMMENT_BUTTON_XPATH)
            self._click(self._MORE_BUTTON_XPATH)
        except Exception:
            print('有视频')
            try:
                print('点击评论1')
                self._click(self._VIDEO_COMMENT_BUTTON_XPATH)
                print('点击评论')
                self._click(self._VIDEO_MORE_SCOPED_XPATH)
                print('查看更多')
            except Exception:
                print('没有点击评论')
                self._click(self._VIDEO_MORE_XPATH)
                print('查看更多')
        # The clicks changed the page, so the earlier tree is stale: parse again.
        tree = etree.HTML(self.driver.page_source)
        nodes = tree.xpath(self._EXPANDED_COMMENTS_XPATH)
        print('all_list1', len(nodes))
        return nodes

    def _open_post(self, index):
        """Hover over and click the link of result block *index*.

        Returns the link's href as read from the page source after hovering.
        Raises if the link is missing or cannot be clicked; in that case the
        browser is still on the result list.
        """
        link_xpath = self._POST_LINK_XPATH.format(index=index)
        link = self._find(link_xpath)
        ActionChains(self.driver).move_to_element(link).perform()
        time.sleep(3)
        hrefs = etree.HTML(self.driver.page_source).xpath(link_xpath + '//@href')
        post_url = hrefs[0]
        self._find(link_xpath).click()
        time.sleep(5)
        return post_url

    def _leave_post(self):
        """Go back to the result list and close a floating window if one is shown."""
        self.driver.back()
        try:
            self._click(self._CLOSE_DIALOG_XPATH)
        except Exception:
            # Best effort: most of the time there is no floating window to close.
            pass

    def start(self, keyword, sums, wsheet1):
        """Entry point: scrape the comments of the posts found for *keyword*.

        :param keyword: search keyword, inserted into the post-search URL.
        :param sums: 1-based worksheet row at which the first comment is written.
        :param wsheet1: worksheet object providing ``write_row(cell, values)``.
        :return: the next free worksheet row (``sums`` plus the rows written).
            The original returned nothing although the caller stores the result.
        """
        self.logger.debug('start: {}'.format(keyword))
        print('keyword', keyword)
        home_url = 'https://www.facebook.com/search/posts/?q={}'.format(keyword)
        print('home_url', home_url)
        count = 1  # running number printed before each written row
        try:
            try:
                self.driver.get(home_url)
                self.driver.maximize_window()
                sleep(random.randint(3, 5))
            except Exception as exc:
                # Kept best-effort as before, but no longer silent.
                self.logger.debug('open search page failed: {}'.format(exc))

            for index in range(1, 51):
                print(f'*****************************第{index}页***************************')
                self.driver.execute_script('window.scrollBy(0,2200)')
                sleep(random.randint(3, 5))
                if index == 3:
                    # Block 3 is always skipped, as in the original script.
                    # NOTE(review): the reason is not visible here - confirm.
                    continue

                print('方法1')
                try:
                    post_url = self._open_post(index)
                except Exception as exc:
                    # Nothing was opened, so stay on the result list. Going
                    # back here (as the original did) would leave the search page.
                    print('方法2')
                    self.logger.debug('result {} not opened: {}'.format(index, exc))
                    continue

                try:
                    for node in self._load_comment_nodes():
                        record = self._comment_row(node, post_url)
                        print(count)
                        print(record)
                        wsheet1.write_row('A' + str(sums), record)
                        sums += 1
                        count += 1
                except Exception as exc:
                    print('方法2')
                    self.logger.debug('result {} not scraped: {}'.format(index, exc))
                self._leave_post()
        finally:
            # Release the browser window even when the run is interrupted.
            self.driver.close()
        return sums
if __name__ == '__main__':
    # Script entry point: scrape every keyword into one Excel workbook.
    # Each entry is used as the search keyword passed to FB.start().
    keyword_list = ['李明哲']

    # Created before the try block so the cleanup below can always reach it.
    wbook = xlsxwriter.Workbook('李明哲.xlsx')
    try:
        wsheet1 = wbook.add_worksheet('Sheet1')
        title = ['发布账号昵称', '评论内容', '抓取链接']
        wsheet1.write_row('A1', title)  # header row starts at cell A1

        sums = 2  # next free worksheet row; row 1 holds the header
        zong_sums = None
        for index, keyword in enumerate(keyword_list):
            print(
                f"---------------------------------------------索引:{index}------------------------------------------------")
            fb_server = FB()
            zong_sums = fb_server.start(keyword, sums, wsheet1)
            if zong_sums is not None:
                # Continue below the rows already written so that a second
                # keyword does not overwrite the first one's rows.
                sums = zong_sums
            sleep(random.randint(15, 30))
        print('写入完成!', zong_sums)
    except KeyboardInterrupt:
        print("人工终止!")
    except Exception:
        # Still exit quietly as before, but show what actually went wrong
        # instead of presenting every failure as a manual stop only.
        import traceback
        traceback.print_exc()
        print("人工终止!")
    finally:
        # Always close the workbook so the rows written so far are saved.
        wbook.close()