# tiktok

'''
常见问题
1.网速问题,有时候加载不出页面,需要盯着,下滑有时候也没数据
2.滑动验证码

'''
import datetime
import re
import time


def time_turn(timenum):
    """Format a Unix timestamp (seconds) as a local-time string.

    Args:
        timenum: Epoch seconds, given either as an ``int`` or as a string of
            at most 10 decimal digits.

    Returns:
        The local time formatted as ``YYYY-MM-DD HH:MM:SS``, or ``None``
        (after printing a hint) when the input is empty, longer than 10
        digits, or not purely numeric.
    """
    # Normalise to text first so integer timestamps are accepted as well as
    # digit strings (len() on an int would raise TypeError).
    digits = str(timenum)
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() then refuses to convert.
    if not (0 < len(digits) < 11 and digits.isdecimal()):
        print('请输入11位以内的数字')
        return None
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(digits)))


def today_start():
    """Return the Unix timestamp (whole seconds) of today's local midnight."""
    # A date's time tuple has hour, minute and second set to zero, so
    # mktime() maps it to 00:00:00 local time of the current day.
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))


_SECONDS_PER_DAY = 24 * 3600

# Unit markers tried in order; each has a simplified and (where it differs)
# a traditional spelling.
_RELATIVE_UNITS = (
    ('小时', 3600),
    ('小時', 3600),
    ('分钟', 60),
    ('分鐘', 60),
    ('天', _SECONDS_PER_DAY),
)

# "[YYYY年]M月D日[ HH:MM]" with one- or two-digit month, day, hour and minute.
_ABSOLUTE_DATE = re.compile(
    r'(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日?\s*(?:(\d{1,2}):(\d{1,2}))?'
)


def _relative_seconds(label):
    """Return the age in seconds encoded by a label like '3小时前', else None."""
    for unit, seconds in _RELATIVE_UNITS:
        try:
            # Everything before the unit marker must be an integer count.
            # A label that is only a number parses on the first unit, so a
            # bare number is read as a count of hours.
            return int(label.split(unit)[0]) * seconds
        except ValueError:
            continue
    return None


def _absolute_timestamp(label, year):
    """Return the epoch seconds for an 'M月D日 HH:MM' style label, else None."""
    match = _ABSOLUTE_DATE.fullmatch(label)
    if match is None:
        return None
    year_text, month, day, hour, minute = match.groups()
    # A label that names a different year is not collected.
    if year_text is not None and int(year_text) != year:
        return None
    try:
        moment = datetime.datetime(
            year, int(month), int(day), int(hour or 0), int(minute or 0)
        )
    except ValueError:
        # Impossible calendar date or clock time (e.g. 2月30日, 25:00).
        return None
    return int(time.mktime(moment.timetuple()))


def time_turns(time1, year=None):
    """Convert a Chinese publish-time label to a Unix timestamp in seconds.

    Recognised forms (simplified and traditional spellings):
      * ``昨天 HH:MM``  - that clock time on the previous calendar day
      * ``昨天``        - 24 hours ago
      * ``今天更新`` / ``刚刚`` / ``剛剛`` - now
      * ``N小时前`` / ``N分钟前`` / ``N天前`` - N hours / minutes / days ago
      * ``M月D日[ HH:MM]`` - a date in ``year``
      * ``YYYY年M月D日[ HH:MM]`` - accepted only when YYYY equals ``year``

    A trailing "published" suffix (``发布`` / ``發布`` / ``發佈``) is ignored.

    Args:
        time1: The label text; converted with ``str()`` before parsing.
        year: Year assumed for labels without one, and the only year
            accepted for labels that carry one. Defaults to the current
            local year.

    Returns:
        Epoch seconds as an ``int``. Labels that belong to another year or
        cannot be parsed yield ``0`` after a message is printed.
    """
    now = int(time.time())
    if year is None:
        year = datetime.date.today().year

    label = str(time1)
    for suffix in ('发布', '發布', '發佈'):
        label = label.replace(suffix, '')
    label = label.strip()

    # "昨天 HH:MM": clock time relative to yesterday's midnight.
    if label.startswith('昨') and len(label) > 2:
        clock = label.split('天')[-1]
        try:
            hour, minute = (int(part) for part in clock.split(':')[:2])
        except ValueError:
            # No usable clock time after "yesterday"; fall back to 24h ago.
            return now - _SECONDS_PER_DAY
        return today_start() - _SECONDS_PER_DAY + hour * 3600 + minute * 60

    if label == '昨天':
        return now - _SECONDS_PER_DAY
    if label in ('今天更新', '刚刚', '剛剛'):
        return now

    age = _relative_seconds(label)
    if age is not None:
        return now - age

    stamp = _absolute_timestamp(label, year)
    if stamp is not None:
        return stamp

    print('不是今年的数据,不采集')
    return 0


import random
import pandas as pd
from selenium import webdriver
from lxml import etree
import time

url = 'https://www.tiktok.com/@xiaoqiww'

# XPath expressions used on the profile page and on each video page.
# NOTE(review): the class names look auto-generated and the absolute paths
# depend on the page layout - expect to update them when the site changes.
VIDEO_LINK_XPATH = '//div[@class="tiktok-yz6ijl-DivWrapper e1cg0wnj1"]//a//@href'
NAME_XPATH = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[1]/div[1]/a[2]/h3//text()'
PUBLISH_TIME_XPATH = '//div[@data-e2e="recommend-list-item-container"][1]//a[@class="tiktok-1lqhxf7-StyledAuthorAnchor emt6k1z1"]//text()'
CONTENT_XPATH = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[1]/div[2]//text()'
LOVE_XPATH = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[2]/div[2]/button[1]/strong//text()'
COMMENT_XPATH = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[2]/div[2]/button[2]/strong//text()'
SHARE_XPATH = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div/div[2]/div[2]/button[3]/strong//text()'

# Suffixes used by abbreviated counters, e.g. "12.3K".
COUNT_MULTIPLIERS = {
    'K': 1_000,
    'M': 1_000_000,
    'B': 1_000_000_000,
    '万': 10_000,
    '亿': 100_000_000,
}


def parse_count(text):
    """Convert a counter label such as '1234', '12.3K' or '1.2M' to an int.

    Labels that carry no number (for example the '分享' placeholder shown
    on the share button) yield 0.
    """
    label = str(text).strip().replace(',', '')
    factor = 1
    if label and label[-1].upper() in COUNT_MULTIPLIERS:
        factor = COUNT_MULTIPLIERS[label[-1].upper()]
        label = label[:-1]
    try:
        # round() guards against float artefacts such as 2299.9999999.
        return int(round(float(label) * factor))
    except (ValueError, OverflowError):
        return 0


def scrape_profile(driver, profile_url):
    """Collect per-video details for every video linked from a profile page.

    Args:
        driver: A Selenium WebDriver instance used for all page loads.
        profile_url: URL of the profile page to start from.

    Returns:
        A dict mapping column names to equally long lists, one entry per
        successfully parsed video.
    """
    driver.get(url=profile_url)
    time.sleep(5)
    # Scroll a couple of times so more videos are loaded into the page.
    for page in range(1, 3):
        time.sleep(random.randint(3, 5))
        print(f'********************第{page}页******************')
        driver.execute_script('window.scrollBy(0,2200)')

    profile_tree = etree.HTML(driver.page_source)
    video_links = profile_tree.xpath(VIDEO_LINK_XPATH)
    print(len(video_links))

    second_urls = []
    names = []
    publishtimes = []
    contents = []
    loves = []
    comments = []
    shares = []
    for link in video_links:
        time.sleep(5)
        print(link)
        driver.get(link)
        detail_tree = etree.HTML(driver.page_source)
        try:
            # Author name
            name = detail_tree.xpath(NAME_XPATH)[0]
            # Publish time label, normalised to "YYYY-MM-DD HH:MM:SS"
            publish_label = detail_tree.xpath(PUBLISH_TIME_XPATH)[-1]
            # Caption text
            content = detail_tree.xpath(CONTENT_XPATH)[0]
            # Like / comment / share counters
            love = parse_count(detail_tree.xpath(LOVE_XPATH)[0])
            comment = parse_count(detail_tree.xpath(COMMENT_XPATH)[0])
            share = parse_count(detail_tree.xpath(SHARE_XPATH)[0])
        except IndexError:
            # An expected element is missing (slow load, captcha, changed
            # layout). Skip this video instead of losing the whole run.
            print(f'skipped, page did not contain the expected elements: {link}')
            continue
        publishtime = time_turn(str(time_turns(publish_label)))

        # Append only after every field parsed so the columns stay aligned.
        second_urls.append(link)
        names.append(name)
        publishtimes.append(publishtime)
        contents.append(content)
        loves.append(love)
        comments.append(comment)
        shares.append(share)

    return {
        '详情页链接': second_urls,
        '姓名': names,
        '发布时间': publishtimes,
        '内容': contents,
        '点赞': loves,
        '评论': comments,
        '转发': shares,
    }


def main():
    """Scrape the configured profile and save the result to tiktok.xlsx."""
    driver = webdriver.Chrome()
    try:
        data = scrape_profile(driver, url)
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()
    print(data)
    pd.DataFrame(data=data).to_excel('tiktok.xlsx')
    print('保存成功')


if __name__ == '__main__':
    main()
# posted @ 2022-06-16 16:19  布都御魂  阅读(304)  评论(1)  编辑  收藏  举报