from selenium import webdriver
import xlsxwriter as xw
from time import sleep
import time
import random
import requests
from lxml import html
import pymysql
import datetime
import re
import time
def time_turn(timenum):
    """Format a string of epoch seconds as local ``YYYY-MM-DD HH:MM:SS``.

    ``timenum`` must be a non-empty, all-digit string of at most ten
    characters.  Any other value prints a hint and yields ``None``.
    """
    acceptable = 0 < len(timenum) < 11 and timenum.isdigit()
    if not acceptable:
        print('请输入11位以内的数字')
        return None
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(timenum)))
def today_start():
    """Return the epoch timestamp (whole seconds) of today's local midnight."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))
def time_turns(time1, year=None):
    """Convert a scraped Chinese "posted at" label to epoch seconds.

    Recognised forms (simplified or traditional characters):

    * ``昨天 HH:MM`` -- yesterday at that local time
    * ``昨天`` -- 24 hours before now
    * ``今天更新`` / ``刚刚`` -- now
    * ``N天前``, ``N天``, ``N小时...``, ``N分钟...`` -- that long before now
    * ``M月D日[ HH:MM[:SS]]`` -- a date in ``year``
    * ``YYYY年M月D日[ HH:MM[:SS]]`` -- accepted only when ``YYYY == year``

    Args:
        time1: The label.  It is passed through ``str()`` and every
            ``发布`` / ``發布`` / ``發佈`` marker is removed first.
        year: Year assumed for labels that carry none, and required of
            labels that carry one.  Defaults to the current local year.

    Returns:
        int: Local-time epoch seconds, or ``0`` (after printing a notice)
        when the label names another year or is not recognised.

    Raises:
        ValueError: If the label has a recognised date shape but impossible
            values (for example ``2月30日``).
    """
    text = str(time1)
    for marker in ('发布', '發布', '發佈'):
        text = text.replace(marker, '')
    text = text.strip()

    now = int(time.time())
    day = 24 * 3600

    if text.startswith('昨'):
        clock = re.search(r'(\d{1,2}):(\d{1,2})', text)
        if clock:
            offset = int(clock.group(1)) * 3600 + int(clock.group(2)) * 60
            return (today_start() - day) + offset
        return now - day

    if text in ('今天更新', '刚刚', '剛剛'):
        return now

    # "<number><unit>..." relative labels; the unit right after the number
    # decides the scale, so the order of the alternatives does not matter.
    relative = re.match(r'(\d+)\s*(天|小时|小時|分钟|分鐘)', text)
    if relative:
        unit_seconds = {'天': day, '小时': 3600, '小時': 3600, '分钟': 60, '分鐘': 60}
        return now - int(relative.group(1)) * unit_seconds[relative.group(2)]

    if year is None:
        year = datetime.date.today().year

    # Absolute dates.  The month may have one or two digits; the year, the
    # trailing 日 and the clock part are all optional.
    stamp = re.match(
        r'(?:(\d{4})\s*年\s*)?(\d{1,2})\s*月\s*(\d{1,2})\s*日?'
        r'(?:\s*(\d{1,2}):(\d{1,2})(?::(\d{1,2}))?)?',
        text,
    )
    if stamp is None or (stamp.group(1) and int(stamp.group(1)) != year):
        # Unrecognised labels are reported the same way as other-year labels.
        print('不是今年的数据,不采集')
        return 0

    month, day_of_month = int(stamp.group(2)), int(stamp.group(3))
    hour, minute, second = (int(part or 0) for part in stamp.group(4, 5, 6))
    moment = datetime.datetime(year, month, day_of_month, hour, minute, second)
    return int(time.mktime(moment.timetuple()))
# Glyph -> digit table: the n-th character of the string below stands for the
# digit n, and the decimal point maps to itself.  The scraping loop looks up
# every character of a listing's price text in this table.
password_dict = {glyph: digit for digit, glyph in enumerate('/折万+%起元-¥时')}
password_dict['.'] = '.'
def save_mysql(title, updated_at, uid, address_id, username, money, mobile, created_at, prov_id, city_id,
               coun_id, lng,
               lat,
               service_typeid, descs, hot, list_tupian):
    """Store one scraped listing and its picture URLs in MySQL.

    Inserts one row into ``m_app_usedcar``, one row per picture URL into
    ``m_user_publish_file`` and one row into ``m_user_publish``.  The two
    later tables reference the auto-increment id produced by the first
    insert.  Everything is committed once at the end, and the connection is
    always closed, even when a statement fails.

    Args:
        title, updated_at, uid, address_id, username, money, mobile,
        created_at, prov_id, city_id, coun_id, lng, lat, service_typeid, hot:
            Values written to the ``m_app_usedcar`` columns of the same name.
        descs: Value written to the ``desc`` column.
        list_tupian: Iterable of picture URLs; may be empty.

    Raises:
        Whatever ``pymysql`` raises on connection or statement failure.
    """
    publish_type = '3'  # 1 = demand, 2 = second-hand goods, 3 = car, 4 = house

    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    connection = pymysql.Connect(
        host='140.210.4.73',
        port=3306,
        user='agr_sql',
        passwd='bj@#agr_sql',
        db='fa_admin',
        charset='utf8mb4'
    )
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                "INSERT INTO m_app_usedcar(title,updated_at,uid,address_id,username,money,mobile,"
                "created_at,prov_id,city_id,coun_id,lng,lat,service_typeid,`desc`,hot) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (title, updated_at, uid, address_id, username, money, mobile, created_at,
                 prov_id, city_id, coun_id, lng, lat, service_typeid, descs, hot),
            )
            # Read the generated id right away, before further inserts on
            # this connection replace it.
            ir_idd = connection.insert_id()
            print(ir_idd, type(ir_idd))

            # One file row per picture, all linked to the listing just created.
            cursor.executemany(
                "INSERT INTO m_user_publish_file(type, uid, created_at, url, publish_id) "
                "VALUES (%s, %s, %s, %s, %s)",
                [(publish_type, uid, created_at, url, ir_idd) for url in list_tupian],
            )
            print("<<<<<<<<<数据存储成功>>>>>>>>", )

            # NOTE(review): the original indentation was lost; this insert is
            # assumed to run once per listing, not once per picture -- confirm.
            cursor.execute(
                "INSERT INTO m_user_publish(type, uid, created_at, publish_id,prov_id,city_id,coun_id) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s)",
                (publish_type, uid, created_at, ir_idd, prov_id, city_id, coun_id),
            )
        # Single commit: if any statement above fails nothing is committed, so
        # no half-written listing is left behind (assumes transactional
        # tables, e.g. InnoDB -- TODO confirm).
        connection.commit()
        print("<<<<<<<<<m_user_publish数据存储成功>>>>>>>>", )
    finally:
        connection.close()
# Create the workbook
# NOTE(review): nothing visible in this file calls wbook.close(); check
# whether the spreadsheet is still needed and, if so, close it at the end.
wbook = xw.Workbook('二手车.xlsx')
wsheet1 = wbook.add_worksheet('Sheet1')  # create the worksheet
wsheet1.activate()  # make it the active sheet
title = ['title', 'money', '用户名', 'id', '星评', '产品名', '购买类型']  # header row
wsheet1.write_row('A1', title)  # write the header starting at cell A1
i = 2  # data rows start at row 2
option = webdriver.ChromeOptions()
# option.add_argument(r"user-data-dir=C:\Users\Administrator\AppData\Local\Chromium\Application")  # browser path

# Feature label -> id, used to build the `hot` value of a listing.
hot2 = {
    '电动座椅': 111,
    '座椅加热': 112,
    '无钥匙启动': 113,
    '方向盘换挡': 114,
    '倒车影像': 115,
    '电动折叠后视镜': 116,
    '全景天窗': 117,
    '全景影像': 118,
    '无钥匙进入': 119,
}

# Attribute label -> id, used to build the `service_typeid` value of a listing.
service_typeid2 = {
    '宝马': 6,
    '奥迪': 7,
    '1万公里以内': 9,
    '1万-2万公里': 10,
    '1年以内': 11,
    '1-2年': 12,
    '白色': 13,
    '黑色': 14,
    '手动': 15,
    '自动': 16,
    '新能源': 17,
    '燃油车': 18,
    '座椅通风': 19,
    '定速巡航': 20,
    '不限': 37,
    '国三': 38,
    '现代': 39,
    '100km以下': 41,
    '纯电动': 42,
    '增程式': 43,
    '终身': 44,
    '3年或10万公里': 45,
    '4年或12万公里': 46,
    '本田': 47,
    '深灰色': 48,
    '绿色': 49,
    '租赁': 50,
    '出售': 51,
    '国四': 65,
    '国五': 66,
    '国六': 67,
    '100-200km': 68,
    '200-300km': 69,
    '300-400km': 70,
    '400km以上': 71,
    '5年或10万公里': 72,
    '6年或15万公里': 73,
    '8年12万公里': 74,
    '8年或15万公里': 75,
    '8年或20万公里': 76,
    '大众': 77,
    '奔驰': 78,
    '丰田': 79,
    '吉利汽车': 80,
    '广汽传祺': 81,
    '比亚迪': 82,
    '别克': 83,
    '日产': 84,
    '长安': 85,
    '2万-3万公里': 86,
    '3万-4万公里': 87,
    '4万-5万公里': 88,
    '5万-6万公里': 89,
    '6万-7万公里': 90,
    '7万-8万公里': 91,
    '2-3年': 92,
    '3-4年': 93,
    '4-5年': 94,
    '5-6年': 95,
    '6-7年': 96,
    '红色': 97,
    '金色': 98,
    '黄色': 99,
    '咖啡色': 100,
    '粉红色': 101,
    '橙色': 102,
    '银灰色': 103,
    '蓝色': 104,
    '棕色': 105,
    '香槟色': 106,
    '银色': 107,
    '汽油': 108,
    '油电混合': 109,
    '柴油': 110,
}

# Poster accounts; one row is picked at random for every listing.  Fields in
# order: uid, username, mobile, prov_id, city_id, coun_id, lng, lat, address_id.
m_userb = [
    [1, '张三', '18201355004', '110000', '110100', '110106', '116.29560982612', '39.840624161575', '1'],
    [2, '我是曹野', '18600806657', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '2'],
    [3, '陈十一', '18600806656', '110000', '110100', '110106', '116.29768500168', '39.839177478909', '3'],
    [4, '清宁', '18600806655', '110000', '110100', '110106', '116.29595243984', '39.840078179251', '6'],
    [11, '过时的短靴', '18600806652', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '8'],
    [5, '曹野', '18600806654', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '15'],
    [6, '嘉平九', '13103657333', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '14'],
    [9, '山山', '18600806653', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '16'],
    [13, '合适', '12345678902', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '24'],
    [8, '陈建锋', '12345678901', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '18'],
    [10, '雨剑门秋', '12345678091', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '20'],
    [12, '无私的鞋垫', '19921478807', '110000', '110100', '110101', '116.42240097766', '39.934827272396', '21']
]

summ = 1  # running counter of processed listing slots
# No browser is started here: the page loop below creates its own driver for
# every page, so a driver opened at this point would never be used or closed.
def _decode_price(text):
    """Turn the obfuscated price text of a listing into the stored ``money``.

    Every character is translated through ``password_dict`` and the resulting
    decimal number is scaled by 1,000,000.  ``面议`` is stored as ``0``.

    Raises:
        KeyError: If a character is missing from ``password_dict``.
        ValueError: If the decoded text is not a number.
    """
    if '面议' in text:
        return 0
    decoded = ''.join(str(password_dict[ch]) for ch in text)
    # Scale before converting to int so the fractional part is not lost.
    return int(round(float(decoded) * 1000000))


def _lookup_ids(labels, table):
    """Map scraped labels to ids and return them as a comma-joined string.

    Labels that are not keys of ``table`` (including whitespace-only text
    nodes) are ignored.  Each id appears once, in first-seen order.
    """
    ids = []
    for label in labels:
        key = label.strip()
        if key in table:
            found = str(table[key])
            if found not in ids:
                ids.append(found)
    return ','.join(ids)


def _close_extra_tabs(driver, keep_handle):
    """Close every window except ``keep_handle`` and give it the focus."""
    for handle in driver.window_handles:
        if handle != keep_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(keep_handle)


def _scrape_listing(driver, listing_window, fog, money, updated_at):
    """Open listing number ``fog`` of the current page, scrape and store it.

    The listing is saved only when both the feature ids (``hot``) and the
    attribute ids (``service_typeid``) are non-empty.  Whatever happens,
    every tab opened along the way is closed and the focus returns to
    ``listing_window``.
    """
    driver.switch_to.window(listing_window)
    time.sleep(random.randint(3, 5))
    driver.execute_script('window.scrollBy(0,2200)')
    time.sleep(2)
    try:
        driver.find_element('xpath', f'/html/body/div[4]/div[4]/ul/li[{fog}]/div/a').click()
        # The detail page is expected to be the most recently opened window.
        driver.switch_to.window(driver.window_handles[-1])
        html1 = html.etree.HTML(driver.page_source)
        # Title
        title = html1.xpath('//h1[@class="info-title"]//text()')[0].strip()
        # Description
        descs = html1.xpath('//dd[@class="info-usr-desc_cont"]//text()')[0].strip()
        # Poster identity: one of the predefined accounts, chosen at random
        (uid, username, mobile, prov_id, city_id, coun_id,
         lng, lat, address_id) = random.choice(m_userb)
        created_at = ''.join(html1.xpath('//span[@class="info-post-date"]//text()')).strip()
        created_at = time_turn(str(time_turns(created_at)))
        print(created_at)

        # First choice: the full parameter table behind the "more" link.
        try:
            print('点击查看更多参数')
            driver.find_element('xpath', '/html/body/section[3]/a[2]').click()
            time.sleep(2)
            driver.execute_script('window.scrollBy(0,2200)')
            time.sleep(2)
            driver.switch_to.window(driver.window_handles[-1])
            html3 = html.etree.HTML(driver.page_source)
            cells = html3.xpath('//tbody//tr//td[@class="cs_cont"]//text()')
            hot = _lookup_ids(cells, hot2)
            service_typeid = _lookup_ids(cells, service_typeid2)
        except Exception:
            # Any failure here (e.g. the link is absent) just means the
            # detail-page fields below are used instead.
            hot = service_typeid = ''

        # Fallback: the summary fields on the detail page itself.
        if not (hot and service_typeid):
            print('显示更多参数')
            time.sleep(random.randint(3, 5))
            feature_labels = html1.xpath('//ul[@class="info-specialconfs h-clearfix"]//li//text()')
            if not feature_labels:
                feature_labels = html1.xpath(
                    '//ul[@class="info-meta-s h-clearfix"]//li[@class="info-meta"]//span[1]//text()')
            attribute_labels = html1.xpath(
                '//div[@class="info-confs"]//dd//span[@class="info-conf_value"]//text()')
            hot = _lookup_ids(feature_labels, hot2)
            service_typeid = _lookup_ids(attribute_labels, service_typeid2)

        list_tupian = html1.xpath(
            '//div[@class="info-pics h-clearfix"]//img//@data-original')
        print('热点', hot, type(hot))
        print('介绍', service_typeid, type(service_typeid))
        print(title, updated_at, uid, address_id, username, money, mobile, created_at, prov_id, city_id,
              coun_id, lng,
              lat,
              service_typeid, descs, hot, list_tupian)
        if service_typeid and hot:
            save_mysql(title, updated_at, uid, address_id, username, money, mobile, created_at, prov_id,
                       city_id, coun_id,
                       lng, lat,
                       service_typeid, descs, hot, list_tupian)
    finally:
        _close_extra_tabs(driver, listing_window)


for urlpage in range(10, 101):
    # One fresh browser per result page; it is always quit when the page is done.
    browser = webdriver.Chrome(options=option)
    try:
        time.sleep(random.randint(3, 8))
        browser.get(
            f'https://bj.58.com/ershouche/pn{urlpage}/?PGTID=0d100000-0000-1d61-a85d-b30f1ba39c06&ClickID=65'
        )
        print(f'**********************第{urlpage}页******************************')
        browser.maximize_window()
        listing_window = browser.current_window_handle
        # The result page is parsed once; prices are read from this snapshot.
        html2 = html.etree.HTML(browser.page_source)
        updated_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        for fog in range(1, 21):
            # Listed price
            price_nodes = html2.xpath(f'/html/body/div[4]/div[4]/ul/li[{fog}]/div/a/div[3]/b//text()')
            if price_nodes:
                try:
                    money = _decode_price(price_nodes[0].strip())
                    _scrape_listing(browser, listing_window, fog, money, updated_at)
                    print('+++++++++++++++++第', summ, '条数据入库+++++++++++++++++')
                except Exception as e:
                    # A broken listing must not stop the remaining ones.
                    print(e)
            summ += 1
            time.sleep(random.randint(3, 4))
    finally:
        browser.quit()