Selenium crawler
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from lxml import etree
from selenium.webdriver.support import expected_conditions as EC
import datetime
import pymysql as MySQLdb
import random

# Initialization: open the browser and the start page.
def openUrl(url):
    driver = webdriver.Firefox()
    driver.get(url)  # open the page
    time.sleep(5)
    return driver
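WebDriverWait, By, and expected_conditions are imported above but never used; every wait in the script is a fixed time.sleep(). Below is a minimal sketch of an explicit wait that could replace the fixed sleep; the 10-second timeout and the body-tag locator are assumptions, not taken from the original.

# Hedged alternative to openUrl: block until the page <body> is present
# instead of sleeping a fixed 5 seconds. The timeout value is an assumption.
def open_url_with_wait(url, timeout=10):
    driver = webdriver.Firefox()
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, 'body'))
    )
    return driver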
def get_app_data(dt, package, req_num):
    # Android: search page for this app package.
    url = 'https://aso100.com/search/android?country=cn&search=' + package
    # e.g. https://aso100.com/search/android?country=cn&search=me.ht.local.hot
    driver.get(url)  # redirects to the app detail page
    time.sleep(2.0)
    driver.refresh()
    # The site may have no app matching this package.
    try:
        # Extract the appid from the redirected URL.
        current_url = driver.current_url
        appid = current_url.split('/appid/')[1]
        day_url = 'https://aso100.com/andapp/downDay/appid/' + str(appid)
        total_url = 'https://aso100.com/andapp/downTotal/appid/' + str(appid)
        # XPaths for the download counts and the app metadata.
        down_xpath = '//*[@id="container"]/table/tbody/tr/td[2]/span/text()'
        company_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/div[1]/div[1]/p[2]/text()'
        cate_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/p[2]/text()'
        appname_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/h3/text()'
        # Daily new downloads plus company, category, and app name.
        driver.get(day_url)
        page_html = etree.HTML(driver.page_source)
        day_num = page_html.xpath(down_xpath)
        company = page_html.xpath(company_xpath)
        category = page_html.xpath(cate_xpath)
        appname = page_html.xpath(appname_xpath)
        time.sleep(1.0)
        # Total downloads.
        driver.get(total_url)
        page_html = etree.HTML(driver.page_source)
        total_num = page_html.xpath(down_xpath)
        line = (str(appid), appname[0], category[0], str(dt),
                day_num[0], total_num[0], company[0], package, int(req_num))
    except Exception:
        time.sleep(1)
        if driver.current_url == 'https://aso100.com/error/ipLImit':
            print('============== slider-captcha page detected, manual verification needed =====================')
            print('Waiting 40s: go drag the slider')
            line = 'error'
            time.sleep(40)
        else:
            print('No app matching this package on this site')
            line = (None, None, None, str(dt), None, None, None, package, int(req_num))
    print(line)
    return line
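get_app_data gives up after a single slider-captcha hit and returns 'error', so that package is lost for the run. A hedged retry wrapper is sketched below; it assumes the 40-second pause inside get_app_data gives enough time for manual verification, and the retry count is an assumption.

# Hypothetical helper: retry a package whose attempt hit the captcha page.
def get_app_data_with_retry(dt, package, req_num, max_retries=3):
    for attempt in range(max_retries):
        line = get_app_data(dt, package, req_num)
        if line != 'error':
            return line
        print('retry %d/%d for %s' % (attempt + 1, max_retries, package))
    return 'error'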
# Read the day's packages from MySQL.
def read_app_data(dt):
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    # Fetch every (dt, package, req_num) row for the given day; the query is
    # parameterized to avoid SQL injection via dt.
    sql = ("select dt,package,req_num from black_app_analy "
           "where dt = %s and req_num > 10000")
    print(sql, dt)
    cur.execute(sql, (dt,))
    info = cur.fetchall()
    cur.close()
    conn.close()
    return info
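For reference, a small usage sketch of read_app_data; the dt value mirrors the one hard-coded in run().

# Fetch the packages queued for 2017-08-10 with more than 10000 requests.
rows = read_app_data('20170810')
for dt, package, req_num in rows:
    print(dt, package, req_num)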
# Write the crawled rows to the database.
def saveToMysql(tic_data):
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    try:
        # Insert multiple rows in a single batch.
        print('Inserting rows')
        sql_base = 'insert into black_app_download values (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cur.executemany(sql_base, tic_data)
        # conn.commit() is required: without it the inserted rows are never
        # actually persisted.
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        # Close the cursor and the connection even when the insert fails.
        cur.close()
        conn.close()
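Re-running the crawler for the same dt would insert duplicate rows. One option is an upsert; the sketch below assumes black_app_download has a unique key covering (package, dt) and columns named day_num and total_num, none of which the original confirms.

# Hypothetical upsert variant of sql_base; key and column names are assumed.
sql_upsert = (
    'insert into black_app_download values (%s,%s,%s,%s,%s,%s,%s,%s,%s) '
    'on duplicate key update day_num = values(day_num), '
    'total_num = values(total_num)'
)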
# Read the packages that have already been crawled, so they can be skipped.
def read_crawled_package():
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    cur.execute("select package from black_app_download")
    info = cur.fetchall()
    cur.close()
    conn.close()
    success_package = []
    for i in info:
        success_package.append(i[0])
    return success_package
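run() tests membership (package in success_package) for every row; with a list that is a linear scan. Returning a set makes the test O(1), a cheap change since order is never used.

# Hedged micro-optimization: wrap the result in a set for O(1) lookups.
def read_crawled_package_set():
    return set(read_crawled_package())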
def run():
    url = 'https://aso100.com/'
    global driver
    driver = openUrl(url)
    # Packages whose data has already been crawled.
    success_package = read_crawled_package()
    # Today's date.
    now = datetime.datetime.now()
    # Normally dt would be yesterday; temporarily commented out:
    #dt = (now + datetime.timedelta(days=-1)).strftime('%Y%m%d')
    # Hard-coded dt for this run.
    dt = '20170810'
    appdata_save = []  # crawled app rows buffered for the database
    # Packages that need to be crawled.
    appdatas = read_app_data(dt)
    j = 0
    for appdata in appdatas:
        j = j + 1
        # After about 16 consecutive requests the site flags the crawler.
        dt = appdata[0]
        package = appdata[1]
        req_num = appdata[2]
        if package == '' or package == '-1':
            print('Invalid package:', appdata[1])
        elif package in success_package:
            # Already-crawled apps are not crawled again.
            print('This package has already been crawled:', appdata[1])
        else:
            print('Crawling:', appdata[1])
            # Randomize the delay to reduce the chance of bot detection.
            waittime = [1.5, 1.2, 1.8]
            sleeptime = random.choice(waittime)
            time.sleep(sleeptime)
            # Run the actual crawl.
            line = get_app_data(dt, package, req_num)
            if line == 'error':
                print(appdata[1], 'hit the slider-captcha page')
            else:
                appdata_save.append(line)
                print(appdata[1], 'crawled successfully')
        # Flush the buffer to MySQL every 10 packages.
        if j % 10 == 0:
            print('Writing to the database')
            saveToMysql(appdata_save)
            appdata_save = []  # clear the buffer once rows are saved
            time.sleep(2)
    # Flush whatever is left at the end.
    saveToMysql(appdata_save)

if __name__ == '__main__':
    run()
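The three-value waittime list in run() produces only three distinct delays, which is an easy timing fingerprint. A hedged sketch using a continuous random interval instead; the bounds are assumptions.

# Hypothetical replacement for random.choice(waittime): continuous jitter.
def polite_sleep(low=1.0, high=2.5):
    time.sleep(random.uniform(low, high))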
Cultivate the self through tranquility; nurture virtue through frugality!