Selenium crawler
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from lxml import etree
from selenium.webdriver.support import expected_conditions as EC
import datetime
import pymysql as MySQLdb
import random

# Initialization: open the browser and the start page.
def openUrl(url):
    driver = webdriver.Firefox()
    driver.get(url)  # open the page
    time.sleep(5)
    return driver
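WebDriverWait, By, and expected_conditions are imported above but never used; every wait in the script is a fixed time.sleep(). Below is a minimal sketch of an explicit wait that could replace the fixed sleep; the 10-second timeout and the body-tag locator are assumptions, not taken from the original.

# Hedged alternative to openUrl: block until the page <body> is present
# instead of sleeping a fixed 5 seconds. The timeout value is an assumption.
def open_url_with_wait(url, timeout=10):
    driver = webdriver.Firefox()
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, 'body'))
    )
    return driver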
def get_app_data(dt, package, req_num):
    # Android: search page for this app package.
    url = 'https://aso100.com/search/android?country=cn&search=' + package
    # e.g. https://aso100.com/search/android?country=cn&search=me.ht.local.hot
    driver.get(url)  # redirects to the app detail page
    time.sleep(2.0)
    driver.refresh()
    # The site may have no app matching this package.
    try:
        # Extract the appid from the redirected URL.
        current_url = driver.current_url
        appid = current_url.split('/appid/')[1]
        day_url = 'https://aso100.com/andapp/downDay/appid/' + str(appid)
        total_url = 'https://aso100.com/andapp/downTotal/appid/' + str(appid)
        # XPaths for the download counts and the app metadata.
        down_xpath = '//*[@id="container"]/table/tbody/tr/td[2]/span/text()'
        company_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/div[1]/div[1]/p[2]/text()'
        cate_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/p[2]/text()'
        appname_xpath = '//*[@id="app"]/div[3]/div[2]/div[1]/div/h3/text()'
        # Daily new downloads plus company, category, and app name.
        driver.get(day_url)
        page_html = etree.HTML(driver.page_source)
        day_num = page_html.xpath(down_xpath)
        company = page_html.xpath(company_xpath)
        category = page_html.xpath(cate_xpath)
        appname = page_html.xpath(appname_xpath)
        time.sleep(1.0)
        # Total downloads.
        driver.get(total_url)
        page_html = etree.HTML(driver.page_source)
        total_num = page_html.xpath(down_xpath)
        line = (str(appid), appname[0], category[0], str(dt),
                day_num[0], total_num[0], company[0], package, int(req_num))
    except Exception:
        time.sleep(1)
        if driver.current_url == 'https://aso100.com/error/ipLImit':
            print('============== slider-captcha page detected, manual verification needed =====================')
            print('Waiting 40s: go drag the slider')
            line = 'error'
            time.sleep(40)
        else:
            print('No app matching this package on this site')
            line = (None, None, None, str(dt), None, None, None, package, int(req_num))
    print(line)
    return line
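get_app_data gives up after a single slider-captcha hit and returns 'error', so that package is lost for the run. A hedged retry wrapper is sketched below; it assumes the 40-second pause inside get_app_data gives enough time for manual verification, and the retry count is an assumption.

# Hypothetical helper: retry a package whose attempt hit the captcha page.
def get_app_data_with_retry(dt, package, req_num, max_retries=3):
    for attempt in range(max_retries):
        line = get_app_data(dt, package, req_num)
        if line != 'error':
            return line
        print('retry %d/%d for %s' % (attempt + 1, max_retries, package))
    return 'error'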
# Read the day's packages from MySQL.
def read_app_data(dt):
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    # Fetch every (dt, package, req_num) row for the given day; the query is
    # parameterized to avoid SQL injection via dt.
    sql = ("select dt,package,req_num from black_app_analy "
           "where dt = %s and req_num > 10000")
    print(sql, dt)
    cur.execute(sql, (dt,))
    info = cur.fetchall()
    cur.close()
    conn.close()
    return info
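For reference, a small usage sketch of read_app_data; the dt value mirrors the one hard-coded in run().

# Fetch the packages queued for 2017-08-10 with more than 10000 requests.
rows = read_app_data('20170810')
for dt, package, req_num in rows:
    print(dt, package, req_num)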
# Write the crawled rows to the database.
def saveToMysql(tic_data):
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    try:
        # Insert multiple rows in a single batch.
        print('Inserting rows')
        sql_base = 'insert into black_app_download values (%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cur.executemany(sql_base, tic_data)
        # conn.commit() is required: without it the inserted rows are never
        # actually persisted.
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        # Close the cursor and the connection even when the insert fails.
        cur.close()
        conn.close()
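Re-running the crawler for the same dt would insert duplicate rows. One option is an upsert; the sketch below assumes black_app_download has a unique key covering (package, dt) and columns named day_num and total_num, none of which the original confirms.

# Hypothetical upsert variant of sql_base; key and column names are assumed.
sql_upsert = (
    'insert into black_app_download values (%s,%s,%s,%s,%s,%s,%s,%s,%s) '
    'on duplicate key update day_num = values(day_num), '
    'total_num = values(total_num)'
)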
# Read the packages that have already been crawled, so they can be skipped.
def read_crawled_package():
    conn = MySQLdb.connect(
        host="xxx",
        port=3306,
        user="xxx",
        passwd="xxx",
        db="xxx",
        charset="utf8"
    )
    # Create a cursor.
    cur = conn.cursor()
    cur.execute("select package from black_app_download")
    info = cur.fetchall()
    cur.close()
    conn.close()
    success_package = []
    for i in info:
        success_package.append(i[0])
    return success_package
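run() tests membership (package in success_package) for every row; with a list that is a linear scan. Returning a set makes the test O(1), a cheap change since order is never used.

# Hedged micro-optimization: wrap the result in a set for O(1) lookups.
def read_crawled_package_set():
    return set(read_crawled_package())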
def run():
    url = 'https://aso100.com/'
    global driver
    driver = openUrl(url)
    # Packages whose data has already been crawled.
    success_package = read_crawled_package()
    # Today's date.
    now = datetime.datetime.now()
    # Normally dt would be yesterday; temporarily commented out:
    #dt = (now + datetime.timedelta(days=-1)).strftime('%Y%m%d')
    # Hard-coded dt for this run.
    dt = '20170810'
    appdata_save = []  # crawled app rows buffered for the database
    # Packages that need to be crawled.
    appdatas = read_app_data(dt)
    j = 0
    for appdata in appdatas:
        j = j + 1
        # After about 16 consecutive requests the site flags the crawler.
        dt = appdata[0]
        package = appdata[1]
        req_num = appdata[2]
        if package == '' or package == '-1':
            print('Invalid package:', appdata[1])
        elif package in success_package:
            # Already-crawled apps are not crawled again.
            print('This package has already been crawled:', appdata[1])
        else:
            print('Crawling:', appdata[1])
            # Randomize the delay to reduce the chance of bot detection.
            waittime = [1.5, 1.2, 1.8]
            sleeptime = random.choice(waittime)
            time.sleep(sleeptime)
            # Run the actual crawl.
            line = get_app_data(dt, package, req_num)
            if line == 'error':
                print(appdata[1], 'hit the slider-captcha page')
            else:
                appdata_save.append(line)
                print(appdata[1], 'crawled successfully')
        # Flush the buffer to MySQL every 10 packages.
        if j % 10 == 0:
            print('Writing to the database')
            saveToMysql(appdata_save)
            appdata_save = []  # clear the buffer once rows are saved
            time.sleep(2)
    # Flush whatever is left at the end.
    saveToMysql(appdata_save)

if __name__ == '__main__':
    run()
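The three-value waittime list in run() produces only three distinct delays, which is an easy timing fingerprint. A hedged sketch using a continuous random interval instead; the bounds are assumptions.

# Hypothetical replacement for random.choice(waittime): continuous jitter.
def polite_sleep(low=1.0, high=2.5):
    time.sleep(random.uniform(low, high))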
Cultivate the self through tranquility; nurture virtue through frugality!