不爱贞子爱爽子
バキューン

1、环境:python2.7 + selenium + PhantomJS + mysql,另需 pymysql 和 BeautifulSoup4(均需自行安装)

2、详细看代码,简单易懂

3、代码中连接的是本地数据库,测试时把连接参数改成自己的即可

4、贴上代码

# -*- coding: utf-8 -*-
# 
# 
#
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib2 
import sys
import pymysql
from bs4 import BeautifulSoup
import random
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Connect to the database (local test credentials; change them to your own)
conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test',charset="UTF8")
# Create the cursor shared by the database helpers below
cursor = conn.cursor()
# Copy the default PhantomJS capabilities and turn image loading off
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.loadImages"] = False
# Location of the PhantomJS executable (Windows path; adjust for your machine)
phantomjs_driver_path = 'E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe'
##### Make sure the db_cp table exists (create it when it is missing)
def ready():
    """Make sure the ``db_cp`` table exists, creating it when it is missing.

    Probes the table with a SELECT on the module-level ``cursor``. When the
    probe raises a MySQL error the table is assumed to be missing and is
    created with the columns ``id int`` and ``number varchar(255)``.
    """
    try:
        # Probe query: it fails when db_cp does not exist yet.
        cursor.execute('select number from db_cp')
        # Drain the result set; the rows themselves are not needed.
        cursor.fetchall()
    except pymysql.MySQLError:
        # The probe failed, which is taken to mean the table is missing.
        cursor.execute('CREATE TABLE db_cp(id int,number varchar(255))')
    # Autocommit is not enabled on the connection, so commit explicitly.
    conn.commit()

#########
def insert_tb(list):
    """Bulk-insert rows into the ``number`` column of ``db_cp``.

    :param list: sequence of parameter sets handed to
        ``cursor.executemany``, one per row to insert. The name shadows the
        builtin but is kept so existing callers are unaffected.

    A failed insert is reported on stdout and rolled back; the error is not
    re-raised, so the caller keeps running.
    """
    try:
        cursor.executemany("insert into db_cp(number)values(%s)", list)
        # Commit, otherwise the new rows are not saved.
        conn.commit()
    except Exception as e:
        print('Add this db fault!')
        # Show the underlying error instead of hiding it.
        print(e)
        # Discard any partially applied batch.
        conn.rollback()


### Print the separator line and the start-up banner
def line():
    """Print an 80-character dashed rule followed by the start-up banner."""
    separator = '-' * 80
    for text in (separator, 'Strating...............'):
        print(text)


##### Download the HTML of a page
def get_html(url, timeout=30):
    """Download ``url`` and return the raw response body.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait on the connection before giving up, so
        a stalled server cannot hang the script indefinitely.
    :return: the response body as returned by ``read()``, or ``None`` when
        ``urlopen`` raises ``urllib2.URLError`` (the error is printed).
    """
    # Send a desktop Chrome User-Agent header with the request.
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    request = urllib2.Request(url, headers={"User-Agent": user_agent})
    try:
        response = urllib2.urlopen(request, timeout=timeout)
    except urllib2.URLError as e:
        print(url + 'Download error: ' + str(e.reason))
        return None
    try:
        return response.read()
    finally:
        # Release the underlying socket even if read() fails.
        response.close()
###### Scrape one detail page with PhantomJS
def get_bt(url):
    """Open one detail page in PhantomJS and return its numbers as a string.

    Loads ``url`` in a fresh PhantomJS browser, reads the text of every
    ``<span>`` inside the element with id ``redsId`` and the text of the
    element with id ``blueId``.

    :param url: address of the detail page to open.
    :return: the span texts joined with ``-``, followed by ``-`` and the
        ``blueId`` text.
    """
    print('starting get  data in '+url+'\n')
    driver = webdriver.PhantomJS(phantomjs_driver_path, desired_capabilities=dcap)
    try:
        driver.get(url)
        red_spans = driver.find_element_by_id('redsId').find_elements_by_tag_name('span')
        red_texts = []
        for span in red_spans:
            print(span.text)
            red_texts.append(span.text)
        blue_text = driver.find_element_by_id('blueId').text
        return "-".join(red_texts) + '-' + blue_text
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        driver.quit()


#### Main function
def main(page_count=1):
    """Walk the draw-result listing pages and store the numbers found.

    For every listing page, follows each link found inside a
    ``<span class="Nlink">``, scrapes the linked detail page with
    ``get_bt`` and inserts the collected numbers with ``insert_tb``.

    :param page_count: number of listing pages to walk, starting at page 1.
        Defaults to 1, i.e. only the first page.
    """
    line()
    # NOTE: ready() is not called here, so the db_cp table must already exist.
    for page in range(1, page_count + 1):
        # The first page has no index suffix; later pages use index_<n>.shtml.
        if page == 1:
            url = 'http://www.zhcw.com/ssq/kjgg/'
        else:
            url = 'http://www.zhcw.com/ssq/kjgg/index_'+str(page)+'.shtml'
        html = get_html(url)
        if html is None:
            # get_html returns None after printing a download error.
            continue
        obj = BeautifulSoup(html, 'html.parser')
        number_list = []
        for span in obj.find_all('span', {'class':'Nlink'}):
            anchor = span.find('a')
            if anchor is None or not anchor.get('href'):
                # No usable link inside this span; nothing to scrape.
                continue
            print('-'*80)
            number = get_bt('http://www.zhcw.com'+anchor['href'])
            # One-element tuple per row: the parameter set for executemany.
            number_list.append((number,))
            print(number)
        insert_tb(number_list)

def test():
    """Fetch a fixed sample URL with ``get_html`` and print the result."""
    print(get_html('http://www.foods1.com/TurnImg/mobile'))

### Database operations
#### Run the main function only when the file is executed as a script
if __name__ == '__main__':
    main()

 

posted on 2018-01-02 14:07  不爱贞子爱爽子  阅读(727)  评论(0)  编辑  收藏  举报

! !