1、环境:Python 2.7 + Selenium + PhantomJS + MySQL(请自行安装)
2、详细看代码,简单易懂
3、数据库连接的是本地数据库,测试时改成自己的数据库配置即可
4、贴上代码
# -*- coding: utf-8 -*-
#
# Module setup for the scraper: imports, the shared MySQL connection and
# cursor, the PhantomJS settings, and the helpers for the db_cp table.
#
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import urllib2
import sys
import pymysql
from bs4 import BeautifulSoup
import random

# Python 2 only: make utf-8 the implicit str <-> unicode codec.
reload(sys)
sys.setdefaultencoding('utf-8')

# Connect to the database.
conn = pymysql.Connection(
    host="localhost",
    user="root",
    passwd="root",
    db='test',
    charset="UTF8",
)
# Create the cursor shared by the helpers below.
cursor = conn.cursor()

# PhantomJS capabilities, with image loading switched off.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.loadImages"] = False
phantomjs_driver_path = 'E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe'


def ready():
    """Create the ``db_cp`` table if it does not exist yet.

    The statement is a no-op when the table is already there, so an
    unrelated database error is no longer mistaken for "table missing".
    Database errors propagate to the caller.
    """
    # NOTE(review): insert_tb only supplies ``number``, so ``id`` is never
    # populated by this module -- confirm whether it should auto-increment.
    cursor.execute('CREATE TABLE IF NOT EXISTS db_cp(id int,number varchar(255))')
    # Autocommit is not enabled on the connection, so commit explicitly.
    conn.commit()


def insert_tb(list):
    """Insert every item of ``list`` into the ``number`` column of ``db_cp``.

    Args:
        list: sequence of values, one inserted row per item. The name
            shadows the builtin but is kept so existing callers still work.

    Returns:
        None. On a database error the transaction is rolled back and the
        failure is printed instead of raised.
    """
    try:
        cursor.executemany("insert into db_cp(number)values(%s)", list)
        # Commit, otherwise the inserted rows are not saved.
        conn.commit()
    except pymysql.MySQLError as e:
        # Discard the partial batch and report why it failed.
        conn.rollback()
        print('Add this db fault! %s' % e)
def line():
    """Print an 80-dash separator followed by the start-up banner."""
    print('-'*80)
    print('Strating...............')


def get_html(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: address to fetch.

    Returns:
        The response body as read from urllib2, or ``None`` when urllib2
        raises ``URLError`` (the reason is printed).
    """
    # Send a desktop-browser User-Agent with the request.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print('%s %s' % (url + 'Download error:', e.reason))
        return None


def get_bt(url):
    """Open ``url`` in PhantomJS and return the numbers joined with '-'.

    The text of every ``span`` under the element with id ``redsId`` is
    collected first, then the text of the element with id ``blueId`` is
    appended.

    Args:
        url: detail page to load.

    Returns:
        The collected texts joined with '-'.

    Errors raised by selenium (for example a missing element) propagate;
    the browser process is shut down in every case.
    """
    print('starting get data in '+url+'\n')
    driver = webdriver.PhantomJS(phantomjs_driver_path, desired_capabilities=dcap)
    try:
        # Load the page and read its content.
        driver.get(url)
        red_spans = driver.find_element_by_id('redsId').find_elements_by_tag_name('span')
        # Collect the numbers.
        parts = []
        for span in red_spans:
            text = span.text
            print(text)
            parts.append(text)
        number = "-".join(parts)
        blue_ball = driver.find_element_by_id('blueId')
        number += '-'+str(blue_ball.text)
        return number
    finally:
        # Without quit() every call leaves a PhantomJS process running.
        driver.quit()


def main(max_page=1):
    """Scrape the listing pages and store the numbers of each linked page.

    Args:
        max_page: last listing page to visit, starting from page 1. The
            default of 1 visits only the first page.
    """
    line()
    # ready()  # uncomment on first run to create the db_cp table
    for page in range(1, max_page + 1):
        if page == 1:
            url = 'http://www.zhcw.com/ssq/kjgg/'
        else:
            url = 'http://www.zhcw.com/ssq/kjgg/index_'+str(page)+'.shtml'
        html = get_html(url)
        if html is None:
            # Download failed (already reported by get_html); skip this page.
            continue
        obj = BeautifulSoup(html, 'html.parser')
        number_list = []
        for span in obj.find_all('span', {'class':'Nlink'}):
            link = span.find('a')
            if link is None or not link.get('href'):
                # No usable link inside this span.
                continue
            print('-'*80)
            new_href = 'http://www.zhcw.com'+link['href']
            number = get_bt(new_href)
            number_list.append(number)
            print(number)
        # The helper defined in this file is insert_tb; ``insert`` does not exist.
        insert_tb(number_list)


def test():
    """Ad-hoc check: download one fixed URL and print the response."""
    html = get_html('http://www.foods1.com/TurnImg/mobile')
    print(html)


# Script entry point.
if __name__ == '__main__':
    main()
今ならできます。