爬取西刺代理并存入mysql
#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/14
"""Crawl the xicidaili.com proxy list with headless Chrome and store it in MySQL.

Each cycle walks the listing pages, inserts every proxy whose IP address is
not yet in the ``ip_proxy`` table, then calls ``filter_db()`` and sleeps
before starting the next cycle.
"""
import logging
import time

import pymysql
import requests  # noqa: F401  (unused here; kept in case other code relies on it)
from bs4 import BeautifulSoup  # noqa: F401  (unused here; kept for the same reason)
from pymysql import OperationalError
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

from ip_proxy.ip_proxy_filter import filter_db

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')  # browser resolution
chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this flag to work around a bug
chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images to load faster
# No visible window; on Linux without display support Chrome fails to start
# unless this flag is set.
chrome_options.add_argument('--headless')
# To pin the browser binary by hand:
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"

# NOTE: no browser is started at import time. main() creates one per crawl
# cycle and always shuts it down, so no stray Chrome process is left behind.

logger = logging.getLogger(__file__)

LOG_PATH = 'D:/py3code/jintong_day1/aaa/cnstock.log'

DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '',
    'db': 'haha',
    'charset': 'utf8',
}

LIST_URL = 'http://www.xicidaili.com/nn/{}'
ROW_XPATH = '//*[@id="ip_list"]/tbody/tr'
FIRST_PAGE = 1
LAST_PAGE = 49
PAGE_DELAY_SECONDS = 2
CYCLE_DELAY_SECONDS = 300

# Zero-based <td> positions of: ip_address, port, server_address,
# whether_anonymous, type_, live_time, proof_time.
FIELD_COLUMNS = (1, 2, 3, 4, 5, 8, 9)

SELECT_SQL = "SELECT ip_address FROM ip_proxy WHERE ip_address = %s"
INSERT_SQL = (
    "INSERT INTO ip_proxy "
    "(ip_address,port,server_address,whether_anonymous,type_,live_time,proof_time) "
    "VALUES (%s,%s,%s,%s,%s,%s,%s)"
)


def save_log():
    """Attach a file handler and a console handler to the module logger.

    The function is idempotent: once the logger has handlers, further calls
    return immediately. Without that guard every call would add another pair
    of handlers, duplicating each log line and leaking a file handle.
    """
    if logger.handlers:
        return

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S %p',
    )

    file_handler = logging.FileHandler(LOG_PATH, encoding='utf8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    logger.setLevel(logging.INFO)


def save_db(details):
    """Insert one proxy record into ``ip_proxy`` unless its IP is already stored.

    Args:
        details: A one-element list whose only item is the record
            ``[ip_address, port, server_address, whether_anonymous, type_,
            live_time, proof_time]``. The argument is not modified.

    Raises:
        OperationalError: If connecting or the lookup query fails. A failure
            of the INSERT itself is logged and swallowed, so one bad row does
            not abort the crawl.
    """
    record = details[0]
    ip_address = record[0]

    connection = pymysql.connect(**DB_CONFIG)
    try:
        with connection.cursor() as cursor:
            # Parameterized queries: the driver quotes and escapes the values,
            # so scraped text can never alter the SQL statement.
            cursor.execute(SELECT_SQL, (ip_address,))
            if cursor.fetchone() is not None:
                logger.info('已有数据')
                return

            try:
                cursor.execute(INSERT_SQL, tuple(record))
                connection.commit()
            except OperationalError:
                logger.exception('insert into ip_proxy failed for %s', ip_address)
                return
            logger.info('成功插入cnstock 1 条数据')
    finally:
        # Runs on every path, so the connection is released even on errors.
        connection.close()


def _extract_record(row):
    """Return the seven stored fields of one table row, or None if it is too short."""
    cells = row.find_elements_by_xpath('./td')
    if len(cells) <= max(FIELD_COLUMNS):
        return None
    return [cells[column].text for column in FIELD_COLUMNS]


def _crawl_page(driver, page):
    """Load one listing page and store every proxy row found on it."""
    url = LIST_URL.format(page)
    try:
        driver.get(url)
    except WebDriverException as e:
        print(e)
        # The browser still shows the previous page; do not scrape it again.
        return

    # NOTE(review): find_elements_by_xpath matches the Selenium 3 API this
    # file was written for; Selenium 4.3+ needs find_elements(By.XPATH, ...).
    rows = driver.find_elements_by_xpath(ROW_XPATH)
    print(len(rows))

    # The first <tr> is skipped, as in the original row index range 2..len.
    for row in rows[1:]:
        record = _extract_record(row)
        if record is None:
            continue
        try:
            save_db([record])
        except OperationalError as e:
            print(e)


def main():
    """Run the crawl / filter cycle forever.

    Every cycle starts a fresh headless Chrome, crawls pages FIRST_PAGE to
    LAST_PAGE with a short pause between page fetches, shuts the browser
    down, calls ``filter_db()`` and then sleeps CYCLE_DELAY_SECONDS.
    """
    save_log()
    while True:
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            for page in range(FIRST_PAGE, LAST_PAGE + 1):
                _crawl_page(driver, page)
                time.sleep(PAGE_DELAY_SECONDS)
        finally:
            # quit() ends the whole session, including the chromedriver
            # process, and runs even when the crawl raises.
            driver.quit()

        try:
            filter_db()
        except OperationalError as e:
            print(e)

        time.sleep(CYCLE_DELAY_SECONDS)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/15
"""Prune the ``ip_proxy`` table, keeping only proxies alive for under 6 minutes."""
import logging
import re

import pymysql

logger = logging.getLogger(__name__)

DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '',
    'db': 'haha',
    'charset': 'utf8',
}

# A row is kept only when its live time is below this many minutes.
MAX_LIVE_MINUTES = 6
MINUTE_UNIT = '分钟'

# First run of digits followed by the non-digit text after it (the unit).
_LIVE_TIME_RE = re.compile(r'(\d+)(\D+)')

SELECT_SQL = "SELECT DISTINCT live_time FROM ip_proxy"
DELETE_SQL = "DELETE FROM ip_proxy WHERE live_time = %s"


def _parse_live_time(live_time):
    """Split a live_time value into ``(amount, unit)``; return None if unparsable."""
    if not isinstance(live_time, str):
        return None
    match = _LIVE_TIME_RE.search(live_time)
    if match is None:
        return None
    return int(match.group(1)), match.group(2)


def filter_db():
    """Delete every ``ip_proxy`` row whose live time is not under 6 minutes.

    A row survives only when its ``live_time`` parses as a number below
    MAX_LIVE_MINUTES followed by the unit ``分钟``. Values that cannot be
    parsed are left untouched and reported with a warning instead of raising.
    All deletions are committed together.

    Raises:
        pymysql.MySQLError: If connecting or running a statement fails; the
            connection is closed in every case.
    """
    connection = pymysql.connect(**DB_CONFIG)
    try:
        with connection.cursor() as cursor:
            cursor.execute(SELECT_SQL)

            expired = []
            for (live_time,) in cursor.fetchall():
                parsed = _parse_live_time(live_time)
                if parsed is None:
                    logger.warning('unparsable live_time left in place: %r', live_time)
                    continue
                amount, unit = parsed
                if not (amount < MAX_LIVE_MINUTES and unit == MINUTE_UNIT):
                    expired.append((live_time,))

            if expired:
                # Parameterized delete: the driver escapes each value.
                cursor.executemany(DELETE_SQL, expired)
                connection.commit()
    finally:
        connection.close()


# Run only when executed as a script. Importing this module no longer opens a
# database connection or deletes rows as a side effect.
if __name__ == '__main__':
    filter_db()