爬取西刺代理并存入 MySQL

 

 

#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/14

import requests
import time
from bs4 import BeautifulSoup
import pymysql
from pymysql import OperationalError
from selenium import webdriver
import logging
from ip_proxy.ip_proxy_filter import filter_db
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

# Configure headless Chrome so the crawler can run on a server without a display.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') # set the browser window resolution
chrome_options.add_argument('--disable-gpu') # Google's docs recommend this flag to work around a bug
chrome_options.add_argument('--hide-scrollbars') # hide scrollbars to cope with some special pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # skip image loading to speed up fetches
chrome_options.add_argument('--headless') # no visible browser window; required on Linux hosts without a display
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" # manually point at the Chrome binary


# Module-level browser instance. NOTE(review): main() creates its own driver
# each cycle, so this one appears unused after startup — confirm before removing.
driver=webdriver.Chrome(chrome_options=chrome_options)
#
# driver=webdriver.Chrome()
#
# Module logger keyed by file path; handlers are attached later by save_log().
logger=logging.getLogger(__file__)



def save_log():
    """Attach a file handler and a console handler to the module logger.

    Bug fix: this function is invoked once per crawled page inside
    main()'s loop. The original appended new handlers on every call,
    so each log record was emitted N times after N pages. It now
    returns immediately when handlers are already attached.
    """
    if logger.handlers:
        # Handlers already configured — do not stack duplicates.
        return

    # Handler objects receive records from the logger and control output.
    fh = logging.FileHandler('D:/py3code/jintong_day1/aaa/cnstock.log', encoding='utf8')  # write records to a log file
    ch = logging.StreamHandler()  # echo records to the console (terminal)

    # Formatter: timestamp - logger name - level - module: message
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -%(module)s:  %(message)s'
                                  , datefmt='%Y-%m-%d %H:%M:%S %p', )
    fh.setLevel(logging.DEBUG)

    # Bind the same format to both handlers.
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    # Register the handlers and set the logger threshold (20 == INFO).
    logger.addHandler(fh)  # a logger may carry several fh/ch handlers
    logger.addHandler(ch)
    logger.setLevel(20)


def save_db(details):
    """Insert one proxy record into the `ip_proxy` table if it is new.

    Args:
        details: a list whose first element is a 7-item list:
            [ip_address, port, server_address, whether_anonymous,
             type_, live_time, proof_time]

    The record is skipped when a row with the same ip_address already
    exists. All SQL now uses parameterized placeholders, which closes
    the SQL-injection hole of the original `.format()` / `%`
    interpolation and makes the manual `escape_string` call
    unnecessary. The cursor and connection are always released via
    try/finally (the original leaked both when an exception escaped).
    """
    connect = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='',
        db='haha',
        charset='utf8'
    )
    cursor = connect.cursor()
    try:
        row = details[0]

        # Dedup check: does this ip_address already exist?
        cursor.execute(
            "select ip_address from ip_proxy WHERE ip_address=%s",
            (row[0],)
        )
        if cursor.fetchall():
            logger.info('已有数据')
            return

        cursor.execute(
            """
            INSERT INTO ip_proxy (ip_address,port,server_address,whether_anonymous,type_,live_time,proof_time)
            VALUES (%s,%s,%s,%s,%s,%s,%s)
            """,
            tuple(row)
        )
        connect.commit()
        logger.info('成功插入cnstock 1 条数据')
    except OperationalError:
        # Preserve the original best-effort behavior (a transient MySQL
        # error must not abort the crawl), but record it instead of
        # silently swallowing it.
        logger.exception('MySQL OperationalError while saving a proxy record')
    finally:
        cursor.close()
        connect.close()


def main():
    """Crawl xicidaili free-proxy pages forever, persisting rows to MySQL.

    Each outer cycle starts a fresh headless Chrome, walks result pages
    1-49, extracts every proxy row, stores it via save_db(), prunes
    stale rows via filter_db(), then sleeps 5 minutes.

    Fixes vs. the original:
    - logging is configured once, not once per page (handler duplication);
    - the inner row index no longer shadows the outer page index `i`;
    - `type` no longer shadows the builtin (renamed `type_`);
    - `driver.quit()` replaces `close()` so the chromedriver process is
      actually released each cycle instead of leaking.
    """
    save_log()  # configure logging exactly once
    while True:
        browser = webdriver.Chrome(chrome_options=chrome_options)

        for page in range(1, 50):
            url = 'http://www.xicidaili.com/nn/{}'.format(page)
            try:
                browser.get(url)
            except WebDriverException as e:
                print(e)

            rows = browser.find_elements_by_xpath('//*[@id="ip_list"]/tbody/tr')
            print(len(rows))

            # tr[1] is the table header, so data rows start at index 2.
            for row_idx in range(2, len(rows) + 1):
                base = '//*[@id="ip_list"]/tbody/tr[' + str(row_idx) + ']'
                ip_address = browser.find_element_by_xpath(base + '/td[2]').text
                port = browser.find_element_by_xpath(base + '/td[3]').text
                server_address = browser.find_element_by_xpath(base + '/td[4]').text
                whether_anonymous = browser.find_element_by_xpath(base + '/td[5]').text
                type_ = browser.find_element_by_xpath(base + '/td[6]').text
                live_time = browser.find_element_by_xpath(base + '/td[9]').text
                proof_time = browser.find_element_by_xpath(base + '/td[10]').text

                detail = [[ip_address, port, server_address, whether_anonymous,
                           type_, live_time, proof_time]]
                try:
                    save_db(detail)
                except OperationalError as e:
                    print(e)
                time.sleep(2)  # throttle: be polite to the target site

            try:
                filter_db()
            except OperationalError as e:
                print(e)

        browser.quit()  # quit(), not close(): terminate the chromedriver process
        time.sleep(300)  # wait 5 minutes before re-crawling

if __name__ == '__main__':
    main()

 

 

#!/usr/bin/env python
# coding:utf8
# author:Z time:2018/8/15


import pymysql
import re

#删除大于6分钟存活时间的ip
def filter_db():
    """Delete proxies whose reported live time is not under 6 minutes.

    `live_time` is stored as digits followed by a unit string, e.g.
    '3分钟' (minutes) or '2天' (days). A row is kept only when the unit
    is '分钟' and the number is < 6; every other row is deleted.

    Fixes vs. the original: raw strings for the regex patterns, a
    parameterized DELETE instead of `.format()` interpolation, and
    try/finally so the cursor/connection are released even on error.

    NOTE(review): the DELETE matches on the exact live_time string, so
    all rows sharing that value are removed together — presumably
    intended, but verify against the table's contents.
    """
    connect = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='',
        db='haha',
        charset='utf8'
    )
    cursor = connect.cursor()
    try:
        cursor.execute("select live_time from ip_proxy")
        for (value,) in cursor.fetchall():
            num = re.findall(r'(\d+)\D+', value)[0]   # leading digits
            unit = re.findall(r'\d+(\D+)', value)[0]  # trailing unit text

            if not (int(num) < 6 and unit == '分钟'):
                # Parameterized query: no string interpolation into SQL.
                cursor.execute("delete from ip_proxy where live_time=%s", (value,))
                connect.commit()
    finally:
        cursor.close()
        connect.close()

# Bug fix: run the cleanup only when executed as a script. The crawler
# imports filter_db from this module, and the original unguarded call
# triggered a full DB pass at import time.
if __name__ == '__main__':
    filter_db()

 

posted @ 2018-08-29 18:05  Operater  阅读(217)  评论(0)  编辑  收藏  举报