Scraping free Xicidaili proxies with Python
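A small script that walks the free proxy lists on xicidaili.com, pulls the IP, port, and type out of each table row, and appends them to a text file. A hedged sketch of how the collected proxies could be used follows the script.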

#!/usr/local/bin/python3.7

"""
@File    :   xicidaili.py
@Time    :   2020/06/02
@Author  :   Mozili

"""

import time
import urllib.request

from lxml import etree

def handler_request(url):
    # Request headers: a browser User-Agent, since the site tends to
    # reject urllib's default one
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # Build and send the request
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    # Read and decode the response body
    cot = res.read().decode()
    return cot

def preserve_data(ips, ports, types):
    # ips, ports, and types are parallel lists (one entry per table row),
    # so zip pairs them up and we write one 'TYPE ip:port' line per row
    with open('Reptile/daili.txt', 'a', encoding='utf-8') as fp:
        for ip, port, proxy_type in zip(ips, ports, types):
            fp.write(proxy_type + ' ' + ip + ':' + port + '\n')
        
def download_content(tree):
    # Note: only table rows with class='odd' are matched here
    # Extract the IPs
    ips = tree.xpath("//tr[@class='odd']/td[2]/text()")
    # Extract the ports
    ports = tree.xpath("//tr[@class='odd']/td[3]/text()")
    # Extract the proxy types (HTTP/HTTPS)
    types = tree.xpath("//tr[@class='odd']/td[6]/text()")
    # Save the data to a txt file
    preserve_data(ips, ports, types)
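
Because the rows in the xicidaili listing alternate between class='odd' and no class, the XPath above only picks up every other entry. A sketch of a variant that selects rows by position instead (the function name is illustrative, and it assumes the listing table's id is ip_list, which should be checked against the live page):

def download_content_all_rows(tree):
    # Skip the header row, then take every data row regardless of class;
    # 'ip_list' is the assumed id of the proxy table
    rows = "//table[@id='ip_list']//tr[position()>1]"
    ips = tree.xpath(rows + "/td[2]/text()")
    ports = tree.xpath(rows + "/td[3]/text()")
    types = tree.xpath(rows + "/td[6]/text()")
    preserve_data(ips, ports, types)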


if __name__ == "__main__":
    # Page range to crawl
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    # The site's five free proxy lists
    url_list = [
        'https://www.xicidaili.com/nn/',
        'https://www.xicidaili.com/nt/',
        'https://www.xicidaili.com/wn/',
        'https://www.xicidaili.com/wt/',
        'https://www.xicidaili.com/qq/'
        ]
    for url in url_list:
        for page in range(start_page, end_page + 1):
            new_url = url + str(page)
            # Fetch the page
            content = handler_request(new_url)
            # Pause between requests; the site rate-limits aggressive crawlers
            time.sleep(1)
            # Parse the HTML into an element tree
            tree = etree.HTML(content)
            # Extract and save the proxies
            download_content(tree)
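
Once daili.txt has some entries, they can be plugged into urllib. A minimal sketch, assuming the 'TYPE ip:port' line format written by preserve_data above (free proxies die quickly, so a failure here is the normal case, not an error in the script):

import random
import urllib.request

# Load the 'TYPE ip:port' lines written by preserve_data
with open('Reptile/daili.txt', encoding='utf-8') as fp:
    proxies = [line.split() for line in fp if line.strip()]

proxy_type, address = random.choice(proxies)
# Route one request through the chosen proxy; httpbin.org/ip echoes the caller's IP
handler = urllib.request.ProxyHandler({proxy_type.lower(): 'http://' + address})
opener = urllib.request.build_opener(handler)
try:
    print(opener.open('http://httpbin.org/ip', timeout=5).read().decode())
except Exception as exc:
    print('proxy failed:', exc)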
    

 
