爬虫学习笔记:打造自己的代理池

# -*- coding: utf-8 -*-
"""
Created on Sat Dec 18 00:00:59 2021
@author: Hider
"""
import requests
import parsel
import time
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36', 'Connection': 'close'
}

def get_page(page, timeout=10):
    """Download one page of the kuaidaili free-proxy listing and parse it.

    The page number is appended to the listing URL, the response body is
    wrapped in a ``parsel.Selector`` and handed to ``parse_page``.

    Args:
        page: Page number appended to the listing URL (converted with ``str``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(page)
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error
    # page, which would just yield zero rows with no indication of why.
    response.raise_for_status()
    html = parsel.Selector(response.text)
    parse_page(html)

def parse_page(html):
    """Collect every proxy row of a listing page into the global ``parse_lists``.

    For each ``<tr>`` of the proxy table, the first text node of each of the
    seven ``data-title`` cells is read (in the order IP, PORT, anonymity,
    type, location, response speed, last verified time) and the resulting
    7-item list is appended to the module-level ``parse_lists``.

    Args:
        html: Selector-like object exposing ``.xpath()``; ``get_page`` passes
            a ``parsel.Selector`` built from the response text.
    """
    column_titles = ('IP', 'PORT', '匿名度', '类型', '位置', '响应速度', '最后验证时间')
    rows = html.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
    for row in rows:
        record = [
            row.xpath(f'./td[@data-title="{title}"]//text()').extract_first()
            for title in column_titles
        ]
        parse_lists.append(record)
        # Short pause after every row; in practice this is the only delay
        # between consecutive page requests in the script.
        time.sleep(0.1)
    
# Shared accumulator that parse_page() appends to. It lives at module level
# (outside the __main__ guard) so that importing this file does not leave
# parse_page() referring to an undefined name.
parse_lists = []

if __name__ == '__main__':
    # Crawl listing pages 1 through 20; each call appends its rows to parse_lists.
    for page in range(1, 21):
        get_page(page)

    # Build the result table only after crawling. Previously this line sat
    # outside the guard and raised NameError whenever the module was imported.
    df = pd.DataFrame(parse_lists, columns=['IP','PORT','匿名度','类型','位置','响应速度','最后验证时间'])


posted @ Hider1214  阅读(58)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)
点击右上角即可分享
微信分享提示