Crawler Learning --- Scraping Assets from Butian

Python crawler

Simply put, Beautiful Soup is a third-party Python library that helps us parse web page data.

Before using this tool, we need to install it first. In cmd, install it with pip or easy_install.

After that, we also need to install lxml, a dependency used for parsing HTML:

pip install beautifulsoup4    # or: easy_install beautifulsoup4
pip install lxml
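
To quickly confirm that both libraries were installed correctly, a minimal sanity check is to parse a tiny snippet with the lxml parser:

from bs4 import BeautifulSoup

# parse a small HTML fragment with the lxml parser to confirm the setup works
soup = BeautifulSoup('<div id="content">hello</div>', 'lxml')
print(soup.find('div', id='content').text)  # prints: hello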

Hands-on practice

Get the data from the div with the specified id:


import requests
from bs4 import BeautifulSoup
 
if __name__ == '__main__':
    target = 'https://www.xsbiquge.com/15_15338/8549128.html'
    req = requests.get(url = target)
    req.encoding = 'utf-8'
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    texts = bs.find('div', id='content')
    print(texts)
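
If only the chapter text is wanted rather than the whole tag, get_text() strips the markup. A short variant of the same request (same URL and selector as above):

import requests
from bs4 import BeautifulSoup

target = 'https://www.xsbiquge.com/15_15338/8549128.html'
req = requests.get(url=target)
req.encoding = 'utf-8'
bs = BeautifulSoup(req.text, 'lxml')
content = bs.find('div', id='content')
# get_text() drops the tags and keeps only the visible text
if content is not None:
    print(content.get_text(strip=True))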

Scrape the Chinese names of all public-welfare SRC programs on Butian:

Handle the scraped JSON data

import time
import requests
import json
import jsonpath


def src(page1):
    target = 'https://www.butian.net/Reward/pub'
    # controls which page of results is requested
    data = {
        's': '1',
        'p': page1
    }
    req = requests.post(url=target, data=data)
    # the response body is JSON; parse it into a Python object
    unicodeStr = json.loads(req.text)
    # pull every company_name value out of the JSON
    companyName = jsonpath.jsonpath(unicodeStr, "$..company_name")
    print("Scraping page " + str(page1))
    with open(r'butian.txt', 'a+') as f:
        for i in companyName:
            f.write(i + '\n')


if __name__ == '__main__':
    for page in range(150, 190):
        try:
            src(page)
            time.sleep(0.2)
        except Exception as e:
            print("Error on page " + str(page) + ": " + str(e))

It turned out that taking the Chinese names and then trying to find the corresponding domains through Baidu or FOFA is too hard and the success rate is too low, so I switched to another approach.

Scrape almost all Butian target domains via the cid parameter:

import time
import requests
from bs4 import BeautifulSoup

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Sec-Fetch-Dest': 'document',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cookie': 'btuc_ba52447ea424004a7da412b344e5e41a=d8073109b92496b7d78efc46467bdd04ff25554c1ede609c4082521e96ee1f74'
              '; PHPSESSID=nh7d72cedtmun5vid0b2cotqo1; PHPSESSID=21d2b4ldhu7ducl2d29vf40qg6; '
              'wzws_cid=dcbc1afc22788d55232881662a4a413a61a783caabcaf0291e6e454fa304f60b28a86eb2edbddf63fbd8a5cbbc32d10ca98df05eedda82f61bb992fbfddfff0a8b79ad64694d8cd4744dbf8e1f8cbaa7;'
              ' __q__=1642517363830',
}


def gongyi(yeshu):
    target = 'https://www.butian.net/Loo/submit?cid=' + str(yeshu)
    req = requests.get(url=target, headers=headers, timeout=5)
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    texts = bs.find('div', id='tabs')  # grab the div whose id is "tabs"
    text2 = texts.form.div.ul.find_all('li')  # find_all returns every li tag under that ul
    i = 0

    for text3 in text2:
        i = i + 1
        if i == 3:
            test4 = text3.input.get('value')  # get() reads a tag attribute
            if not test4:
                print("Value is empty")
            else:
                print(test4)
                with open(r'yuming2.txt', 'a+') as f:
                    f.write(test4 + '\n')
            break

if __name__ == '__main__':
    breakFlag = 0  # tracks consecutive errors
    for page in range(30000, 65000):  # 1-7000 and 28000-28200 already scanned
        print("cid=" + str(page))
        try:
            gongyi(page)
            time.sleep(0.1)
            breakFlag = 0
        except Exception as e:
            print("Error: " + str(e))
            if breakFlag == 2:
                break
            breakFlag = breakFlag + 1
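
The chain texts.form.div.ul.find_all('li') walks down the parse tree tag by tag and then reads the value of the third li's input. Assuming the page keeps that structure, the same element can also be reached with a single CSS selector (a sketch with a hypothetical extract_domain helper, not the code actually used above):

from bs4 import BeautifulSoup

def extract_domain(html):
    # assumes the #tabs form keeps the structure described above:
    # form > div > ul > li (third one) > input whose value holds the domain
    bs = BeautifulSoup(html, 'lxml')
    node = bs.select_one('#tabs form div ul li:nth-of-type(3) input')
    if node is not None and node.get('value'):
        return node.get('value')
    return None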

Normalize to http/https and remove domains that are not alive

The scraped entries come in inconsistent formats, so prepend http/https uniformly and drop gov.cn sites — play it safe and don't stir up trouble.

For example, the entries obtained look like this:

www.baidu.com
http://www.baidu.com
test.gov.cn
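
Before adding threads, the normalization and the gov.cn filter on their own boil down to something like this (a minimal sketch with a hypothetical normalize helper):

def normalize(line):
    # drop gov.cn targets, strip any existing scheme and trailing slash
    host = line.strip()
    if 'gov.cn' in host:
        return None
    return host.replace('https://', '').replace('http://', '').rstrip('/')

for raw in ['www.baidu.com', 'http://www.baidu.com', 'test.gov.cn']:
    print(normalize(raw))  # www.baidu.com / www.baidu.com / None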

Added multi-threading

import queue
import sys
import threading
import time

import requests
from requests.packages import urllib3

urllib3.disable_warnings()

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Sec-Fetch-Dest': 'document',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cookie': 'xxx',  # fill in with your own session cookie
}


def check_ip(ip):

    if 'gov.cn' in ip:
        print("Filtering out gov.cn site: " + ip)
    else:
        ip = ip.replace('https://', '').replace('http://', '').replace('/', '').replace('\n', '')
        try:
            # try HTTPS first
            url = "https://" + ip
            print(url)
            ip_code = requests.get(url, headers=headers, verify=False, timeout=1).status_code
            print(ip_code)
            with open(ip_file_result, 'a+') as f:
                f.write(url + '\n')
        except Exception as e:
            try:
                # fall back to plain HTTP
                urls = "http://" + ip
                print(urls)
                ip_code = requests.get(urls, headers=headers, verify=False, timeout=2).status_code
                print(ip_code)
                with open(ip_file_result, 'a+') as f:
                    f.write(urls + '\n')
            except Exception as e:
                print("Error: " + str(e))
                time.sleep(0.2)

# worker: pull targets from the queue until it is empty
def gaoshi():
    while not q.empty():
        target = q.get()
        check_ip(target)


if __name__ == '__main__':

    ip_file = sys.argv[1]
    # ip_file = "55000.txt"
    ip_file_result = "Result_" + ip_file
    titles = list(set([x.strip() for x in open(ip_file).readlines()]))
    print('Total targets: {}'.format(len(titles)))

    thread_x = 100  # number of threads
    q = queue.Queue()
    for ip in open(ip_file):
        q.put(ip)  # queue the targets so the worker threads can pull from it
    for x in range(int(thread_x)):
        t = threading.Thread(target=gaoshi)
        t.start()
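
An alternative to the hand-rolled queue plus threading.Thread loop is concurrent.futures.ThreadPoolExecutor, which manages the workers itself. A sketch that reuses check_ip and titles from the script above:

from concurrent.futures import ThreadPoolExecutor

# the pool spreads the targets across 100 worker threads and, on leaving
# the with-block, waits until every check has finished
with ThreadPoolExecutor(max_workers=100) as pool:
    pool.map(check_ip, titles)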
