Scraping QQ Music's Top 300 Hot Songs with an IP Proxy Pool

This program scrapes free proxy IPs from the web and saves them to a file, then verifies each one by using it to fetch the Baidu homepage. A local API built with Flask serves proxies from this pool, and the proxies it returns are used to scrape the song information for the top 300 tracks on the QQ Music hot-song chart and save it to a file.

Fetch proxy IPs from Xici Daili

import requests
from requests.exceptions import RequestException
import re

def get_page(url, headers):
    """
    Fetch the page
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def html_parser(html):
    """
    Parse the page
    """
    # re.S makes "." also match newlines; group 1 captures the full IP,
    # group 6 (index 5) captures the port
    pattern = re.compile(r".*?<td>((\d+)\.(\d+)\.(\d+)\.(\d+)).*?<td>(\d+)</td>", re.S)
    ip_info = re.findall(pattern, html)
    return ip_info

def save(ip_info):
    """
    Save to file
    """
    with open("ip_pool", "w", encoding="utf-8") as f:
        for info in ip_info:
            # build the "ip:port" form
            ip = info[0] + ":" + info[5]
            f.write(ip + "\n")

def main():
    url = "http://www.xicidaili.com"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
               "Chrome/66.0.3359.181 Safari/537.36"}
    html = get_page(url, headers)
    if html is None:
        print("failed to fetch the proxy list page")
        return
    ip_info = html_parser(html)
    save(ip_info)

if __name__ == "__main__":
    main()
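
A quick way to sanity-check the regex is to run html_parser on a made-up table row appended to the script above (the snippet below is a minimal sketch, not real Xici markup); each match is a tuple whose first element is the full IP and whose last element is the port:

sample = """
<tr>
    <td><img src="cn.png"></td>
    <td>110.52.235.61</td>
    <td>9999</td>
</tr>
"""
# each match is a tuple: (full IP, the 4 octets, port)
print(html_parser(sample))
# [('110.52.235.61', '110', '52', '235', '61', '9999')]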

Check whether the IPs are usable

import requests
from requests.exceptions import RequestException
from multiprocessing import Pool

def check(url, ip):
    """
    Check whether a proxy works
    """
    ip = ip.strip()
    proxies = {
        'http': 'http://' + ip
    }
    try:
        # proxies must be passed as a keyword argument, otherwise
        # the dict is interpreted as query parameters
        response = requests.get(url, proxies=proxies, timeout=5)
        if response.status_code == 200:
            # fetching the Baidu homepage succeeded
            print("%s is useful %s" % (ip, response.status_code))
        else:
            # a non-200 response means this proxy is unusable
            print("---------!!!----------\n")
            print("%s is no use" % ip)
            print("---------!!!----------\n")
    except RequestException:
        # a connection error also means the proxy is unusable;
        # report it instead of killing the worker process
        print("%s error" % ip)

def main():
    """
    Main function
    """
    url = "http://www.baidu.com"
    # start 5 worker processes
    pool = Pool(5)
    with open("ip_pool", "r", encoding="utf-8") as f:
        for line in f:
            # check each proxy asynchronously
            pool.apply_async(check, args=(url, line))
    pool.close()
    # wait for the workers to finish
    pool.join()

if __name__ == "__main__":
    main()
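
The checker above only reports results to the console. If you also want to shrink ip_pool down to the proxies that actually responded, one option (a sketch along the same lines, not part of the original scripts) is to collect the results with pool.map and rewrite the file:

import requests
from requests.exceptions import RequestException
from multiprocessing import Pool

def is_alive(ip):
    """Return the proxy if Baidu answers through it, else None."""
    proxies = {'http': 'http://' + ip}
    try:
        response = requests.get("http://www.baidu.com", proxies=proxies, timeout=5)
        return ip if response.status_code == 200 else None
    except RequestException:
        return None

def filter_pool():
    with open("ip_pool", "r", encoding="utf-8") as f:
        candidates = [line.strip() for line in f if line.strip()]
    with Pool(5) as pool:
        # pool.map blocks until every proxy has been tested
        alive = [ip for ip in pool.map(is_alive, candidates) if ip]
    # overwrite the pool with only the proxies that passed
    with open("ip_pool", "w", encoding="utf-8") as f:
        f.write("\n".join(alive) + "\n")

if __name__ == "__main__":
    filter_pool()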

Build a local API with Flask to serve the IPs

from flask import Flask

app = Flask(__name__)

# offset of the next unread line in ip_pool
line_len = 0

@app.route('/')
def index():
    """
    Return a single proxy IP
    """
    global line_len
    with open("ip_pool", "r", encoding="utf-8") as f:
        # resume where the previous request left off, so each
        # visit to this page returns the next IP in the pool
        f.seek(line_len)
        line = f.readline()
        if not line:
            # pool exhausted: wrap around to the first proxy
            f.seek(0)
            line_len = 0
            line = f.readline()
        line_len += len(line)
        # strip the newline so callers get a bare ip:port
        return line.strip()

@app.route('/getall')
def get_all():
    """
    Return all proxy IPs
    """
    all_ips = []
    with open("ip_pool", "r", encoding="utf-8") as f:
        for line in f:
            all_ips.append(line.strip())
    return str(all_ips)

if __name__ == "__main__":
    app.run()
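
With the app running on Flask's default http://127.0.0.1:5000, each request to / advances through the pool, and /getall dumps everything, e.g.:

import requests

# successive requests to / walk through ip_pool one proxy at a time
for _ in range(3):
    print(requests.get("http://127.0.0.1:5000/").text)

# /getall returns the whole pool as a stringified list
print(requests.get("http://127.0.0.1:5000/getall").text)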

Scrape the top 300 songs on the QQ Music hot-song chart using IPs from the local API

import requests
from urllib.parse import urlencode
import json
from requests.exceptions import RequestException

def get_proxies():
    """Fetch a proxy IP from the pool"""
    url = "http://127.0.0.1:5000"
    response = requests.get(url)
    if response.status_code == 200:
        if response.text:
            return response.text
        else:
            # empty response: ask again for the next proxy
            return get_proxies()
    else:
        print("ip pool error")
        exit(-1)

def get_one_page(url, headers, num):
    """
    Fetch the page
    """
    proxy = get_proxies()
    # route the request through a proxy from the pool
    proxies = {
        'http': 'http://' + proxy
    }
    data = {
        "tpl": "3",
        "page":"detail",
        "date":"2018_27",
        "topid":"26",
        "type":"top",
        "song_begin":"0",
        "song_num":"%s"%num,
        "g_tk":"5381",
        "jsonpCallback":"MusicJsonCallbacktoplist",
        "loginUin":"0",
        "hostUin":"0",
        "format":"jsonp",
        "inCharset":"utf8",
        "outCharset":"utf-8",
        "notice":"0",
        "platform":"yqq",
        "needNewCode":"0",
    }
    url = url + urlencode(data)
    try:
        # fetch through the proxy
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def html_parser(html, song_url):
    """
    Parse the page
    """
    # the payload is JSON; load it into a dict
    content = json.loads(html)
    for item in content["songlist"]:
        # pull each field out of the nested dicts
        rank = item["cur_count"]
        songname = item["data"]["songname"]
        singer = item["data"]["singer"][0]["name"]
        # join the album id onto the album base URL to form the link
        songurl = song_url + item["data"]["albummid"] + ".html"
        # yield one song at a time (generator)
        yield (rank, songname, singer, songurl)

def write2file(songlist):
    """
    Save to file
    """
    with open("qqmusic_pop", "w", encoding="utf-8") as f:
        for song in songlist:
            f.write(str(song) + "\n")

def main():
    # the song links live under a different URL than the chart request;
    # the album id is appended to this base to form the full link
    song_url = "https://y.qq.com/n/yqq/album/"
    # base URL of the Ajax request; the encoded query string is appended to it
    url = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
               "Chrome/66.0.3359.181 Safari/537.36"}
    # request 300 songs
    html = get_one_page(url, headers, 300)
    if html is None:
        print("failed to fetch the chart")
        return
    # strip the JSONP wrapper "MusicJsonCallbacktoplist(...)" so that
    # only the JSON payload remains
    html = html[html.find("(") + 1:html.rfind(")")]
    songlist = html_parser(html, song_url)
    write2file(songlist)

if __name__ == "__main__":
    main()
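
Each line of qqmusic_pop is the repr of a (rank, songname, singer, songurl) tuple, so the file can be read back with ast.literal_eval, e.g. to print the chart:

import ast

with open("qqmusic_pop", "r", encoding="utf-8") as f:
    for line in f:
        # each line holds one (rank, songname, singer, songurl) tuple
        rank, songname, singer, songurl = ast.literal_eval(line)
        print(rank, songname, "-", singer)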
