【山东艾思软件】分享一套完整的Python采集公众号文件代码
 分享一套完整的Python采集代码，带服务器代理；修改参数后可以把采集到的数据上传到指定接口，同时文章中的图片也会自动采集。
 
 
复制代码
 
#-*- coding: UTF-8 -*-
import datetime
import errno
import os
import pwd
import random
import socket
import stat
import time
import urllib
import urllib2

import requests
from bs4 import BeautifulSoup


serverUrl = "" #server URL (overwritten at the bottom of the script)
imgPath = "" #server-relative directory where downloaded images are stored

def chown(d):
    """Best-effort: hand ownership of path *d* to the 'www' user/group.

    Silently does nothing when the 'www' account does not exist, the
    path is missing, or the process lacks the privilege to chown —
    ownership fixing is an optimization for the web server, never a
    reason to abort the crawl.

    The original guarded only os.chown(); pwd.getpwnam('www') raised an
    uncaught KeyError on hosts without that account, and os.stat() raised
    on missing paths.  Both are now inside the try block with narrow
    exception types instead of a bare `except:`.
    """
    try:
        uid = pwd.getpwnam('www').pw_uid
        gid = pwd.getpwnam('www').pw_gid
        if os.stat(d).st_uid != uid:
            os.chown(d, uid, gid)
    except (KeyError, OSError):
        # KeyError: no 'www' user; OSError: path missing or not permitted.
        pass


def creatFileName(ext = "png"):
    """Return a collision-resistant file name: <ms-timestamp><5 random digits>.<ext>."""
    stamp = int(round(time.time() * 1000))
    suffix = random.randint(10000, 99999)
    return "{0}{1}.{2}".format(stamp, suffix, ext)

def creatPath(path):
    """Create directory *path* (and any parents) if it does not exist.

    Returns True when the directory was created (and chown'd to 'www'),
    False when it already existed.

    Uses EAFP — attempt the makedirs and catch EEXIST — instead of the
    original exists-then-create sequence, which had a race window where
    another process could create the directory in between and crash us.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Directory appeared (possibly concurrently): not an error.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            return False
        raise
    chown(path)
    return True

def go():
    """Crawl Sogou's WeChat front pages through a random proxy.

    For each front-page index it downloads the lead image, then hands the
    article link over to getContent() for full scraping and upload.

    Fixes over the original:
    - removed the leftover debug `print doc` + `exit()` pair, which made
      every statement after the first page fetch unreachable;
    - removed the unused user_agent_list (the header code using it was
      commented out);
    - bare `except: pass / continue` narrowed to the download errors
      urlretrieve actually raises.
    """
    ip_list = get_ip_list()
    proxies = get_random_ip(ip_list)
    print(proxies)
    proxy_handler = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_handler)

    # Index 3 is intentionally skipped (no matching local category).
    for i in [0, 1, 2, 4, 5, 6, 7, 8]:
        url = "http://weixin.sogou.com/pcindex/pc/pc_" + str(i) + "/pc_" + str(i) + ".html"
        response = opener.open(url, timeout=10)
        doc = response.read()
        response.close()

        soup = BeautifulSoup(''.join(doc), "html.parser")
        href = soup.find("a").get("href")
        src = soup.find("img").get("src")

        fileName = creatFileName("jpeg")
        try:
            urllib.urlretrieve(src, imgFullPath + fileName)
            chown(imgFullPath + fileName)
        except (IOError, OSError):
            # Best effort: skip articles whose cover image cannot be fetched.
            continue
        time.sleep(3)  # be polite to the source site
        getContent(href, i, imgPath + fileName)

def post(body=None):
    """POST one scraped article to the ingest endpoint.

    body -- dict of form fields, sent application/x-www-form-urlencoded.

    A 30-second timeout is added: the original had none, so a dead
    endpoint would hang the whole crawler forever.
    """
    url = "http://aisisoft.cn/?ArticleAdd/add"  # replace with your own server address

    headers = {"Content-type": "application/x-www-form-urlencoded"}

    response = requests.post(url, data=body, headers=headers, timeout=30)
    print(response.text)

def getContent(url, sogouClassId, src):
    """Fetch one WeChat article, localize its images, and POST it upstream.

    url          -- full article URL (mp.weixin.qq.com)
    sogouClassId -- index of the Sogou front page the link came from
    src          -- server-relative path of the already-downloaded cover image

    Fixes over the original:
    - the urlopen response is now closed (it leaked);
    - articleClassId gets a default for unmapped ids (the original if/elif
      chain left it undefined and raised NameError);
    - Python-3-incompatible `print x` replaced with print().
    """
    socket.setdefaulttimeout(100)

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    request.add_header('Referer', 'https://mp.weixin.qq.com/')
    response = urllib2.urlopen(request)
    doc = response.read()
    response.close()

    soup = BeautifulSoup(''.join(doc), "html.parser")

    title = soup.head.title

    # Localize every lazy-loaded image inside the article body.
    imgs = soup.find(id="js_content").findAll("img")

    for img in imgs:
        imgDataSrc = img.get('data-src')
        imgType = img.get('data-type')
        if not imgDataSrc:
            continue
        fileName = creatFileName(imgType)
        imgSavePath = imgFullPath + fileName
        # Retry the download up to 3 times on socket timeouts.
        count = 1
        while count <= 3:
            try:
                urllib.urlretrieve(imgDataSrc, imgSavePath)
                chown(imgSavePath)
                break
            except socket.timeout:
                err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                print(err_info)
                count += 1

        # Rewrite the tag so the page lazy-loads the image from our server.
        img['data-src'] = serverUrl + imgPath + fileName
        img['data-original'] = serverUrl + imgPath + fileName
        img['src'] = "/Application/User/Static/images/loading.gif" # loading placeholder
        img['class'] = "lazy"

    # Map the Sogou front-page index to the local article class id.
    class_map = {
        0: 10,  # funny
        1: 17,  # hot
        2: 15,  # health
        4: 16,  # gossip
        5: 14,  # tech
        6: 13,  # finance
        7: 12,  # cars
        8: 11,  # lifestyle
    }
    # Default to 17 ("hot") so an unexpected id no longer raises NameError.
    articleClassId = class_map.get(sogouClassId, 17)

    jsContent = soup.select("#activity-name, #js_content")
    jsContent = str(jsContent[0]) + str(jsContent[1])

    body = {
        "title" : title.getText(),
        "articleClassId" : articleClassId,
        "img" : src,
        "content" : jsContent,
        "attr[]" : 1,
        "click" : random.randint(140, 160)
    }

    post(body=body)

# 获取代理IP
def get_ip_list():
    """Scrape xicidaili.com's proxy table and return 'ip:port' strings."""
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, "html.parser")
    rows = soup.find_all('tr')
    result = []
    # rows[0] is the table header — skip it; columns 1 and 2 hold IP and port.
    for row in rows[1:]:
        cells = row.find_all('td')
        result.append(cells[1].text + ':' + cells[2].text)
    return result

# 随机得到一个代理IP
def get_random_ip(ip_list):
    """Pick one proxy at random and return it as a urllib2 proxies dict."""
    candidates = ['http://' + addr for addr in ip_list]
    return {'http': random.choice(candidates)}


serverUrl = "http://aisisoft.cn/" #server URL -- change to your own address

# BUG FIX: the original path started with "home/..." (relative); a web-root
# path must be absolute.  datetime.now() is also taken once so the two path
# components cannot disagree if the script starts right at midnight.
_now = datetime.datetime.now()
imgFullPath = "/home/wwwroot/aisisoft.cn/upload/auto/" + _now.strftime('%Y%m') + "/" + _now.strftime('%d') + "/"
imgPath = "upload/auto/" + _now.strftime('%Y%m') + "/" + _now.strftime('%d') + "/"
creatPath(imgFullPath)
go()

​
复制代码

 

山东艾思软件科技有限公司,是专业的App定制开发, 网站制作企业;为企业提供从软件开发整合运营的一整套方案。App定制开发包括: 办公ERP, OA, CMS, CRM类定制开发, 聊天类APP开发, 电商商城类

posted on   朋友圈自动点赞工具  阅读(14)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· Open-Sora 2.0 重磅开源!
· 周边上新:园子的第一款马克杯温暖上架
点击右上角即可分享
微信分享提示