Sharing a complete Python scraping script with proxy-server support. After changing a few parameters it uploads the collected data to an endpoint of your choice, and the images inside each article are downloaded automatically as well.
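Each article is pushed to the server as an ordinary form-encoded POST. What receives it is up to you; purely as an illustration, here is a minimal sketch of an endpoint that would accept the fields this collector sends. Flask, the route, and the port are assumptions made for the sketch, not part of the original project, which posts to a PHP-style URL:

    # Hypothetical receiver sketch. Flask, the route, and the port are assumptions
    # for illustration; the original project posts to a PHP-style endpoint instead.
    from flask import Flask, request

    app = Flask(__name__)

    @app.route("/ArticleAdd/add", methods=["POST"])
    def article_add():
        title = request.form.get("title")                      # article title
        article_class_id = request.form.get("articleClassId")  # category ID on the target site
        cover = request.form.get("img")                        # cover image path on this server
        content = request.form.get("content")                  # article HTML with rewritten <img> tags
        attrs = request.form.getlist("attr[]")                 # article attribute flags
        click = request.form.get("click")                      # randomized initial view count
        # persist the article to your database here
        return "ok"

    if __name__ == "__main__":
        app.run(port=8080)

With that contract in mind, here is the full collector script: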
#-*- coding: UTF-8 -*-
import urllib, urllib2
from bs4 import BeautifulSoup
import socket
import requests
import datetime, time
import random
import os, stat, pwd

serverUrl = ""  # server URL
imgPath = ""    # image save path

# hand a saved file or directory over to the web server user
def chown(d):
    uid = pwd.getpwnam('www').pw_uid
    gid = pwd.getpwnam('www').pw_gid
    dstat = os.stat(d)
    if dstat.st_uid != uid:
        try:
            os.chown(d, uid, gid)
        except:
            pass

# unique file name: millisecond timestamp plus a random suffix
def creatFileName(ext="png"):
    return str(int(round(time.time() * 1000))) + str(random.randint(10000, 99999)) + "." + str(ext)

def creatPath(path):
    if not os.path.exists(path):
        # create the directory if it does not exist yet
        os.makedirs(path)
        chown(path)
        return True
    else:
        # the directory already exists, nothing to do
        return False

def go():
    ip_list = get_ip_list()
    proxies = get_random_ip(ip_list)
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    ]
    print proxies
    proxy_handler = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_handler)
    # send a random browser User-Agent with every request through this opener
    opener.addheaders = [('User-Agent', random.choice(user_agent_list))]
    for i in [0, 1, 2, 4, 5, 6, 7, 8]:  # Sogou WeChat channel indexes (3 is skipped)
        url = "http://weixin.sogou.com/pcindex/pc/pc_" + str(i) + "/pc_" + str(i) + ".html"
        response = opener.open(url, timeout=10)
        doc = response.read()
        response.close()
        soup = BeautifulSoup(doc, "html.parser")
        # the first link and image on the channel page belong to the featured article
        a = soup.find("a")
        href = a.get("href")
        img = soup.find("img")
        src = img.get("src")
        try:
            fileName = creatFileName("jpeg")
            urllib.urlretrieve(src, imgFullPath + fileName)
            chown(imgFullPath + fileName)
        except:
            continue  # skip this channel if the cover image cannot be saved
        time.sleep(3)
        getContent(href, i, imgPath + fileName)

def post(body=None):
    url = "http://aisisoft.cn/?ArticleAdd/add"  # replace with your own server address
    headers = {"Content-type": "application/x-www-form-urlencoded"}
    response = requests.post(url, data=body, headers=headers)
    print response.text

def getContent(url, sogouClassId, src):
    socket.setdefaulttimeout(100)
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    request.add_header('Referer', 'https://mp.weixin.qq.com/')
    response = urllib2.urlopen(request)
    doc = response.read()
    soup = BeautifulSoup(doc, "html.parser")
    title = soup.head.title
    imgs = soup.find(id="js_content").findAll("img")
    for img in imgs:
        imgDataSrc = img.get('data-src')
        imgType = img.get('data-type') or "jpeg"  # fall back when data-type is missing
        if imgDataSrc:
            fileName = creatFileName(imgType)
            imgSavePath = imgFullPath + fileName
            count = 1
            while count <= 3:  # retry each image up to three times on timeout
                try:
                    urllib.urlretrieve(imgDataSrc, imgSavePath)
                    chown(imgSavePath)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            # rewrite the tag so it points at the copy on our own server
            img['data-src'] = serverUrl + imgPath + fileName
            img['data-original'] = serverUrl + imgPath + fileName
            img['src'] = "/Application/User/Static/images/loading.gif"  # lazy-load placeholder image
            img['class'] = "lazy"
            #time.sleep(1)
    # build the POST body: map the Sogou channel index to the target site's category ID
    classMap = {
        0: 10,  # funny
        1: 17,  # hot
        2: 15,  # health
        4: 16,  # gossip
        5: 14,  # tech
        6: 13,  # finance
        7: 12,  # cars
        8: 11,  # lifestyle
    }
    articleClassId = classMap[sogouClassId]
    jsContent = soup.select("#activity-name, #js_content")
    jsContent = str(jsContent[0]) + str(jsContent[1])
    body = {
        "title": title.getText(),
        "articleClassId": articleClassId,
        "img": src,
        "content": jsContent,
        "attr[]": 1,
        "click": random.randint(140, 160)
    }
    post(body=body)

# scrape a list of free proxy IPs
def get_ip_list():
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, "html.parser")
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table header row
        tds = ips[i].find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

# pick one proxy at random from the list
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

serverUrl = "http://aisisoft.cn/"  # server URL: change to your own
imgFullPath = "/home/wwwroot/aisisoft.cn/upload/auto/" + datetime.datetime.now().strftime('%Y%m') + "/" + datetime.datetime.now().strftime('%d') + "/"
imgPath = "upload/auto/" + datetime.datetime.now().strftime('%Y%m') + "/" + datetime.datetime.now().strftime('%d') + "/"
creatPath(imgFullPath)
go()
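Note that the script above targets Python 2 (urllib2, urllib.urlretrieve, print statements), and the free proxy list it scrapes, xicidaili.com, is no longer reliably available, so you would have to substitute your own proxy source. For reference, here is a minimal Python 3 sketch of the same fetch-through-proxy, image-download, and upload flow using requests; the proxy address, endpoint, and article URL below are placeholders, not values from the original project:

    # Python 3 sketch of the same flow, assuming the requests library is installed.
    # PROXIES, UPLOAD_URL, and the article URL are placeholders -- use your own.
    import random
    import time

    import requests
    from bs4 import BeautifulSoup

    PROXIES = {"http": "http://127.0.0.1:8080"}       # placeholder proxy
    UPLOAD_URL = "http://example.com/ArticleAdd/add"  # placeholder endpoint

    def fetch(url):
        # fetch a page through the proxy with a browser User-Agent
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
        resp = requests.get(url, headers=headers, proxies=PROXIES, timeout=10)
        resp.raise_for_status()
        return resp.text

    def download_image(src, save_path, retries=3):
        # download an image, retrying on failure like the original script
        for attempt in range(1, retries + 1):
            try:
                resp = requests.get(src, proxies=PROXIES, timeout=10)
                resp.raise_for_status()
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                return True
            except requests.RequestException:
                print("Retry %d for %s" % (attempt, src))
                time.sleep(1)
        return False

    def upload(title, class_id, cover, content):
        # form-encoded POST with the same field names as the Python 2 script
        body = {
            "title": title,
            "articleClassId": class_id,
            "img": cover,
            "content": content,
            "attr[]": 1,
            "click": random.randint(140, 160),
        }
        resp = requests.post(UPLOAD_URL, data=body)
        print(resp.text)

    if __name__ == "__main__":
        html = fetch("https://mp.weixin.qq.com/s/XXXXXXXX")  # placeholder article URL
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title.get_text())

The retry loop mirrors the original's three-attempt image download, and the POST body keeps the same field names, so an endpoint written for the Python 2 script would accept it unchanged.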
Shandong Aisi Software Technology Co., Ltd. is a professional custom app development and website building company, providing businesses with a complete package from software development through integrated operations. Custom app development covers office ERP, OA, CMS, and CRM systems, chat apps, and e-commerce mall apps.