使用docker部署爬虫项目
将python爬虫部署到docker环境中:
一、本地环境准备
- main.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   main.py
@Time    :   2022/02/10 10:21:43
@Author  :   Shydow
@Version :   1.0
@Desc    :   None
'''

# here put the import lib
import datetime
import logging
import time
from multiprocessing import Process

import requests
import schedule

from service import launcher

logging.basicConfig(
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    level=logging.INFO,
)


def run_one(exec_date):
    """Run demo task one: print the execution date and a millisecond timestamp.

    Args:
        exec_date: Value printed as the execution date (``daily_run`` passes
            an ISO-formatted date string).
    """
    print("task one start .. ")
    print(exec_date)
    timestamp = int(time.time() * 1000)  # current time in milliseconds
    print(timestamp)
    time.sleep(10)  # stand-in for the real crawling work
    print("tsak one end .. ")


def run_two(exec_date):
    """Run demo task two: print the execution date and a millisecond timestamp.

    Args:
        exec_date: Value printed as the execution date (``daily_run`` passes
            an ISO-formatted date string).
    """
    print("task two start .. ")
    print(exec_date)
    timestamp = int(time.time() * 1000)  # current time in milliseconds
    print(timestamp)
    time.sleep(10)  # stand-in for the real crawling work
    print("task two end .. ")


def daily_run():
    """Start both tasks in separate processes, passing today's date.

    The processes are started and not joined, so this function returns
    immediately and the scheduler loop is not blocked while the tasks run.
    """
    # The module imports ``datetime`` (not ``date``), so the class must be
    # reached through the module; a bare ``date.today()`` raises NameError.
    exec_date = datetime.date.today().isoformat()
    p1 = Process(target=run_one, args=(exec_date, ))
    p1.start()
    p2 = Process(target=run_two, args=(exec_date, ))
    p2.start()


if __name__ == '__main__':
    # Run the job at minute 5 of every hour.
    schedule.every().hour.at(":05").do(daily_run)
    while True:
        schedule.run_pending()
        time.sleep(1)
- service.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   service.py
@Time    :   2022/02/10 10:23:33
@Author  :   Shydow
@Version :   1.0
@Desc    :   None
'''

# here put the import lib


def launcher(current_time):
    """Print ``current_time`` to standard output.

    Args:
        current_time: The value to print; it is passed to ``print`` unchanged.
    """
    print(current_time)
- requirements.txt
urllib3
DingDingBot
requests
PySocks==1.7.1
clickhouse-driver==0.2.0
pandas==0.25.1
numpy==1.16.5
schedule==1.1.0
hdfs==2.6.0
二、Dockerfile
# author: Shydow
# date  : 2022-02-10
# desc  : spider test dockerfile
FROM python:3.7.3

# Set the working directory inside the container to /spider_deploy
WORKDIR /spider_deploy

# Copy only the dependency list first, so the (slow) install layer below is
# reused from the build cache as long as requirements.txt is unchanged.
COPY requirements.txt ./

# Install the Python dependencies the project needs, then download and unpack
# Node.js into /opt
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt && \
    wget https://nodejs.org/dist/v10.16.0/node-v10.16.0-linux-x64.tar.xz && \
    tar xf node-v10.16.0-linux-x64.tar.xz -C /opt/ && \
    rm -rf node-v10.16.0-linux-x64.tar.xz

# Add Node.js to PATH
ENV PATH=$PATH:/opt/node-v10.16.0-linux-x64/bin

# Set the container time zone to Asia/Shanghai.
# `ln -snf` replaces /etc/localtime itself; a plain `cp` would write through
# the link into the zoneinfo database if /etc/localtime is a symlink.
RUN ln -snf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo 'Asia/Shanghai' >/etc/timezone

# Copy the files in the current directory into /spider_deploy in the container
# (COPY rather than ADD: this is a plain local copy, none of ADD's archive/URL
# handling is wanted).
COPY ./ /spider_deploy

# Command executed when the container is started with `docker run`.
# -u keeps stdout/stderr unbuffered so print() output reaches `docker logs`
# without waiting for a buffer to fill.
CMD ["python", "-u", "main.py"]
三、构建镜像启动
# Pick a target directory, upload all of the scripts into it, then change into it
cd /app/spider/spider_deploy

# Build the image; note the trailing "." (the build context is the current directory)
sudo docker build -t spider_test .

# Start a detached container from the image, adding host entries for cdh01-cdh04
sudo docker run -d --name spider \
    --add-host cdh01:172.23.255.11 \
    --add-host cdh02:172.23.255.12 \
    --add-host cdh03:172.23.255.13 \
    --add-host cdh04:172.23.255.14 \
    spider_test