使用docker部署爬虫项目

将python爬虫部署到docker环境中:

一、本地环境准备

  • main.py
    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    '''
    @File    :   main.py
    @Time    :   2022/02/10 10:21:43
    @Author  :   Shydow
    @Version :   1.0
    @Desc    :   None
    '''
    
    # here put the import lib
    import requests
    from multiprocessing import Process
    import time
    import datetime
    import schedule
    import logging
    from service import launcher
    
    logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s', level=logging.INFO)
    
    def run_one(exec_date):
        """Demo task one: print start/end markers around a 10-second pause.

        Args:
            exec_date: Value printed after the start marker (daily_run passes
                an ISO date string).
        """
        print("task one start .. ")
        print(exec_date)
        # Current wall-clock time as integer milliseconds since the epoch.
        timestamp = int(time.time() * 1000)
        print(timestamp)
        time.sleep(10)
        # Fixed the "tsak" typo so the end marker matches the start marker.
        print("task one end .. ")
    
    def run_two(exec_date):
        """Demo task two: print start/end markers around a 10-second pause.

        Args:
            exec_date: Value printed right after the start marker.
        """
        for line in ("task two start .. ", exec_date):
            print(line)
        # Epoch time in whole milliseconds.
        now_ms = int(time.time() * 1000)
        print(now_ms)
        time.sleep(10)
        print("task two end .. ")
    
    def daily_run():
        """Start run_one and run_two in separate processes for today's date.

        The processes are started and not joined, so this function returns
        immediately and does not block the caller while the tasks run.
        """
        # The module does `import datetime`, so `date` must be reached through
        # it; the bare name `date` raised NameError.
        exec_date = datetime.date.today().isoformat()
        for task in (run_one, run_two):
            Process(target=task, args=(exec_date,)).start()
    
    if __name__ == '__main__':
        # Run daily_run at minute 05 of every hour.
        hourly_job = schedule.every().hour.at(":05")
        hourly_job.do(daily_run)

        # Poll the scheduler once per second, forever.
        while True:
            schedule.run_pending()
            time.sleep(1)

     

  • service.py

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    '''
    @File    :   service.py
    @Time    :   2022/02/10 10:23:33
    @Author  :   Shydow
    @Version :   1.0
    @Desc    :   None
    '''
    
    # here put the import lib
    def launcher(current_time):
        """Print ``current_time`` to standard output."""
        print(str(current_time))

     

  • requirements.txt

    urllib3
    DingDingBot
    requests
    PySocks==1.7.1
    clickhouse-driver==0.2.0
    pandas==0.25.1
    numpy==1.16.5
    schedule==1.1.0
    hdfs==2.6.0

     

二、Dockerfile

# author: Shydow
# date  : 2022-02-10
# desc  : spider test dockerfile

FROM python:3.7.3

# Copy everything in the build context into /spider_deploy in the image.
# COPY is used instead of ADD: only a plain local copy is needed here.
COPY ./ /spider_deploy

# Use /spider_deploy as the working directory inside the container
WORKDIR /spider_deploy

# Install the Python dependencies, then download Node.js and unpack it into /opt
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt && \
    wget https://nodejs.org/dist/v10.16.0/node-v10.16.0-linux-x64.tar.xz && \
    tar xf node-v10.16.0-linux-x64.tar.xz -C /opt/ && \
    rm -rf node-v10.16.0-linux-x64.tar.xz 

# Put the Node.js binaries on PATH
ENV PATH=$PATH:/opt/node-v10.16.0-linux-x64/bin

# Set the container time zone to Asia/Shanghai.
# Link instead of cp: if /etc/localtime is already a symlink (presumably the
# case in this Debian-based image - verify), cp would write through it and
# overwrite the zone file it points to.
RUN ln -snf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo 'Asia/Shanghai' >/etc/timezone

# Command executed when the container is started
CMD ["python", "main.py"]

 

三、构建镜像启动

# Change to the deployment directory; all scripts must be copied into it first
cd /app/spider/spider_deploy
        
# Build the image; note the trailing "." (the current directory is the build context)
sudo docker build -t spider_test .
    
# Start a detached container named "spider"; each --add-host adds a host-name-to-IP mapping for cdh01..cdh04
sudo docker run -d --name spider --add-host cdh01:172.23.255.11 --add-host cdh02:172.23.255.12 --add-host cdh03:172.23.255.13 --add-host cdh04:172.23.255.14 spider_test

 

posted @ 2022-01-20 19:45  Shydow  阅读(917)  评论(0)  编辑  收藏  举报