#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pdfkit
import os
import re
import time
import sys
import random
sys.path.append('../' )
from mytools import mail
import logging
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Environment setup: the log directory must exist before the file handler
# below opens a file inside it.
log_path = '/home/jiangwenwen/python/log/'
try:
    os.makedirs(log_path)
except OSError:
    # Creation fails when the directory is already there (possibly created
    # by another process in the meantime); only that case is tolerated.
    if not os.path.isdir(log_path):
        raise

# Logger setup: records at INFO and above are written to a UTF-8 log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
# Build the file path from log_path so the directory created above and the
# directory the handler writes to cannot drift apart.
file_handler = logging.FileHandler(os.path.join(log_path, 'liaoxuefeng.log'), encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# HTTP request headers. The User-Agent is picked once, here, so every
# request that uses these headers during one run sends the same value.
ua = UserAgent()
headers = {
    "Host": "www.liaoxuefeng.com",
    "User-Agent": ua.random,
    "Referer": "https://www.liaoxuefeng.com/wiki/1252599548343744",
}

# JavaScript passed to the PDF renderer (lazy-loading workaround): for every
# element carrying a data-src attribute, copy its value into src.
run_script = "$(function () { $('[data-src]').each(function () { $(this).attr('src', $(this).attr('data-src')); })})"
options = {
    # Wait some milliseconds for javascript finish (default 200)
    '--javascript-delay': '5000',
    '--run-script': run_script
}
def _safe_file_name(title):
    """Return ``title`` with path separators replaced by full-width look-alikes.

    ``/`` becomes U+FF0F and a backslash becomes U+FF3C, so a page title can
    never be interpreted as a nested path when used as a file name.
    """
    return title.replace('/', u'\uff0f').replace('\\', u'\uff3c')


def save_pdf(url, category, base_dir="/home/jiangwenwen/liaoxuefeng/", timeout=30):
    """Save every page linked from a wiki index page as a PDF file.

    The index at ``url`` is fetched and each ``<a class="x-wiki-index-item">``
    link is rendered with pdfkit to ``<base_dir>/<category>/<link text>.pdf``.
    A page whose PDF already exists is skipped. After each new download the
    function sleeps for a random 720-1200 seconds.

    Any exception stops the current category: it is logged with its
    traceback and then reported through ``mail.sendMail``. Nothing is
    re-raised and nothing is returned.

    Args:
        url: Address of the wiki index page.
        category: Name of the sub-directory the PDFs are written to.
        base_dir: Root directory that holds one sub-directory per category.
        timeout: Seconds to wait for each HTTP response before giving up.
    """
    try:
        # Record the outgoing IP address in the log.
        response = requests.get('https://httpbin.org/ip', timeout=timeout)
        logger.info('Your IP is {0}'.format(response.json()['origin']))

        response = requests.get(url, headers=headers, timeout=timeout)
        # An HTTP error page would otherwise parse to zero links and the
        # category would look like it had been downloaded successfully.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Create the target directory once; it does not depend on the link.
        target_dir = os.path.join(base_dir, category)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        for child in soup.find_all("a", class_="x-wiki-index-item"):
            href = child.get('href')
            # .string is None when the anchor wraps more than one node;
            # fall back to the concatenated text in that case.
            title = child.string if child.string is not None else child.get_text()
            if not href or not title:
                logger.warning('Skipping index entry without href or text (href=%r)', href)
                continue

            # Download link (kept separate from the ``url`` parameter).
            page_url = "https://www.liaoxuefeng.com" + href
            # Absolute path of the PDF for this page.
            file_name = os.path.join(target_dir, _safe_file_name(title) + ".pdf")
            # Download only when the file is not there yet.
            if not os.path.exists(file_name):
                pdfkit.from_url(page_url, file_name, options=options)
                logger.info(file_name + u'下载成功')
                # Long random pause between downloads.
                time.sleep(random.randint(720, 1200))
    except Exception as e:
        # Log first so the traceback is kept even if sending the mail fails.
        logger.exception(str(e))
        mail.sendMail('廖雪峰的官方网站:' + str(e))
# Wiki index pages to save, as (index URL, output sub-directory) pairs.
# They are processed in the order listed: Java, Python, JavaScript, SQL, Git.
_wiki_indexes = (
    ("https://www.liaoxuefeng.com/wiki/1252599548343744", "java"),
    ("https://www.liaoxuefeng.com/wiki/1016959663602400", "python"),
    ("https://www.liaoxuefeng.com/wiki/1022910821149312", "JavaScript"),
    ("https://www.liaoxuefeng.com/wiki/1177760294764384", "sql"),
    ("https://www.liaoxuefeng.com/wiki/896043488029600", "git"),
)
for _index_url, _category in _wiki_indexes:
    save_pdf(_index_url, _category)

# Written after every category has been attempted. save_pdf handles its own
# exceptions, so this line is logged whether or not a category failed.
logger.info('下载成功!!!')