# Extract website text for each URL listed in a spreadsheet column.
import requests
from bs4 import BeautifulSoup
import re
import xlwings as xw
wb = xw.Book('E:\\Tim下载文件\\1224.xlsx') #这是excel表格中存放网址的文件位置
sht = wb.sheets["Sheet1"] #Sheet1:选择表格中某一个表格
rng = sht.range('i1').expand('table')
nrows = rng.rows.count # nrows代表表格中有多少行数据
print(nrows)
start = 1
end = 10
urlList = sht.range(f'i{start}:i{end}').value #i{start} 表示从表格中的i列
i = start
for url in urlList:
string = ''
# 确定目标网页
url = url
print(url)
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
headers = {'User-Agent': user_agent}
res = requests.get(url=url, headers=headers)
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.text, 'lxml')
news = soup.find('html') # 只识别外层的div里面的文字
string = string + news.get_text()
# 1.处理空行----------------------------------------success
string = re.sub('\n{1,100}', '\n', string)
# 2.处理只含有空格的行--------------------------------success
string = re.sub('\40{0,100}\n\40{0,100}\n', '\n', string)
string = re.sub('\40{0,100}\n\40{0,100}', '\n', string)
# 3.将所有中文冒号转成英文冒号--------------------------success
string = re.sub(':', ':', string)
# 4.处理冒号换行空格----------------------------------success
string = re.sub('\40{0,100}:\40{0,100}\n\40{0,100}', ':', string)
# 5.处理冒号后面的空格--------------------------------default
string = re.sub(': {1,100}', ':', string)
# 6.处理空格----------------------------------------default 怀疑读取出来的不是空格,因为 2. 是可以处理空格的
string = re.sub('\40{2,100}', '\40', string)
fileName = 'E:\\网站文件\\' + str(i) + '.txt'
print(fileName)
fh = open(fileName, 'w', encoding='utf-8')
fh.write(string)
fh.close()
i = i + 1
# 会当凌绝顶,一览众山小  (closing poem line — commented out so the file parses)