批量爬取TXT文本

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

import re
import os           #导入模块
import threading
 
import pymysql
# Connect to the local MySQL database and create the destination table.
_DB_SETTINGS = {
    'host': 'localhost',
    'user': 'root',
    'password': '1234',
    'database': '旅游',
    'charset': 'GB18030',
}
db = pymysql.connect(**_DB_SETTINGS)
conn = db.cursor()  # NOTE: despite its name this is a cursor, not a connection

# To rebuild the table from scratch, run this once before the CREATE below:
#     conn.execute('drop table tb_ly')
#     db.commit()

# One row per person found in a contract file. The statement has no
# "IF NOT EXISTS" clause. Column comments are stored in the schema itself.
_CREATE_TB_LY = (
    "create table tb_ly(xh int NOT NULL AUTO_INCREMENT COMMENT '序号',"
    "hth VARCHAR(20)  NULL  COMMENT '合同号',"
    "wjm VARCHAR(30) not null COMMENT '文件名',"
    "xb VARCHAR(2) not null COMMENT '性别',"
    "lxsmc VARCHAR(60) null COMMENT '旅行社名称',"
    "xlmc VARCHAR(600)  null COMMENT '线路名称',"
    "cfsj VARCHAR(15)  null COMMENT '出发时间',"
    "fhsj VARCHAR(15)  null COMMENT '返回时间',"
    "zts VARCHAR(4)  null COMMENT '旅程总时间（天）',"
    "xcnr LONGTEXT  null COMMENT '行程内容',"
    "primary key(xh))"
)
conn.execute(_CREATE_TB_LY)
db.commit()
 
# Folder that holds the contract text files. A raw string keeps the
# backslashes literal; the non-raw form relied on "\合" being an unknown
# escape sequence, which Python reports as an invalid-escape warning.
path = r".\合同详情表新截止到2021-11-26日\合同详情表新"
files = os.listdir(path)  # names of every entry in that folder
 
# Regular expressions used by run() to extract fields from one contract file.
# Every pattern has exactly one capture group holding the value of interest.
xb=r"   ([男女])  "    # gender: a single 男/女 character surrounded by spaces
# Contract number.
# NOTE(review): the range A-z also covers the ASCII punctuation between 'Z'
# and 'a' ([ \ ] ^ _ `); narrow it to A-Za-z if contract numbers are known to
# be purely alphanumeric.
hth=r"合同号:([0-9A-z]*)"
# Travel agency name: shortest text that ends in "公司" or "社". The previous
# character class [公司,社] matched a single character, so a name ending in
# "有限公司" was cut off after "公".
lxsmc=r"旅行社名称:(.*?(?:公司|社))"
# Route name: everything up to the first separator character in the class.
# NOTE(review): the two characters inside [...] are whitespace - confirm which.
xlmc=r"线路名称:(.*?)[  ]"
cfsj=r"出发时间:(\d{4}-\d{2}-\d{2})" # departure date, YYYY-MM-DD
fhsj=r"返回时间:(\d{4}-\d{2}-\d{2})"  # return date, YYYY-MM-DD
zts=r"共:(\d*)天"  # total trip length in days
# Itinerary text: "." does not match "\n", so [\d\D] is used to capture
# everything (newlines included) after the "行程内容" heading line.
xcnr=r"行程内容\n([\d\D]*)"
thread_lock = threading.BoundedSemaphore(value=128)
def _group_or_empty(found, file, label=None):
    """Return capture group 1 of *found*, or ``''`` when nothing matched.

    When *label* is given and there is no match, "<file>  <label>空" is
    printed so that incomplete files can be inspected afterwards.
    """
    if found is not None:
        return found.group(1)
    if label is not None:
        print(file+'  '+label+'空')
    return ''


def run():
    """Parse every file listed in ``files`` and insert its rows into tb_ly.

    Each file is read as GB18030 text (undecodable bytes are ignored). The
    module-level patterns extract the contract number, agency name, route
    name, departure/return dates, total days and itinerary text; a missing
    field is stored as ``''``. One row is inserted per gender marker found in
    the file, all rows sharing the same file-level fields. Files without any
    gender marker are reported and skipped.

    The cursor and connection are always closed when the function exits, even
    if reading or inserting raises.
    """
    try:
        for file in files:
            position = os.path.join(path, file)
            with open(position, "r", encoding='GB18030', errors='ignore') as f:
                data = f.read()

            # NOTE(review): the contract number is only looked for at the very
            # start of the file (re.match) while every other field is searched
            # for anywhere - confirm this is intentional.
            hth1 = _group_or_empty(re.match(hth, data), file, '合同号')
            lxsmc1 = _group_or_empty(re.search(lxsmc, data), file, '旅行社名称')
            xlmc1 = _group_or_empty(re.search(xlmc, data), file, '线路名称')
            # Dates and day count are optional: no message when absent.
            cfsj1 = _group_or_empty(re.search(cfsj, data), file)
            fhsj1 = _group_or_empty(re.search(fhsj, data), file)
            zts1 = _group_or_empty(re.search(zts, data), file)
            xcnr1 = _group_or_empty(re.search(xcnr, data), file, '行程内容')

            # re.findall returns a (possibly empty) list, never None, so the
            # old "is not None" test could not detect a missing gender.
            genders = re.findall(xb, data)
            if not genders:
                print(file+"缺少性别")
                # Keep going with the remaining files, as the loop always did
                # in practice; aborting the whole import here would lose them.
                continue

            for xb1 in genders:
                conn.execute('insert into tb_ly(hth,wjm,xb,lxsmc,xlmc,cfsj,fhsj,zts,xcnr)'
                             'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                             [hth1,file,xb1,lxsmc1,xlmc1,cfsj1,fhsj1,zts1,xcnr1])
            # One commit per file instead of one per row.
            db.commit()
    finally:
        conn.close()
        db.close()
     
# Run the import on one background thread. Only a single worker is started,
# so the files are still processed sequentially.
t = threading.Thread(target=run)
t.start()

　　批量下载网页PDF

# -*- coding: utf-8 -*-
"""
Created on Mon Feb 13 11:24:44 2023
 
@author: 我
"""
import requests
import re
import os           #导入模块
import threading
 
i=0  # number of downloads that have finished (successfully or not)
_counter_lock = threading.Lock()  # guards ``i``: run() executes on many threads


def run(path1,url1,name,all_chi):
    """Download one file and report progress.

    Args:
        path1: destination file path; its directory must already exist.
        url1: URL to fetch.
        name: display name used in the progress messages.
        all_chi: total number of downloads expected; when the shared counter
            reaches this value the "all finished" message is printed.

    The response body is streamed to disk in chunks. HTTP error statuses and
    network failures are reported instead of being saved as if they were the
    PDF; the download still counts as finished so the final message appears.
    """
    global i
    try:
        # The timeout keeps a stalled server from blocking this thread forever.
        with requests.get(url1, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(path1, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
    except requests.RequestException as exc:
        print("下载"+name+"失败: "+str(exc))
    else:
        print("下载"+name+"成功")
    # "i += 1" is a read-modify-write and is not atomic across threads.
    with _counter_lock:
        i+=1
        done=i
    print(done)
    if done==all_chi:
        print("全部下载已结束")
 
from urllib.parse import urljoin

headers = {
'User-Agent' :'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
url = 'https://www.cs.ubc.ca/~schmidtm/Courses'
url1='https://www.cs.ubc.ca/~schmidtm/Courses/LecturesOnML'

# 11.txt is read below as the saved HTML of the index page. Disabled snippet
# that fetches a page and stores it under that name:
#     response = requests.get(url=url,headers=headers,stream=True)
#     print(response.text)
#     with open('11.txt', 'wb') as f:
#         f.write(response.content)
path = './100 Lectures on Machine Learning'
# NOTE(review): the file is decoded with the platform default encoding -
# confirm it matches the encoding of the saved page.
with open('11.txt') as f1:
    ll=f1.read()
pat1=r"<h3>(.*?)</h"
pat2=r"<h4>(.*?)</h4>"
pat3=r"href=\"(.*?pdf)\">"
# Combined pattern: each match fills exactly one of (h3 title, h4 title,
# PDF href), so findall yields the headings and links in document order.
# The groups are lazy, like pat1/pat2, so a title stops at its own closing tag.
pat4=r"<h3>(.*?)</h|<h4>(.*?)</h4>|href=\"(.*?pdf)\">"
pat5=r"pdf\">(.*?)</a>"  # link text of each PDF anchor
# re.MULTILINE only changes ^ and $, which these patterns do not use.
p=re.compile(pat4,re.MULTILINE)
c=p.findall(ll)

p1=re.compile(pat5,re.MULTILINE)
c1=p1.findall(ll)
tt=0  # index into c1 of the next PDF link text
# Current section (h3) and subsection (h4) folder names; start empty so a PDF
# link that appears before the first heading does not hit an unbound name.
a=''
b=''
for one in c:
    if one[0] != '':
        a=one[0].replace(':','_')
        b=''  # a new section resets the subsection
    if one[1] != '':
        b=one[1].replace('.',' ')
    if one[2] != '':
        # NOTE(review): assumes the n-th pat5 match belongs to the n-th PDF
        # href found by pat4 - verify against the page markup.
        name=c1[tt]+'.pdf'
        tt=tt+1
        # Resolve the href against the page URL. This handles absolute URLs,
        # plain relative names and any number of "./" or "../" segments.
        dd=urljoin(url1+'/', one[2])
        ee= path+'/'+a+'/'+b
        print(ee)

        os.makedirs(ee, exist_ok=True)  # create the target folder if missing
        print("开始下载"+name)
        # One thread per file so the downloads overlap.
        t = threading.Thread(target=run, args=(ee+'/'+name, dd,name,len(c1)))
        t.start()