批量爬取TXT文本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import re
import os           #导入模块
import threading
 
import pymysql
# Connect to the local MySQL server and create the destination table.
# NOTE(review): the credentials are hard-coded and everything below runs at
# import time.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='1234',
    database='旅游',
    charset='GB18030',
)
conn = db.cursor()  # a cursor object, despite the name

# To rebuild the table from scratch, drop it first:
#     conn.execute('drop table tb_ly')
#     db.commit()

# One row per person found in a contract file; xh is the auto-increment key.
conn.execute(
    'create table tb_ly(' + ','.join((
        "xh int NOT NULL AUTO_INCREMENT COMMENT '序号'",      # sequence number
        "hth VARCHAR(20)  NULL  COMMENT '合同号'",             # contract number
        "wjm VARCHAR(30) not null COMMENT '文件名'",           # source file name
        "xb VARCHAR(2) not null COMMENT '性别'",               # gender
        "lxsmc VARCHAR(60) null COMMENT '旅行社名称'",         # travel agency name
        "xlmc VARCHAR(600)  null COMMENT '线路名称'",          # route name
        "cfsj VARCHAR(15)  null COMMENT '出发时间'",           # departure date
        "fhsj VARCHAR(15)  null COMMENT '返回时间'",           # return date
        "zts VARCHAR(4)  null COMMENT '旅程总时间(天)'",       # total days
        "xcnr LONGTEXT  null COMMENT '行程内容'",              # itinerary text
        "primary key(xh)",
    )) + ')'
)
db.commit()
 
#conn.execute('drop table tb_ly')
# Folder that holds the contract text files. A raw string is used because the
# path contains backslashes followed by non-escape characters, which a normal
# string literal reports as invalid escape sequences on recent Python versions.
path = r".\合同详情表新截止到2021-11-26日\合同详情表新"
files = os.listdir(path)  # names of every entry directly inside ``path``
#print(files[1])

# Regular expressions used by run(); capture group 1 is the extracted value.
# Whitespace inside the patterns is significant.
xb = r"   ([男女])  "    # gender
# NOTE(review): the range ``A-z`` also covers the ASCII punctuation between
# 'Z' and 'a' ([ \ ] ^ _ `); confirm whether ``A-Za-z`` was intended.
hth = r"合同号:([0-9A-z]*)"  # contract number
# NOTE(review): ``[公司,社]`` is a character class (any ONE of those
# characters), not the alternatives "公司" / "社"; confirm the intent.
lxsmc = r"旅行社名称:(.*?[公司,社])"  # travel agency name
xlmc = r"线路名称:(.*?)[  ]"  # route name, up to the first blank
cfsj = r"出发时间:(\d{4}-\d{2}-\d{2})"  # departure date
fhsj = r"返回时间:(\d{4}-\d{2}-\d{2})"  # return date
zts = r"共:(\d*)天"  # total trip length in days
# "." does not match "\n", so "[\d\D]" is used to capture across lines.
xcnr = r"行程内容\n([\d\D]*)"  # itinerary text, everything after the heading
# NOTE(review): not referenced anywhere else in the visible code.
thread_lock = threading.BoundedSemaphore(value=128)
def _group_or_blank(match, file, label=None):
    """Return group 1 of *match*, or '' (optionally reporting *label*) when it is None."""
    if match is not None:
        return match.group(1)
    if label is not None:
        print(file+'  '+label)
    return ''


def run():
    """Parse every file listed in ``files`` and insert its rows into ``tb_ly``.

    Each file under ``path`` is read as GB18030 text (undecodable bytes are
    ignored). The module-level regular expressions extract the contract
    number, agency name, route name, departure/return dates, total days and
    itinerary text; a missing field is stored as ''. One row is inserted for
    every gender marker found in the file, all sharing the same contract
    fields. The cursor and connection are always closed when the function
    exits, including on error.
    """
    try:
        for file in files:
            position = os.path.join(path, file)
            with open(position, "r", encoding='GB18030', errors='ignore') as f:
                data = f.read()

            # NOTE(review): the contract number is only accepted at the very
            # start of the file (re.match); every other field is searched for
            # anywhere in the text. Confirm this asymmetry is intentional.
            hth1 = _group_or_blank(re.match(hth, data), file, '合同号空')
            lxsmc1 = _group_or_blank(re.search(lxsmc, data), file, '旅行社名称空')
            xlmc1 = _group_or_blank(re.search(xlmc, data), file, '线路名称空')
            # Missing dates / day counts are stored as '' without a message.
            cfsj1 = _group_or_blank(re.search(cfsj, data), file)
            fhsj1 = _group_or_blank(re.search(fhsj, data), file)
            zts1 = _group_or_blank(re.search(zts, data), file)
            xcnr1 = _group_or_blank(re.search(xcnr, data), file, '行程内容空')

            # re.findall returns an empty list (never None) when nothing
            # matches, so test for emptiness to make the warning reachable.
            genders = re.findall(xb, data)
            if not genders:
                print(file+"缺少性别")
                # Skip this file only; one bad file must not abort the batch.
                continue

            for xb1 in genders:
                conn.execute('insert into tb_ly(hth,wjm,xb,lxsmc,xlmc,cfsj,fhsj,zts,xcnr)'
                             'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                             [hth1, file, xb1, lxsmc1, xlmc1, cfsj1, fhsj1, zts1, xcnr1])
            # Commit once per file so a file's rows are stored together.
            db.commit()
    finally:
        conn.close()
        db.close()
     
# Launch run() on a worker thread. Only one thread is started, so the files
# are still processed sequentially inside it.
t = threading.Thread(target=run)
t.start()

  批量下载网页PDF

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 13 11:24:44 2023
 
@author: 我
"""
import requests
import re
import os           #导入模块
import threading
 
i = 0  # number of download tasks that have finished (successfully or not)
_i_lock = threading.Lock()  # guards ``i``: run() executes on many threads at once


def run(path1, url1, name, all_chi):
    """Download *url1* to the file *path1* and update the shared progress counter.

    Args:
        path1: Destination file path; the file is written in binary mode.
        url1: URL to fetch.
        name: Display name used in the progress messages.
        all_chi: Total number of download tasks; when the shared counter
            reaches this value the "all finished" message is printed.

    A failed request or write is reported and still counted as finished, so
    the final message appears even when some downloads fail. The file is only
    created after the server has answered with a success status, which keeps
    error pages from being saved under a ``.pdf`` name.
    """
    global i
    try:
        # The timeout keeps a stalled server from hanging the thread forever.
        r = requests.get(url1, timeout=30)
        r.raise_for_status()
        with open(path1, 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as exc:
        print("下载"+name+"失败: "+str(exc))
    else:
        print("下载"+name+"成功")

    # ``i += 1`` is a read-modify-write; without the lock concurrent threads
    # can lose increments and the completion check below may never fire.
    with _i_lock:
        i += 1
        finished = i
    print(finished)
    if finished == all_chi:
        print("全部下载已结束")
 
# Browser-like request header. It is only used by the commented-out page
# fetch below; the visible download code does not send it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
}
url = 'https://www.cs.ubc.ca/~schmidtm/Courses'  # base for "../"-style links
url1 = 'https://www.cs.ubc.ca/~schmidtm/Courses/LecturesOnML'  # base for plain relative links

# Snippet that fetches a page and saves the response body to 11.txt
# (kept for reference):
#     response = requests.get(url=url,headers=headers,stream=True)
#     print(response.text)
#     with open('11.txt', 'wb') as f:
#         f.write(response.content)

path = './100 Lectures on Machine Learning'  # root folder for the downloaded PDFs

# Read the saved page; ``with`` makes sure the file handle is closed.
# NOTE(review): no encoding is given, so the platform default is used.
with open('11.txt') as f1:
    ll = f1.read()

pat1 = r"<h3>(.*?)</h"
pat2 = r"<h4>(.*?)</h4>"
pat3 = r"href=\"(.*?pdf)\">"
# Combined pattern: group 1 = <h3> heading, group 2 = <h4> heading,
# group 3 = PDF link target. Exactly one alternative matches per hit.
# NOTE(review): ``(.*)?`` is a greedy ``.*`` inside an optional group, not the
# lazy ``(.*?)`` used in pat1/pat2; confirm which was intended.
pat4 = r"<h3>(.*)?</h|<h4>(.*)?</h4>|href=\"(.*?pdf)\">"
pat5 = r"pdf\">(.*?)</a>"  # link text of each PDF anchor
p = re.compile(pat4, re.MULTILINE)
c = p.findall(ll)  # list of (h3, h4, href) tuples in document order

p1 = re.compile(pat5, re.MULTILINE)
c1 = p1.findall(ll)  # link texts, used as file names for the PDFs
# Walk the matches in document order. <h3>/<h4> headings set the current
# folder and sub-folder; every PDF link is downloaded into that folder on its
# own thread.
tt = 0  # index into c1 (link texts), advanced once per PDF link
# Start with empty folder names so a PDF link that appears before the first
# heading does not raise NameError.
a = ''
b = ''
for one in c:
    if one[0] != '':
        a = one[0].replace(':', '_')  # ':' is replaced in the folder name
        b = ''                        # a new <h3> section resets the sub-folder
    if one[1] != '':
        b = one[1].replace('.', ' ')
    if one[2] != '':
        # NOTE(review): assumes c1 has one entry per PDF link found by pat4;
        # a mismatch would shift names or raise IndexError.
        name = c1[tt]+'.pdf'
        tt = tt+1
        link = one[2]
        if link.startswith('.'):
            if link[3] == '.':
                # "../../x" style link: drop the leading "../.."
                dd = url+link[5:]
                print(dd)
            else:
                # "../x" style link: drop the leading ".."
                dd = url+link[2:]
        elif link.startswith('h'):
            dd = link  # treated as an absolute URL
        else:
            dd = url1+'/'+link
        ee = path+'/'+a+'/'+b
        print(ee)

        # exist_ok avoids the gap between checking for the folder and creating it.
        os.makedirs(ee, exist_ok=True)
        print("开始下载"+name)
        # NOTE(review): one thread is started per PDF with no upper bound.
        t = threading.Thread(target=run, args=(ee+'/'+name, dd, name, len(c1)))
        t.start()

  

posted @   雄子  阅读(90)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
点击右上角即可分享
微信分享提示