批量爬取TXT文本
"""Parse travel-contract TXT files and load the extracted fields into MySQL.

Every file in ``path`` is read as GB18030 text. The contract number, agency
name, route name, departure/return dates, trip length and itinerary are
extracted with regular expressions, and one row is inserted into ``tb_ly``
for every gender marker found in the file.
"""
import re
import os
import threading

import pymysql

# Connect to the database and create the target table.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='1234',
    database='旅游',
    charset='GB18030',
)
conn = db.cursor()

# The table is created unconditionally, so this statement fails if tb_ly
# already exists; drop the table manually before re-running the import.
conn.execute(
    "create table tb_ly(xh int NOT NULL AUTO_INCREMENT COMMENT '序号',"
    "hth VARCHAR(20) NULL COMMENT '合同号',"
    "wjm VARCHAR(30) not null COMMENT '文件名',"
    "xb VARCHAR(2) not null COMMENT '性别',"
    "lxsmc VARCHAR(60) null COMMENT '旅行社名称',"
    "xlmc VARCHAR(600) null COMMENT '线路名称',"
    "cfsj VARCHAR(15) null COMMENT '出发时间',"
    "fhsj VARCHAR(15) null COMMENT '返回时间',"
    "zts VARCHAR(4) null COMMENT '旅程总时间(天)',"
    "xcnr LONGTEXT null COMMENT '行程内容',"
    "primary key(xh))"
)
db.commit()

# Folder holding the contract files. A raw string keeps the backslashes
# literal instead of relying on Python tolerating invalid escape sequences.
path = r".\合同详情表新截止到2021-11-26日\合同详情表新"
files = os.listdir(path)  # names of all entries in the folder

# Extraction patterns (group 1 is the value that gets stored).
xb = r" ([男女]) "  # gender
# NOTE(review): the range "A-z" also matches [ \ ] ^ _ ` -- confirm whether
# "A-Za-z" was intended before tightening it.
hth = r"合同号:([0-9A-z]*)"  # contract number
# NOTE(review): "[公司,社]" is a character class (any ONE of those four
# characters), not the alternatives "公司" / "社" -- confirm the intent.
lxsmc = r"旅行社名称:(.*?[公司,社])"  # travel agency name
xlmc = r"线路名称:(.*?)[ ]"  # route name
cfsj = r"出发时间:(\d{4}-\d{2}-\d{2})"  # departure date
fhsj = r"返回时间:(\d{4}-\d{2}-\d{2})"  # return date
zts = r"共:(\d*)天"  # total trip length in days
# "." does not match "\n", so "[\d\D]" is used to capture across lines.
xcnr = r"行程内容\n([\d\D]*)"  # itinerary text

# Currently unused; kept so the module's global names stay unchanged.
thread_lock = threading.BoundedSemaphore(value=128)


def _group_or_empty(match, file, label=None):
    """Return group 1 of *match*, or '' when the pattern did not match.

    When *label* is given, a "<file> <label>" line is printed for a missing
    field so incomplete files can be spotted in the console output.
    """
    if match is not None:
        return match.group(1)
    if label is not None:
        print(file + ' ' + label)
    return ''


def run():
    """Import every file in ``files`` into ``tb_ly``.

    One row is inserted per gender marker found in a file. Files without any
    gender marker are reported and skipped. The cursor and the connection are
    closed when the function finishes, even if a file raises.
    """
    try:
        for file in files:
            position = os.path.join(path, file)
            with open(position, "r", encoding='GB18030', errors='ignore') as f:
                data = f.read()

            # NOTE(review): the contract number is matched only at the very
            # start of the file (re.match); every other field is searched
            # for anywhere in the text.
            hth1 = _group_or_empty(re.match(hth, data), file, '合同号空')
            lxsmc1 = _group_or_empty(re.search(lxsmc, data), file, '旅行社名称空')
            xlmc1 = _group_or_empty(re.search(xlmc, data), file, '线路名称空')
            cfsj1 = _group_or_empty(re.search(cfsj, data), file)
            fhsj1 = _group_or_empty(re.search(fhsj, data), file)
            zts1 = _group_or_empty(re.search(zts, data), file)
            xcnr1 = _group_or_empty(re.search(xcnr, data), file, '行程内容空')

            # re.findall returns an empty list (never None) when nothing
            # matches, so test truthiness to detect a missing gender.
            genders = re.findall(xb, data)
            if not genders:
                # Report the file and carry on with the rest of the batch.
                print(file + "缺少性别")
                continue

            for xb1 in genders:
                conn.execute(
                    'insert into tb_ly(hth,wjm,xb,lxsmc,xlmc,cfsj,fhsj,zts,xcnr)'
                    'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                    [hth1, file, xb1, lxsmc1, xlmc1, cfsj1, fhsj1, zts1, xcnr1])
            # Commit once per file rather than once per inserted row.
            db.commit()
    finally:
        conn.close()
        db.close()


# The import runs on one background thread. This does not parallelise the
# work; it only moves it off the main thread.
t = threading.Thread(target=run, args=())
t.start()
批量下载网页PDF
# -*- coding: utf-8 -*-
"""Batch-download the lecture PDFs linked from a saved course index page.

The index page is expected to be saved locally as ``11.txt``. Each PDF is
stored under ``path/<h3 heading>/<h4 heading>/<link text>.pdf`` and is
downloaded on its own thread.

Created on Mon Feb 13 11:24:44 2023
"""
import requests
import re
import os
import threading

# Number of download tasks that have finished, shared by all worker threads.
i = 0
# Guards the read-modify-write of ``i``, which is not atomic across threads.
_counter_lock = threading.Lock()


def run(path1, url1, name, all_chi):
    """Download ``url1`` to the file ``path1`` and update the progress count.

    Parameters:
        path1: destination file path.
        url1: URL of the PDF to fetch.
        name: display name used in the progress messages.
        all_chi: total number of downloads; once the shared counter reaches
            it, the "all finished" message is printed.

    A failed request is reported instead of being saved as a bogus PDF. The
    counter advances on success and on failure so the final message still
    appears once every task has ended.
    """
    global i
    try:
        # A timeout keeps a stalled server from hanging the thread forever.
        with requests.get(url1, headers=headers, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(path1, 'wb') as f:
                # Write in chunks so the whole PDF is never held in memory.
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
        print("下载" + name + "成功")
    except (requests.RequestException, OSError) as err:
        print("下载" + name + "失败: " + str(err))
    finally:
        with _counter_lock:
            i += 1
            print(i)
            if i == all_chi:
                print("全部下载已结束")


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
url = 'https://www.cs.ubc.ca/~schmidtm/Courses'
url1 = 'https://www.cs.ubc.ca/~schmidtm/Courses/LecturesOnML'

# The index page must already be saved as 11.txt, for example by fetching
# ``url`` with ``headers`` and writing ``response.content`` to that file.
path = './100 Lectures on Machine Learning'
with open('11.txt') as f1:
    ll = f1.read()

# Component patterns, kept for reference; only pat4 and pat5 are used below.
pat1 = r"<h3>(.*?)</h"
pat2 = r"<h4>(.*?)</h4>"
pat3 = r"href=\"(.*?pdf)\">"
# One alternation so headings and links come back in document order as
# (h3, h4, href) tuples. The heading groups are lazy, matching pat1/pat2, so
# a line containing several closing tags is not swallowed by one capture.
pat4 = r"<h3>(.*?)</h|<h4>(.*?)</h4>|href=\"(.*?pdf)\">"
pat5 = r"pdf\">(.*?)</a>"  # link text of each PDF anchor, used as file name

p = re.compile(pat4, re.MULTILINE)
c = p.findall(ll)
p1 = re.compile(pat5, re.MULTILINE)
c1 = p1.findall(ll)

tt = 0  # index of the next PDF name in c1
# Start with empty folder names so a PDF link that appears before the first
# heading does not raise NameError.
a = ''
b = ''
for one in c:
    if one[0] != '':
        # New top-level section: ':' is replaced to keep the folder name valid.
        a = one[0].replace(':', '_')
        b = ''
    if one[1] != '':
        b = one[1].replace('.', ' ')
    if one[2] == '':
        continue

    name = c1[tt] + '.pdf'
    tt = tt + 1

    # Resolve the href against the course URLs.
    # NOTE(review): this handles only the link shapes found on the saved
    # page ("../x", "../../x", absolute "http...", plain relative); confirm
    # before reusing it on another page.
    if one[2][0] == '.':
        if one[2][3] == '.':
            dd = url + one[2][5:]
            print(dd)
        else:
            dd = url + one[2][2:]
    elif one[2][0] == 'h':
        dd = one[2]
    else:
        dd = url1 + '/' + one[2]

    ee = path + '/' + a + '/' + b
    print(ee)
    # Create the target folder if needed; exist_ok avoids a check-then-create race.
    os.makedirs(ee, exist_ok=True)

    print("开始下载" + name)
    # One thread per PDF so the network waits overlap.
    t = threading.Thread(target=run, args=(ee + '/' + name, dd, name, len(c1)))
    t.start()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人