Python 爬取51VOA网站所有 Learn A Word 文本及mp3音频
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl the "Learn A Word" lessons on 51VOA and save their text and mp3.

For every lesson linked from an index page, the lesson text is appended to
``LearningWord.txt`` and the lesson's mp3 is fetched with a small
multi-threaded HTTP range downloader that resumes from existing part files.

Run as a script to download index pages 53-54 (see ``main``).  To fetch one
specific lesson instead, call for example::

    saveword("http://www.51voa.com/Voa_English_Learning/"
             "Learn_A_Word_21951.html", "9:pop up")
"""
import io
import os
import re
import shutil
import sys
import time
import xml  # noqa: F401 -- unused here; kept because the original imported it
from contextlib import closing
from threading import Thread

try:  # Python 3
    import urllib.request as req
    import urllib.request as urllib2
except ImportError:  # Python 2
    import urllib as req
    import urllib2
import urllib  # noqa: F401 -- kept from the original import list

try:
    _STRING_TYPES = (basestring,)  # noqa: F821 -- Python 2: str and unicode
except NameError:
    _STRING_TYPES = (str,)

# File that saveword() appends lesson texts to.
WORD_FILE = 'LearningWord.txt'
# Prefix prepended to the relative links found on an index page.
SITE_ROOT = 'http://www.51voa.com/'
# Characters replaced by a space in the last component of a file name.
_FORBIDDEN_CHARS = ('\\', '/', ':', '*', '?', '"', '<', '>', '|')
_TAG_RE = re.compile('<.*?>')


def _to_text(value, encoding='utf-8'):
    """Return *value* as text, decoding byte strings with *encoding*."""
    if isinstance(value, bytes):
        return value.decode(encoding, 'replace')
    return value


class MyWorkThread(Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    Args:
        threadname: Name given to the thread (also used in progress output).
        url: URL to download from.
        filename: Part file the received bytes are appended to.
        ranges: ``(first_byte, last_byte)`` pair, both inclusive.  The
            default ``0`` is kept for signature compatibility only; ``run``
            needs a real pair.

    Attributes:
        downloaded: Number of bytes of this part present on disk so far
            (bytes found in an existing part file are counted too).
    """

    def __init__(self, threadname, url, filename, ranges=0):
        Thread.__init__(self, name=threadname)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        """Download the still-missing tail of this thread's byte range."""
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        first, last = self.ranges
        # Resume after whatever an earlier run already wrote to the part file.
        self.startpoint = first + self.downloaded
        # ``last`` is inclusive, so the part is complete only once the start
        # point has moved past it.
        if self.startpoint > last:
            print('Part %s has been downloaded over.' % self.filename)
            return
        self.oneTimeSize = 8 * 1024  # 8K bytes / time
        print('task %s will download from %d to %d'
              % (self.name, self.startpoint, last))
        request = urllib2.Request(self.url)
        request.add_header('Range', 'bytes=%d-%d' % (self.startpoint, last))
        remaining = last - self.startpoint + 1
        with closing(urllib2.urlopen(request)) as response, \
                open(self.filename, 'ab') as part:
            # Never write more than the range asked for, even if the server
            # sends extra data.
            while remaining > 0:
                data = response.read(min(self.oneTimeSize, remaining))
                if not data:
                    break
                part.write(data)
                self.downloaded += len(data)
                remaining -= len(data)


def GetUrlFileSize(url):
    """Return the ``Content-Length`` reported for *url*, or 0 if unavailable."""
    with closing(urllib2.urlopen(url)) as response:
        length = response.info().get('Content-Length')
    try:
        return int(length)
    except (TypeError, ValueError):
        return 0


def SpliteBlocks(totalsize, blocknumber):
    """Split ``totalsize`` bytes into ``blocknumber`` inclusive byte ranges.

    Returns a list of ``(first, last)`` tuples; the final range absorbs the
    remainder so that it always ends at ``totalsize - 1``.
    """
    blocksize = totalsize // blocknumber
    ranges = [(i * blocksize, i * blocksize + blocksize - 1)
              for i in range(blocknumber - 1)]
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges


def isLive(tasks):
    """Return True while at least one thread in *tasks* is still running."""
    return any(task.is_alive() for task in tasks)


def _download_single(url, output):
    """Copy *url* to *output* over one connection (no range requests)."""
    with closing(urllib2.urlopen(url)) as response, \
            open(output, 'wb') as target:
        shutil.copyfileobj(response, target)


def _download_in_parts(url, output, size, blocks):
    """Download *url* with *blocks* range threads and merge the parts."""
    ranges = SpliteBlocks(size, blocks)
    # Part files are named after the output so that leftovers from an
    # interrupted download are only ever reused for the same target file.
    part_names = ['%s.part%d' % (output, i) for i in range(blocks)]
    tasks = []
    for i in range(blocks):
        task = MyWorkThread('thread_%d' % i, url, part_names[i], ranges[i])
        task.daemon = True
        task.start()
        tasks.append(task)
    time.sleep(2)
    while isLive(tasks):
        downloaded = sum(task.downloaded for task in tasks)
        process = downloaded / float(size) * 100
        show = '\rFilesize: %d Downloaded:%d Completed: %.2f%%' % (
            size, downloaded, process)
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(1)
    downloaded = sum(task.downloaded for task in tasks)
    if downloaded != size:
        # A worker died or a stale part file has the wrong length; merging
        # now would write a corrupt file.  Part files are left in place.
        raise IOError('Incomplete download of %s: got %d of %d bytes'
                      % (url, downloaded, size))
    with open(output, 'wb') as target:
        for part_name in part_names:
            if not os.path.exists(part_name):
                # An empty range (size smaller than the block count) never
                # creates its part file.
                continue
            with open(part_name, 'rb') as part:
                shutil.copyfileobj(part, target)
            os.remove(part_name)


def downLoadFile(url, output, blocks=6):
    """Download *url* to *output*, using *blocks* parallel range requests.

    The output name is passed through ``formatFileName`` first.  When the
    server reports no usable ``Content-Length`` the file is fetched over a
    single connection instead.

    Raises:
        IOError: if the parts do not add up to the reported size.
    """
    sys.stdout.write('Begin to download from %s\n' % url)
    sys.stdout.flush()
    output = formatFileName(output)
    size = GetUrlFileSize(url)
    if size <= 0:
        _download_single(url, output)
    else:
        _download_in_parts(url, output, size, blocks)
    sys.stdout.write("Completed!\n")
    sys.stdout.flush()


def formatFileName(filename):
    """Replace forbidden characters in the last path component with spaces.

    Returns the cleaned path, or the string ``'None'`` when *filename* is
    not a string.
    """
    if not isinstance(filename, _STRING_TYPES):
        return 'None'
    header, tail = os.path.split(filename)
    if tail != '':
        for char in _FORBIDDEN_CHARS:
            tail = tail.replace(char, ' ')
        filename = os.path.join(header, tail)
    return filename


def remove_tags(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed."""
    return _TAG_RE.sub('', raw_html)


def _extract_mp3_url(line):
    """Return the URL between ``http`` and the quote before `` title="``.

    Returns an empty string when either marker is missing.
    """
    start = line.find('http')
    end = line.find(' title="')
    if start == -1 or end <= start:
        return ''
    # ``end - 1`` drops the closing quote of the attribute holding the URL.
    return line[start:end - 1]


def saveword(url, name):
    """Save one lesson: append its text to the word file and fetch its mp3.

    Args:
        url: Address of the lesson page.
        name: Title written above the text; also used (with ``:`` replaced
            by a space) as the mp3 file name.
    """
    name = _to_text(name)
    with closing(req.urlopen(url)) as res:
        # NOTE(review): assumes the pages are UTF-8 -- the original matched
        # a UTF-8 source literal against the raw bytes; confirm.
        data = [_to_text(line) for line in res.readlines()]
    startag = 'id="mp3"'
    endtag = '</div>'
    data2 = ''  # line holding the mp3 link
    data3 = ''  # line that opens the content div
    data4 = ''  # tag-stripped paragraph line following the content div
    # The scan skips the first 80 and the last 10 lines of the page
    # (presumably site header/footer -- offsets kept from the original).
    for k in range(80, len(data) - 10):
        line = data[k]
        if startag in line:
            data2 = line
        if '<div id="content">' in line:
            data3 = line
            if '<p>' in data[k + 1]:
                data4 = remove_tags(data[k + 1])
    # Start at the lesson's opening phrase; fall back to the start of the
    # line instead of slicing from index -1 when the phrase is missing.
    start = max(data3.find(u'今天我们要学'), 0)
    end = data3.find(endtag)
    if end != -1:
        sent = data3[start:end]
    else:
        sent = data3[start:].strip('\n').strip('\r') + data4.strip('\n')
    sent = remove_tags(sent)
    with io.open(WORD_FILE, 'a', encoding='utf-8') as f:
        f.write(name + '\n' + sent.strip('\r') + '\n')
    if startag in data2:
        mp3url = _extract_mp3_url(data2)
        if mp3url:
            downLoadFile(mp3url,
                         formatFileName(name.replace(':', ' ')) + '.mp3',
                         blocks=4)


def savepage(url):
    """Call ``saveword`` for every lesson linked from the index page *url*."""
    with closing(req.urlopen(url)) as res:
        data = _to_text(res.read())
    startag = '<ul><li>'
    endtag = '</li></ul>'
    list_start = data.find(startag)
    list_end = data.find(endtag, list_start)
    if list_start == -1 or list_end == -1:
        return
    # The numeric offsets below are kept from the original and are tied to
    # the site's markup at the time it was written.
    data2 = data[list_start + 12:list_end]
    linestart = 'href'
    meddle = '" target'
    lineend = '</a>'
    i = data2.find(linestart)
    while i != -1:
        k = data2.find(meddle, i)
        j = data2.find(lineend, k)
        if k == -1 or j == -1:
            break
        link = SITE_ROOT + data2[i + 6:k]
        word = data2[k + 16:j]
        print('%s %s %s %s %s' % (i, k, j, word, link))
        saveword(link, word)
        data2 = data2[j + 3:]
        i = data2.find(linestart)


def main(first_page=53, last_page=54):
    """Download every lesson on index pages *first_page*..*last_page*."""
    # Start from an empty word file; saveword() appends to it.
    with open(WORD_FILE, 'w'):
        pass
    for page in range(first_page, last_page + 1):
        savepage('http://www.51voa.com/Learn_A_Word_' + str(page) + '.html')


if __name__ == '__main__':
    main()
下载单词文本示例:(全部单词文本下载地址:http://pan.baidu.com/s/1o8pmojS)
2650 endorse
今天我们要学的词是 endorse. Endorse 作为动词,有支持的意思。Senator Ted Cruz endorsed Donald Trump, but later said the decision was “agonizing.” 美国联邦参议员克鲁兹支持川普,但是后来又表示,他做出这一决定十分痛苦。The New York Times endorsed Hillary Clinton for president in a Saturday editorial, and dismissed Donald Trump as “the worst nominee put forward by a major party in modern American history.” 纽约时报在星期六的社论中支持希拉里.克林顿当总统,并批评说,川普是“美国现代史上主要政党推举出的最差劲的候选人”。好的,我们今天学习的词是 endorse, endorse, endorse...
2649 deportation
今天我们要学的词是 deportation. Deportation 名词,驱逐出境,递解出境。The Obama administration said it would fully resume deportations of undocumented Haitian immigrants. 奥巴马政府表示,将全面恢复对无证海地移民的遣返工作。China and Canada have reached a new border agreement that would speed up the deportation of Chinese nationals inadmissible in Canada. 中国和加拿大达成新的边境协议,加快遣返那些本不该被允许进入加拿大的中国公民。好的,我们今天学习的词是 deportation, deportation, deportation...
2648 voluntarily
今天我们要学的词是 voluntarily. Voluntarily 副词,自愿地。The International Organization for Migrants says that more people are voluntarily returning to their home countries. 国际移民组织说,越来越多的人开始自愿返回自己的祖国。A high-tech diagnostic company voluntarily withdrew its Zika virus blood test from FDA approval. 一家高科技诊断公司自愿撤回递交美国食品药物管理局的寨卡病毒血液检测批准申请。好的,我们今天学习的词是 voluntarily, voluntarily, voluntarily...
2647 guerrilla
今天我们要学的词是 guerrilla. Guerrilla 形容词,游击队的。The Columbian government signed a peace agreement on Monday with the Revolutionary Armed Forces of Columbia (FARC), a national guerrilla movement. 哥伦比亚政府星期一跟全国游击队运动“哥伦比亚革命武装力量”签署了和平协议。The agreement needs to be approved by an Oct. 2 referendum before roughly 7,000 guerrilla fighters start their transition to civilian life. 这项协议还需经过10月2号全民公决批准,大约七千名游击队员才会开始向平民生活过渡。好的,我们今天学习的词是 guerrilla, guerrilla, guerrilla...
2646 curfew
今天我们要学的词是 curfew. Curfew 名词,宵禁。The city of Charlotte in North Carolina has lifted its midnight curfew, but the state of emergency remains in effect. 北卡罗来纳州夏洛特市取消了午夜宵禁,但是紧急状态依旧生效。Authorities in an Austrian city imposed a curfew on young immigrants following a series of sexual attacks at a local beer and wine festival. 奥地利一个城市的有关当局对未成年移民实施宵禁,此前当地一个啤酒葡萄酒节期间发生了一系列性侵事件。 好的,我们今天学习的词是 curfew, curfew, curfew...
2645 estimate
今天我们要学的词是 estimate. Estimate 动词,估计。A recent study estimates that the Indonesian forest fires that created a smoky haze last year may have caused more than 100,000 premature deaths. 一项最新研究估计,去年印尼山火引发的雾霾可能造成了10万人过早死亡。A new survey estimates that Americans own 265 million guns, but half of these guns are in the hands of only 3% of Americans. 最新调查估计,美国人拥有枪支总数2.65亿支,但其中半数都集中在3%的人手中。好的,我们今天学习的词是 estimate, estimate, estimate...
2644 mercy killing
今天我们要学的词是 mercy killing. Mercy killing 名词,安乐死。A terminally ill 17-year-old has become the first minor to be euthanized in Belgium since the age restrictions on such mercy killings were lifted in 2014. 比利时一个17岁绝症男孩安乐死,他是比利时2014年取消对安乐死年龄限制以来第一个安乐死的未成年人。The United Arab Emirates passed a new law banning all mercy killings. 阿联酋通过新法律,禁止安乐死。好的,我们今天学习的词是 mercy killing, mercy killing, mercy killing...
标签:
python
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧