Scraping Chinadaily Bilingual News
Today I had an ad-hoc need to scrape some bilingual material (not yet cleaned), and it needs to be put to full use.
The code below is meant to grab the link to each bilingual news article on the Chinadaily site. The first step was to study the URLs and structure of these pages; in particular, pagination generally appends _2, _3, and so on to the front-page URL. So this code only collects the links.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: bi_news.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""
import urllib  # Python 2: urllib.urlopen
import re
import os

bi_urls = []


def getHtml(url):
    # Fetch the page and return its contents as a list of lines.
    page = urllib.urlopen(url)
    html = page.readlines()
    #print html
    return html


def getImg(html):
    # Unused here: downloads every .jpg referenced in the page.
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1


def geturl(html):
    # Extract the article links we need from the listing page.
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):
                # I only want material from after 2016, so quit at the first 2016 link.
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print("http://language.chinadaily.com.cn/" + url[0])
                bi_urls.append("http://language.chinadaily.com.cn/" + url[0])


if __name__ == '__main__':
    n = 1
    # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
    # geturl(getHtml("http://language.chinadaily.com.cn/news_bilingual.html"))
    while n:  # runs until geturl() hits a 2016 link and exits the process
        if n < 2:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual.html")
        else:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual_" + str(n) + ".html")
        geturl(html)
        n = n + 1
```
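Note that the script above is Python 2 (urllib.urlopen was removed in Python 3). For anyone on Python 3, a minimal equivalent sketch might look like the following; it assumes the same 2018-era page structure, and the site layout may well have changed since:

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch of a Python 3 equivalent of bi_news.py (assumes the same page layout).
import re
import urllib.request

BASE = "http://language.chinadaily.com.cn/"

def get_lines(url):
    # Fetch a page and return its decoded lines.
    with urllib.request.urlopen(url) as page:
        return page.read().decode("utf-8", errors="ignore").splitlines()

def collect(max_pages=50):
    urls = []
    for n in range(1, max_pages + 1):
        page_url = BASE + ("news_bilingual.html" if n == 1
                           else "news_bilingual_%d.html" % n)
        for line in get_lines(page_url):
            if re.search(r'<div class="mr10"><a href="\d{4}-\d\d/\d\d/content_\d{4,}\.htm"', line):
                if re.search(r'href="2016-', line):
                    return urls  # stop once we reach 2016 material
                urls.append(BASE + re.findall(r'\d{4}-\d\d/\d\d/content_\d{4,}\.htm', line)[0])
    return urls

if __name__ == "__main__":
    for u in collect():
        print(u)
```

Unlike the original, this version returns the list and stops after max_pages rather than killing the process, which makes it easier to reuse.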
Run python bi_news.py > url.txt to save the wanted URLs.
Contents of url.txt:
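(The listing itself is omitted here; each line is one article URL of the following shape, matching the regex in geturl() — pattern only, not a real entry:)

```
http://language.chinadaily.com.cn/YYYY-MM/DD/content_XXXXXXXX.htm
```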
The next step is a simple crawl of the page behind each link in url.txt, filing the news into folders by month; each file is named with the trailing eight digits of its news link.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: content.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""
import urllib
import re
import os
import sys

bi_urls = []


def getHtml(url):
    # Fetch the page and return its contents as a single string.
    page = urllib.urlopen(url)
    html = page.read()
    #print html
    return html


def getImg(html):
    # Unused; carried over from bi_news.py.
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1


def geturl(html):
    # Unused; carried over from bi_news.py.
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print(url)
                bi_urls.append(url)


def savefile(savepath, content):
    with open(savepath, "w") as fp:
        fp.write(content)


if __name__ == '__main__':
    for line in open(sys.argv[1], 'r'):
        line = line.strip()  # drop the trailing newline read from url.txt
        content = ""
        n = 1
        while n:  # this loop makes sure multi-page articles are not missed
            if n > 1:
                # Continuation pages insert _n before .htm, e.g. content_12345678_2.htm.
                htm = line.replace(".htm", "_" + str(n) + ".htm")
            else:
                htm = line
            raw = getHtml(htm)
            if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):
                # A page without the DOCTYPE is blank, i.e. there are no more pages.
                break
            print(htm)
            n = n + 1
            # for hang in raw:
            #     if re.search('^\<p\>.*\<\/p\>', hang):
            content = content + raw
        date = re.findall(r'\d\d\d\d-\d\d', line)[0]  # e.g. 2017-05
        filename = re.findall(r'\d{6,}', line)[0]     # the article's content id
        if not os.path.exists(date):  # create the month directory if needed
            os.makedirs(date)
        savefile(date + "/" + filename + ".txt", content)
```
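The saved files are still raw HTML, which is why the corpus is "not yet cleaned". The commented-out lines in content.py hint at the intended next step: keeping only the <p>...</p> lines that carry the article text. A minimal sketch of that cleaning pass, assuming each paragraph sits on its own line as the commented-out regex expects (clean.py is a hypothetical name, not part of the original scripts):

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Hypothetical cleaning pass (clean.py): keep only the <p>...</p> lines of a
# saved page and strip the tags. Assumes one paragraph per line, as the
# commented-out regex in content.py expects.
import re
import sys

def clean(path):
    kept = []
    for hang in open(path, 'r'):
        if re.search(r'^<p>.*</p>', hang):
            # drop all remaining tags, keep only the text
            kept.append(re.sub(r'<[^>]+>', '', hang).strip())
    return kept

if __name__ == '__main__':
    # Usage: python clean.py 2017-05/29012345.txt
    for text in clean(sys.argv[1]):
        print(text)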