1) ⑤ 爬取搜狐旅游部分新闻（代码实际抓取 travel.sohu.com，原标题误写为"搜狗"；输出目录沿用 sougou/Travel/）
1 __author__ = 'minmin' 2 #coding:utf-8 3 import re,urllib,sgmllib 4 5 #根据当前的url获取html 6 def getHtml(url): 7 page = urllib.urlopen(url) 8 html = page.read() 9 page.close() 10 return html 11 12 #根据html获取想要的文章内容 13 def func(str): 14 result = re.findall(r"<p.*?>([^<>]*)</p>",getHtml(url),re.M) 15 artical ='' 16 17 for j in result: 18 if len(j)<>0: 19 j = j.replace("<strong>"," ") 20 j = j.replace("</strong>"," ") 21 j = j.replace("<br>"," ") 22 j = j.replace(" "," ") 23 j = j.replace("“"," ") 24 j = j.replace("”"," ") 25 j = j.replace("·"," ") 26 artical = artical + j + '\n' 27 return artical 28 29 #html链接的标签是“a”,链接的属性是“href”,也就是要获得html中所有tag=a,attrs=href 值。 30 class URLPaser(sgmllib.SGMLParser): 31 def reset(self): 32 sgmllib.SGMLParser.reset(self) 33 self.urls = [] 34 35 def start_a(self,attrs): 36 href = [v for k,v in attrs if k == 'href'] 37 if href: 38 self.urls.extend(href) 39 40 IParser = URLPaser() 41 socket = urllib.urlopen("http://travel.sohu.com/lvyouxinwen.shtml")#打开这个网页 42 43 #fout = file('qq_art_urls.txt','w')#要把这个链接写到这个文件中 44 IParser.feed(socket.read())#分析啦 45 46 reg = 'http://travel.sohu.com/2015.*'#这个是用来匹配符合条件的链接,使用正则表达式匹配 47 48 pattern = re.compile(reg) 49 i = 0 50 url2 = [] 51 for url in IParser.urls:#链接都存在urls里 52 if pattern.match(url): 53 if url not in url2: 54 url2.append(url) 55 print url 56 artical = func(url) 57 print artical 58 if len(artical)<>0: 59 i = i + 1 60 f = open("sougou/Travel/"+str(i) + '.txt','a+') 61 f.write(artical) 62 f.close()