python,抓取百度搜索结果
# coding=utf-8
"""Print title, summary, first link and date for Baidu search results.

Run as a script: a search term is read from standard input, the Baidu result
page for it is downloaded, and the first ten result chunks are printed.

NOTE(review): the patterns below look for ``id="N" ... </table>`` chunks, i.e.
a table-based result layout. Confirm they still match the markup Baidu serves.
"""
import re
# NOTE(review): the star import (source of ``date``) and the unused ``time``
# import are kept in their original order in case other code relies on them.
from datetime import *
import time

try:  # Python 2, as the script was originally written
    import urllib2
    from urllib import quote
    urlopen = urllib2.urlopen
except ImportError:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

SEARCH_URL = 'http://www.baidu.com/s?wd=%s'

# Compiled once at import time instead of on every save() call.
_RESULT_RE = re.compile(r'(?<=id="\d").*?</table>|(?<=id="10").*?</table>')
_TITLE_RE = re.compile(r'<h3.*?<\/h3>')
_TAG_RE = re.compile('<.*?>')
_SUMMARY_RE = re.compile(r'(?<=<\/h3>).*?(?=<br>)')
# Tags plus the literal "共N次编辑" ("edited N times in total") counter text.
_SUMMARY_NOISE_RE = re.compile(r'<.*?>|共\d+?次编辑')
_HREF_RE = re.compile(r'(?<=href=").*?(?=")')
_DATE_RE = re.compile(r'(?<!\d)(\d{4})-(\d\d?)-(\d\d?)')

# HTML chunks, one per search result. Filled in place by main(); save() reads it.
temp = []


def _split_results(page):
    """Return the per-result HTML chunks found in the page text."""
    # Newlines are removed so the non-greedy patterns can span a whole result.
    flat = page.replace('\n', '').replace('</table> <div', '')
    return _RESULT_RE.findall(flat)


def _parse_date(chunk):
    """Return the first YYYY-M-D date in ``chunk`` as a ``date``, else None.

    Only the first match is used: joining every match (as the code previously
    did) corrupts the value whenever a result mentions more than one date.
    """
    found = _DATE_RE.search(chunk)
    if found is None:
        return None
    try:
        return date(*[int(part) for part in found.groups()])
    except ValueError:
        # Looks like a date but is not a valid one (e.g. month 13).
        return None


def save(i):
    """Print the title, summary, first link and date of result number ``i``.

    Reads the chunk from the module-level ``temp`` list. Prints ``None`` alone
    when there is no such result or the chunk has no ``<h3>`` title. Otherwise
    prints four lines (the date line is ``None`` when no valid date is found)
    followed by a blank separator.
    """
    try:
        chunk = temp[i]
        title = _TAG_RE.sub('', _TITLE_RE.findall(chunk)[0])
    except IndexError:
        print(None)
        return

    summary = _SUMMARY_NOISE_RE.sub('', ''.join(_SUMMARY_RE.findall(chunk)))
    links = _HREF_RE.findall(chunk)
    link = links[0] if links else ''
    day = _parse_date(chunk)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(title)
    print(summary)
    print(link)
    print(day.isoformat() if day is not None else None)
    print('\n')


def main():
    """Read a search term, download the result page and print ten results."""
    input_string = _read_line()
    # Percent-encode the term so spaces, '&', '#' or non-ASCII text cannot
    # break or truncate the query string.
    url = SEARCH_URL % quote(input_string, safe='')
    response = urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()
    if not isinstance(content, str):
        # Python 3 returns bytes; assumes the page is UTF-8 - TODO confirm.
        content = content.decode('utf-8', 'replace')
    temp[:] = _split_results(content)
    for i in range(0, 10):
        save(i)


if __name__ == "__main__":
    main()