爬取网页信息
说明:正则表达式有待学习,之后完善此功能
# -*- coding: utf-8 -*-
"""Save a web page to a local file and print the lines containing a keyword.

The module runs on both Python 2 and Python 3.
"""
import io
import os
import re
import sys
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

# Encoding used to decode the saved page and byte-string keywords. It matches
# the encoding declared for this source file, which is the encoding the
# keyword literal below is stored in.
_ENCODING = 'utf-8'


def _write_stdout(text):
    """Write decoded ``text`` to standard output without adding a newline."""
    if not isinstance(text, str):
        # Python 2: ``text`` is a ``unicode`` object. Encode it explicitly so
        # the output does not depend on the console's default codec.
        text = text.encode(_ENCODING)
    sys.stdout.write(text)


# Fetch network data into the given file.
def getHtml(url, fname):
    """Download ``url`` and store the raw response body in ``fname``.

    The body is written in binary mode, byte for byte, so line endings are
    not translated and no text conversion is applied to the payload.

    Args:
        url: Address of the resource to fetch.
        fname: Path of the file to create or overwrite.

    Errors raised while opening the URL or the file are not caught here and
    propagate to the caller.
    """
    page = urlopen(url)
    try:
        html = page.read()
    finally:
        # The Python 2 response object is not a context manager, so it is
        # closed explicitly; ``finally`` covers a failing ``read()`` as well.
        page.close()
    with open(fname, 'wb') as fobj:
        fobj.write(html)


# Extract the required data from the file.
def getWeather(fname, weath_keyword):
    """Print every line of ``fname`` that contains ``weath_keyword``.

    The file is decoded as UTF-8 (undecodable bytes are replaced rather than
    raising) and matching lines are written to standard output unchanged,
    apart from universal-newline translation.

    Args:
        fname: Path of the previously saved page.
        weath_keyword: Substring to look for; a text string or a UTF-8
            encoded byte string.
    """
    if isinstance(weath_keyword, bytes):
        # A plain string literal is a byte string on Python 2; decode it so
        # it can be compared with the decoded lines.
        weath_keyword = weath_keyword.decode(_ENCODING, 'replace')
    with io.open(fname, 'r', encoding=_ENCODING, errors='replace') as fobj:
        for eachline in fobj:
            if weath_keyword in eachline:
                _write_stdout(eachline)


def main():
    """Fetch the hard-coded forecast page and print the matching lines."""
    # The inputs are hard-coded; swap them for interactive prompts
    # (raw_input on Python 2, input on Python 3) to make the script general.
    url_input = 'http://www.weather.com.cn/weather/101010100.shtml'
    fname_input = 'C:\\Users\\Desktop\\weather_forecast.txt'
    weath_keyword_input = '<h1>10日(明天)</h1>'
    getHtml(url_input, fname_input)
    getWeather(fname_input, weath_keyword_input)


if __name__ == '__main__':
    main()