# Python 爬虫学习 (Python web crawler study notes)
#coding:utf-8
#author:Blood_Zero
"""Introductory web-crawler walkthrough.

1. Fetch information about a web page.
2. Deal with encoding problems via the charset library (per the original
   author's note, that library is not installed by default).

The script was written for Python 2 (``urllib`` / ``urllib2``).  A small
import fallback keeps the helpers importable and usable on Python 3 as well.
All network access happens in ``main()``, so importing this module performs
no requests.
"""
import contextlib
import re
import urllib

try:
    # Python 2: the interpreter this script was originally written for.
    import urllib2
    _urlopen = urllib.urlopen
    _urlretrieve = urllib.urlretrieve
except ImportError:
    # Python 3: the same entry points live in urllib.request.
    import urllib.request as urllib2
    _urlopen = urllib2.urlopen
    _urlretrieve = urllib2.urlretrieve

# Page fetched by the demo in main().
url = "http://192.168.1.135/myself/"

_USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"

# Captures everything between "a href=" and the first following ".php".
_PHP_LINK_RE = re.compile(r'a href=(.+?\.php)')


def _as_text(raw, charset=None):
    """Return *raw* as a native string.

    On Python 2 ``bytes`` is ``str``, so the value is returned untouched and
    the original behaviour is preserved.  On Python 3 a ``bytes`` payload is
    decoded with *charset* (UTF-8 when missing or unknown); undecodable bytes
    are replaced rather than raising.
    """
    if isinstance(raw, bytes) and not isinstance(raw, str):
        try:
            return raw.decode(charset or "utf-8", "replace")
        except LookupError:
            # The server advertised a codec name Python does not know.
            return raw.decode("utf-8", "replace")
    return raw


def _charset_of(response):
    """Return the ``charset`` parameter of the response's Content-Type, or None."""
    info = response.info()
    getparam = getattr(info, "getparam", None)
    if getparam is not None:
        # Python 2 message objects expose getparam().
        return getparam("charset")
    # Python 3 message objects expose get_param() instead.
    return info.get_param("charset")


def get_content(url):
    """Download *url* and return the response body.

    The connection is always closed, even if reading fails.
    """
    with contextlib.closing(_urlopen(url)) as html:
        return _as_text(html.read(), _charset_of(html))


def get_file(self):
    """Print and return the ``.php`` link targets found in the given HTML.

    Args:
        self: HTML source to scan.  NOTE(review): this is a plain function,
            not a method; the parameter name is kept only so existing callers
            keep working.

    Returns:
        The list of captured strings.  Each capture starts immediately after
        ``a href=``, so an opening quote character, when present, is part of
        the captured text.  (The original version only printed the list.)
    """
    file_code = _PHP_LINK_RE.findall(_as_text(self))
    print(str(file_code) + "\n")
    return file_code


def main():
    """Run the walkthrough: basic fetch, browser-like requests, link search."""
    # --- Simple retrieval of page information ---
    with contextlib.closing(_urlopen(url)) as html:
        charset = _charset_of(html)
        content = html.read()
        # If the page uses another encoding the output is garbled; on
        # Python 2 it can be re-encoded with
        # content.decode('gbk').encode('utf-8').
        print(_as_text(content, charset))

        # Current URL
        print("当前URL:" + str(html.geturl()))
        # HTTP status code
        print("当前状态码:" + str(html.getcode()))
        # Response headers (html.info() returns the same object)
        print("当前头信息:\n" + str(html.headers))
        # Encoding declared by the site
        print("当前网站使用编码:" + str(charset))

    # Download the page source to a local file.
    _urlretrieve(url, "E:\\Python_Code\\pyTools\\url.txt")

    # --- Simulate a browser visiting the URL ---
    # Method 1: add the headers one at a time.
    req = urllib2.Request(url)
    req.add_header("User-Agent", _USER_AGENT)
    # NOTE(review): "Get" is not a standard HTTP header (GET is the request
    # method); it is sent as-is to preserve the original behaviour.
    req.add_header("Get", url)
    req.add_header("Host", "192.168.1.135")
    with contextlib.closing(urllib2.urlopen(req)) as new_html:
        print(_as_text(new_html.read()))
    print(list(req.headers.items()))

    # Method 2: pass all headers as a dict when building the request.
    myheader = {
        "User-Agent": _USER_AGENT,
        "Host": "192.168.1.135",
        "Get": url,
    }
    req1 = urllib2.Request(url, headers=myheader)
    with contextlib.closing(urllib2.urlopen(req1)) as new_html_1:
        print(_as_text(new_html_1.read()))
    print(list(req1.headers.items()))

    # --- Search a page for specific files (here: .php links) ---
    info = get_content("http://192.168.1.135/myself/SQL_Injection/")
    get_file(info)


if __name__ == "__main__":
    main()